def create_cluster(**kwargs):
    """Spin up a dask KubeCluster sized from the current task's inputs.

    Worker count, cores, memory, image and pull secrets are all read from
    ``task['inputs']`` with sensible defaults; returns the started cluster.
    """
    inputs = task['inputs']
    worker_count = inputs.get('workers', 0)
    worker_cpu = inputs.get('worker_cores', 2)
    worker_memory = inputs.get('worker_memory', 2)
    worker_image = inputs.get('worker_image', 'daskdev/dask:latest')

    # Identical requests and limits pin the worker pods to a fixed size.
    resource_spec = {
        'cpu': str(worker_cpu),
        'memory': str(worker_memory),
    }
    worker_container = client.V1Container(
        name='dask',
        image=worker_image,
        args=[
            'dask-worker',
            '--nthreads', str(cpu_to_threads(worker_cpu)),
            '--no-bokeh',
            '--memory-limit', f'{worker_memory}B',
            '--death-timeout', '60',
        ],
        resources=client.V1ResourceRequirements(
            limits=resource_spec,
            requests=resource_spec,
        ),
    )

    pull_secrets = [
        client.V1LocalObjectReference(name=secret_name)
        for secret_name in inputs.get('pull_secrets', ['docker'])
    ]
    worker_pod = client.V1Pod(
        metadata=client.V1ObjectMeta(
            labels={
                'cowait/task': 'worker-' + task.get('id'),
                'cowait/parent': task.get('id'),
            },
        ),
        spec=client.V1PodSpec(
            restart_policy='Never',
            image_pull_secrets=pull_secrets,
            containers=[worker_container],
        ),
    )
    return KubeCluster(
        pod_template=worker_pod,
        n_workers=worker_count,
    )
def default_cluster_agent_deployment():
    """ Default cluster agent deployment """
    selector_labels = {
        'app': 'epsagon-cluster-agent'
    }
    agent_env = [
        client.V1EnvVar(name='EPSAGON_TOKEN', value='123'),
        client.V1EnvVar(name='EPSAGON_CLUSTER_NAME', value='test'),
        client.V1EnvVar(name='EPSAGON_DEBUG', value='false'),
        client.V1EnvVar(
            name='EPSAGON_COLLECTOR_URL',
            value='http://localhost:5000'
        ),
    ]
    agent_container = client.V1Container(
        name='cluster-agent',
        image='epsagon/cluster-agent:test',
        # required for pulling from the docker local loaded images
        # and not from Epsagon remote hub
        image_pull_policy='Never',
        env=agent_env,
    )
    pod_template = client.V1PodTemplateSpec(
        metadata=client.V1ObjectMeta(labels=selector_labels.copy()),
        spec=client.V1PodSpec(
            service_account_name='cluster-agent',
            containers=[agent_container],
        ),
    )
    return client.V1Deployment(
        api_version='apps/v1',
        kind='Deployment',
        metadata=client.V1ObjectMeta(
            name='cluster-agent',
            namespace='epsagon-monitoring',
        ),
        spec=client.V1DeploymentSpec(
            selector=client.V1LabelSelector(
                match_labels=selector_labels.copy()
            ),
            replicas=1,
            template=pod_template,
        ),
    )
async def deploy_resource(
    self, resource_attributes: AttributeDict
) -> AttributeDict:
    """Create a single-replica drone Deployment on the Kubernetes cluster.

    When the machine type configuration enables HPA, a
    HorizontalPodAutoscaler targeting the new Deployment is created as well.
    Returns the adapter's standard response mapping for the created resource.
    """
    drone_environment = self.drone_environment(
        resource_attributes.drone_uuid,
        resource_attributes.obs_machine_meta_data_translation_mapping,
    )
    drone_uuid = resource_attributes.drone_uuid

    # Expose each drone environment entry as a TardisDrone<Key> env var.
    env_vars = [
        k8s_client.V1EnvVar(name=f"TardisDrone{key}", value=str(value))
        for key, value in drone_environment.items()
    ]
    drone_container = k8s_client.V1Container(
        image=self.machine_type_configuration.image,
        args=self.machine_type_configuration.args,
        name=drone_uuid,
        resources=k8s_client.V1ResourceRequirements(
            requests={
                "cpu": self.machine_meta_data.Cores,
                # Memory is configured in GB; the API expects bytes.
                "memory": convert_to(self.machine_meta_data.Memory * 1e09, int),
            }
        ),
        env=env_vars,
    )
    pod_template = k8s_client.V1PodTemplateSpec(
        metadata=k8s_client.V1ObjectMeta(
            name=drone_uuid,
            labels={"app": drone_uuid},
        ),
        spec=k8s_client.V1PodSpec(containers=[drone_container]),
    )
    deployment_spec = k8s_client.V1DeploymentSpec(
        replicas=1,
        selector=k8s_client.V1LabelSelector(
            match_labels={"app": drone_uuid}
        ),
        template=pod_template,
    )
    deployment = k8s_client.V1Deployment(
        metadata=k8s_client.V1ObjectMeta(name=drone_uuid),
        spec=deployment_spec,
    )

    created = await self.client.create_namespaced_deployment(
        namespace=self.machine_type_configuration.namespace, body=deployment
    )
    response = {
        "uid": created.metadata.uid,
        "name": created.metadata.name,
        "type": "Booting",
    }

    if self.machine_type_configuration.hpa:
        hpa_spec = k8s_client.V1HorizontalPodAutoscalerSpec(
            max_replicas=self.machine_type_configuration.max_replicas,
            min_replicas=self.machine_type_configuration.min_replicas,
            target_cpu_utilization_percentage=self.machine_type_configuration.cpu_utilization,  # noqa: B950
            scale_target_ref=k8s_client.V1CrossVersionObjectReference(
                api_version="apps/v1",
                kind="Deployment",
                name=drone_uuid,
            ),
        )
        autoscaler = k8s_client.V1HorizontalPodAutoscaler(
            metadata=k8s_client.V1ObjectMeta(name=drone_uuid),
            spec=hpa_spec,
        )
        await self.hpa_client.create_namespaced_horizontal_pod_autoscaler(
            namespace=self.machine_type_configuration.namespace, body=autoscaler
        )
    return self.handle_response(response)
def deploy_function(function: DaskCluster, secrets=None):
    """Deploy a remote dask cluster for *function* and record its status.

    Builds a worker pod spec from ``function.spec``, starts a KubeCluster in
    remote deploy mode, stores the scheduler address / cluster name (and node
    ports when using NodePort) on ``function.status``, then scales or adapts
    the cluster. Returns the started cluster.

    Raises ImportError when dask / dask_kubernetes are not installed.
    """
    try:
        from dask_kubernetes import KubeCluster, make_pod_spec
        from dask.distributed import Client, default_client
        from kubernetes_asyncio import client
        import dask
    except ImportError as e:
        # Bug fix: the original passed '%s' and e as separate print()
        # arguments, so the placeholder was never interpolated.
        print('missing dask or dask_kubernetes, please run '
              f'"pip install dask distributed dask_kubernetes", {e}')
        raise

    spec = function.spec
    meta = function.metadata
    spec.remote = True

    image = function.full_image_path() or 'daskdev/dask:latest'
    env = spec.env
    namespace = meta.namespace or config.namespace
    if spec.extra_pip:
        env.append(spec.extra_pip)

    pod_labels = get_resource_labels(function)
    args = ['dask-worker', "--nthreads", str(spec.nthreads)]
    if spec.args:
        args += spec.args

    container = client.V1Container(name='base',
                                   image=image,
                                   env=env,
                                   args=args,
                                   image_pull_policy=spec.image_pull_policy,
                                   volume_mounts=spec.volume_mounts,
                                   resources=spec.resources)

    pod_spec = client.V1PodSpec(containers=[container],
                                restart_policy='Never',
                                volumes=spec.volumes,
                                service_account=spec.service_account)
    if spec.image_pull_secret:
        pod_spec.image_pull_secrets = [
            client.V1LocalObjectReference(name=spec.image_pull_secret)]

    pod = client.V1Pod(metadata=client.V1ObjectMeta(namespace=namespace,
                                                    labels=pod_labels),
                       # annotations=meta.annotation),
                       spec=pod_spec)

    # Patch the scheduler service template before starting the cluster,
    # since KubeCluster reads it from dask's global config.
    svc_temp = dask.config.get("kubernetes.scheduler-service-template")
    if spec.service_type or spec.node_port:
        if spec.node_port:
            spec.service_type = 'NodePort'
            # Port index 1 is the dashboard/scheduler port exposed via NodePort.
            svc_temp['spec']['ports'][1]['nodePort'] = spec.node_port
        update_in(svc_temp, 'spec.type', spec.service_type)

    norm_name = normalize_name(meta.name)
    dask.config.set({"kubernetes.scheduler-service-template": svc_temp,
                     'kubernetes.name': 'mlrun-' + norm_name + '-{uuid}'})

    cluster = KubeCluster(
        pod, deploy_mode='remote',
        namespace=namespace,
        scheduler_timeout=spec.scheduler_timeout)

    logger.info('cluster {} started at {}'.format(
        cluster.name, cluster.scheduler_address
    ))
    function.status.scheduler_address = cluster.scheduler_address
    function.status.cluster_name = cluster.name
    if spec.service_type == 'NodePort':
        ports = cluster.scheduler.service.spec.ports
        function.status.node_ports = {'scheduler': ports[0].node_port,
                                      'dashboard': ports[1].node_port}

    # Fixed replica count takes precedence; otherwise autoscale in range.
    if spec.replicas:
        cluster.scale(spec.replicas)
    else:
        cluster.adapt(minimum=spec.min_replicas, maximum=spec.max_replicas)
    return cluster
def deploy_function(function: DaskCluster, secrets=None):
    """Deploy a remote dask cluster for *function* and record its status.

    Builds a worker pod spec from ``function.spec`` (propagating any memory
    limit to ``dask-worker --memory-limit``), starts a KubeCluster in remote
    deploy mode, stores the scheduler address / cluster name (and node ports
    when using NodePort) on ``function.status``, then scales or adapts the
    cluster. Returns the started cluster.

    Raises ImportError when dask / dask_kubernetes are not installed.
    """
    # TODO: why is this here :|
    try:
        from dask_kubernetes import KubeCluster, make_pod_spec  # noqa: F401
        from dask.distributed import Client, default_client  # noqa: F401
        from kubernetes_asyncio import client
        import dask
    except ImportError as e:
        # Bug fix: the original passed '%s' and e as separate print()
        # arguments, so the placeholder was never interpolated.
        print(
            "missing dask or dask_kubernetes, please run "
            f'"pip install dask distributed dask_kubernetes", {e}'
        )
        raise

    spec = function.spec
    meta = function.metadata
    spec.remote = True

    image = function.full_image_path() or "daskdev/dask:latest"
    env = spec.env
    namespace = meta.namespace or config.namespace
    if spec.extra_pip:
        env.append(spec.extra_pip)

    pod_labels = get_resource_labels(function, scrape_metrics=False)
    args = ["dask-worker", "--nthreads", str(spec.nthreads)]
    # Mirror the container memory limit in the worker process itself.
    memory_limit = spec.resources.get("limits", {}).get("memory")
    if memory_limit:
        args.extend(["--memory-limit", str(memory_limit)])
    if spec.args:
        args.extend(spec.args)

    container = client.V1Container(
        name="base",
        image=image,
        env=env,
        args=args,
        image_pull_policy=spec.image_pull_policy,
        volume_mounts=spec.volume_mounts,
        resources=spec.resources,
    )

    pod_spec = client.V1PodSpec(
        containers=[container],
        restart_policy="Never",
        volumes=spec.volumes,
        service_account=spec.service_account,
    )
    if spec.image_pull_secret:
        pod_spec.image_pull_secrets = [
            client.V1LocalObjectReference(name=spec.image_pull_secret)
        ]

    pod = client.V1Pod(
        metadata=client.V1ObjectMeta(namespace=namespace, labels=pod_labels),
        # annotations=meta.annotation),
        spec=pod_spec,
    )

    # Patch the scheduler service template before starting the cluster,
    # since KubeCluster reads it from dask's global config.
    svc_temp = dask.config.get("kubernetes.scheduler-service-template")
    if spec.service_type or spec.node_port:
        if spec.node_port:
            spec.service_type = "NodePort"
            # Port index 1 is the dashboard/scheduler port exposed via NodePort.
            svc_temp["spec"]["ports"][1]["nodePort"] = spec.node_port
        update_in(svc_temp, "spec.type", spec.service_type)

    norm_name = normalize_name(meta.name)
    dask.config.set(
        {
            "kubernetes.scheduler-service-template": svc_temp,
            "kubernetes.name": "mlrun-" + norm_name + "-{uuid}",
        }
    )

    cluster = KubeCluster(
        pod,
        deploy_mode="remote",
        namespace=namespace,
        scheduler_timeout=spec.scheduler_timeout,
    )

    logger.info(
        "cluster {} started at {}".format(cluster.name, cluster.scheduler_address)
    )
    function.status.scheduler_address = cluster.scheduler_address
    function.status.cluster_name = cluster.name
    if spec.service_type == "NodePort":
        ports = cluster.scheduler.service.spec.ports
        function.status.node_ports = {
            "scheduler": ports[0].node_port,
            "dashboard": ports[1].node_port,
        }

    # Fixed replica count takes precedence; otherwise autoscale in range.
    if spec.replicas:
        cluster.scale(spec.replicas)
    else:
        cluster.adapt(minimum=spec.min_replicas, maximum=spec.max_replicas)
    return cluster
def make_pod_spec(
    image,
    labels=None,
    threads_per_worker=1,
    env=None,
    extra_container_config=None,
    extra_pod_config=None,
    memory_limit=None,
    memory_request=None,
    cpu_limit=None,
    cpu_request=None,
):
    """
    Create generic pod template from input parameters

    Parameters default to ``None`` instead of ``{}`` to avoid the shared
    mutable-default-argument pitfall; passing a dict behaves as before.

    Examples
    --------
    >>> make_pod_spec(image='daskdev/dask:latest', memory_limit='4G', memory_request='4G')
    """
    # Fresh dicts per call — mutable defaults would be shared across calls.
    labels = {} if labels is None else labels
    env = {} if env is None else env
    extra_container_config = (
        {} if extra_container_config is None else extra_container_config
    )
    extra_pod_config = {} if extra_pod_config is None else extra_pod_config

    args = [
        "dask-worker",
        "$(DASK_SCHEDULER_ADDRESS)",
        "--nthreads",
        str(threads_per_worker),
        "--death-timeout",
        "60",
    ]
    # Let the worker process respect the same memory ceiling as the container.
    if memory_limit:
        args.extend(["--memory-limit", str(memory_limit)])

    pod = client.V1Pod(
        metadata=client.V1ObjectMeta(labels=labels),
        spec=client.V1PodSpec(
            restart_policy="Never",
            containers=[
                client.V1Container(
                    name="dask-worker",
                    image=image,
                    args=args,
                    env=[
                        client.V1EnvVar(name=k, value=v)
                        for k, v in env.items()
                    ],
                )
            ],
        ),
    )

    resources = client.V1ResourceRequirements(limits={}, requests={})
    if cpu_request:
        resources.requests["cpu"] = cpu_request
    if memory_request:
        resources.requests["memory"] = memory_request
    if cpu_limit:
        resources.limits["cpu"] = cpu_limit
    if memory_limit:
        resources.limits["memory"] = memory_limit
    pod.spec.containers[0].resources = resources

    # Apply user-supplied overrides last so they win over the defaults above.
    for key, value in extra_container_config.items():
        _set_k8s_attribute(pod.spec.containers[0], key, value)
    for key, value in extra_pod_config.items():
        _set_k8s_attribute(pod.spec, key, value)
    return pod
async def create_deployment(
    self,
    container: str,
    num_replicas: int,
    cpus: float = 1.0,
    memory: float = 1.0,
) -> Tuple[str, str]:
    """Deploy *container* on the cluster and expose it via a LoadBalancer.

    Creates a Deployment with ``num_replicas`` replicas and a Service routing
    ``service_port`` to the pod's INTERNAL_PORT, then polls until the load
    balancer has an external address.

    :param container: image reference to run.
    :param num_replicas: number of pod replicas.
    :param cpus: CPU request per pod (cores).
    :param memory: memory request per pod, in GiB (converted to MiB below).
    :returns: ``(deployment_id, service_url)``.

    NOTE(review): the polling loop has no timeout — it blocks forever if the
    load balancer never assigns an address. It also reads ``ingress[0].ip``
    only; some providers populate ``hostname`` instead — confirm for the
    target cluster.
    """
    # Both must have been established by earlier setup (e.g. cluster connect).
    assert self.auth_client
    assert self.cluster_endpoint

    # Per-call API client config: bearer token auth against the cluster
    # endpoint, TLS verification disabled (presumably a self-signed cert —
    # TODO confirm this is acceptable for production).
    cfg = client.Configuration(
        host=f"https://{self.cluster_endpoint}:443",
        api_key={
            "authorization": f"Bearer {await self.auth_client.get()}"
        },
    )
    cfg.verify_ssl = False

    async with ApiClient(configuration=cfg) as kube_api:
        apps_api = client.AppsV1Api(kube_api)
        core_api = client.CoreV1Api(kube_api)

        # Create deployment
        deployment_id = f"dep-{uuid.uuid4()}"
        deployment = client.V1Deployment(
            api_version="apps/v1",
            kind="Deployment",
            metadata=client.V1ObjectMeta(name=deployment_id),
            spec=client.V1DeploymentSpec(
                replicas=num_replicas,
                # Selector and pod labels share the unique deployment id.
                selector={"matchLabels": {
                    "dep": deployment_id
                }},
                template=client.V1PodTemplateSpec(
                    metadata=client.V1ObjectMeta(
                        labels={"dep": deployment_id}),
                    spec=client.V1PodSpec(containers=[
                        client.V1Container(
                            name=deployment_id,
                            # The containerized app reads PORT to know where
                            # to listen.
                            env=[
                                client.V1EnvVar(name="PORT",
                                                value=str(INTERNAL_PORT))
                            ],
                            image=container,
                            resources=client.V1ResourceRequirements(
                                requests={
                                    "cpu": str(cpus),
                                    "memory": f"{int(memory * 1024)}M",
                                }),
                            ports=[
                                client.V1ContainerPort(
                                    container_port=INTERNAL_PORT)
                            ],
                        )
                    ]),
                ),
            ),
        )
        await apps_api.create_namespaced_deployment(
            namespace=KUBE_NAMESPACE, body=deployment)

        # Create service
        service_id = f"{deployment_id}-svc"
        service_port = self.get_unassigned_port()
        service = client.V1Service(
            api_version="v1",
            kind="Service",
            metadata=client.V1ObjectMeta(
                name=service_id,
                # annotations={"cloud.google.com/load-balancer-type": "Internal"},
            ),
            spec=client.V1ServiceSpec(
                selector={"dep": deployment_id},
                ports=[
                    client.V1ServicePort(
                        protocol="TCP",
                        port=service_port,
                        target_port=INTERNAL_PORT,
                    )
                ],
                type="LoadBalancer",
            ),
        )
        await core_api.create_namespaced_service(namespace=KUBE_NAMESPACE,
                                                 body=service)

        # Poll for external URL until the load balancer reports an ingress IP.
        service_ip = None
        while not service_ip:
            await asyncio.sleep(POLL_INTERVAL)
            ingress = (await core_api.read_namespaced_service(
                name=service_id,
                namespace=KUBE_NAMESPACE)).status.load_balancer.ingress
            if ingress:
                service_ip = ingress[0].ip
        service_url = f"http://{service_ip}:{service_port}"
        print(f"Started deployment {deployment_id} at {service_url}")
        return deployment_id, service_url
def setUp(self):
    """Wire mock site config and mocked Kubernetes API/HPA clients, then
    build the KubernetesAdapter under test.

    The mocked API returns canned V1Deployment objects for create/read and
    ``None``-returning awaitables for replace/delete and HPA operations.
    """
    config = self.mock_config.return_value
    test_site_config = config.TestSite
    # Endpoint of Kube cluster
    test_site_config.host = "https://127.0.0.1:443"
    # Bearer token we are going to use to authenticate
    test_site_config.token = "31ada4fd-adec-460c-809a-9e56ceb75269"
    test_site_config.MachineTypeConfiguration = AttributeDict(
        test2large=AttributeDict(
            namespace="default",
            image="busybox:1.26.1",
            args=["sleep", "3600"],
            hpa="True",
            min_replicas="1",
            max_replicas="2",
            cpu_utilization="50",
        )
    )
    test_site_config.MachineMetaData = AttributeDict(
        test2large=AttributeDict(Cores=2, Memory=4)
    )

    kubernetes_api = self.mock_kubernetes_api.return_value
    kubernetes_hpa = self.mock_kubernetes_hpa.return_value

    # Expected deployment body the adapter should construct for the drone
    # "testsite-089123" from the config above.
    spec = client.V1DeploymentSpec(
        replicas=1,
        selector=client.V1LabelSelector(match_labels={"app": "testsite-089123"}),
        template=client.V1PodTemplateSpec(),
    )
    container = client.V1Container(
        image="busybox:1.26.1",
        args=["sleep", "3600"],
        name="testsite-089123",
        resources=client.V1ResourceRequirements(
            requests={
                "cpu": test_site_config.MachineMetaData.test2large.Cores,
                # Memory in GB converted to bytes, matching the adapter.
                "memory": test_site_config.MachineMetaData.test2large.Memory * 1e9,
            }
        ),
        env=[
            client.V1EnvVar(name="TardisDroneCores", value="2"),
            client.V1EnvVar(name="TardisDroneMemory", value="4096"),
            client.V1EnvVar(name="TardisDroneUuid", value="testsite-089123"),
        ],
    )
    spec.template.metadata = client.V1ObjectMeta(
        name="testsite-089123",
        labels={"app": "testsite-089123"},
    )
    spec.template.spec = client.V1PodSpec(containers=[container])
    self.body = client.V1Deployment(
        metadata=client.V1ObjectMeta(name="testsite-089123"),
        spec=spec,
    )

    # create returns the body plus the uid Kubernetes would assign.
    self.create_return_value = client.V1Deployment(
        metadata=client.V1ObjectMeta(name="testsite-089123", uid="123456"),
        spec=spec,
    )
    kubernetes_api.create_namespaced_deployment.return_value = async_return(
        return_value=self.create_return_value
    )

    # read additionally reports a Progressing=True condition.
    condition_list = [
        client.V1DeploymentCondition(
            status="True",
            type="Progressing",
        )
    ]
    self.read_return_value = client.V1Deployment(
        metadata=client.V1ObjectMeta(name="testsite-089123", uid="123456"),
        spec=spec,
        status=client.V1DeploymentStatus(conditions=condition_list),
    )
    kubernetes_api.read_namespaced_deployment.return_value = async_return(
        return_value=self.read_return_value
    )
    kubernetes_api.replace_namespaced_deployment.return_value = async_return(
        return_value=None
    )
    kubernetes_api.delete_namespaced_deployment.return_value = async_return(
        return_value=None
    )
    kubernetes_hpa.create_namespaced_horizontal_pod_autoscaler.return_value = (
        async_return(return_value=None)
    )
    kubernetes_hpa.delete_namespaced_horizontal_pod_autoscaler.return_value = (
        async_return(return_value=None)
    )
    self.kubernetes_adapter = KubernetesAdapter(
        machine_type="test2large", site_name="TestSite"
    )