def test_pod_spec(self):
    cluster_spec = ClusterSpec(cluster_spec_json=test_spec)
    pod = create_test_pod("test_spec")
    pod = cluster_spec.patch_pod(pod, "other")
    self.assertEqual(
        pod.metadata.labels["elasticdl.org/app-name"], "elasticdl"
    )
    self.assertEqual(pod.metadata.labels["elasticdl.org/site"], "hangzhou")
    self.assertEqual(
        pod.metadata.annotations["tag.elasticdl.org/optimization"],
        "enabled",
    )
    expected_tolerations = [
        client.V1Toleration(
            effect="NoSchedule",
            key="elasticdl.org/logic-pool",
            operator="Equal",
            value="ElasticDL",
        )
    ]
    self.assertEqual(pod.spec.tolerations, expected_tolerations)
    match_expressions = [
        client.V1NodeSelectorRequirement(
            key="elasticdl.org/logic-pool",
            operator="In",
            values=["ElasticDL"],
        )
    ]
    expected_affinity = client.V1Affinity(
        node_affinity=client.V1NodeAffinity(
            required_during_scheduling_ignored_during_execution=(
                client.V1NodeSelector(
                    node_selector_terms=[
                        client.V1NodeSelectorTerm(
                            match_expressions=match_expressions
                        )
                    ]
                )
            )
        )
    )
    self.assertEqual(pod.spec.affinity, expected_affinity)
    expected_env = []
    expected_env.append(client.V1EnvVar(name="LOG_ENABLED", value="true"))
    self.assertEqual(pod.spec.containers[0].env, expected_env)

    pod = create_test_pod("test_spec")
    pod = cluster_spec.patch_pod(pod, PodType.MASTER)
    self.assertEqual(pod.metadata.labels["elasticdl.org/xyz"], "Sun")

    pod = create_test_pod("test_spec")
    pod = cluster_spec.patch_pod(pod, PodType.WORKER)
    self.assertEqual(pod.metadata.labels["elasticdl.org/xyz"], "Earth")

    pod = create_test_pod("test_spec")
    pod = cluster_spec.patch_pod(pod, PodType.PS)
    self.assertEqual(pod.metadata.labels["elasticdl.org/xyz"], "Moon")
def get_affinity(self):
    """Determine the affinity term for the build pod.

    There are two affinity strategies; which one is used depends on how
    the BinderHub is configured.

    In the default setup the affinity of each build pod is an
    "anti-affinity" which causes the pods to prefer to schedule on
    separate nodes.

    In a setup with docker-in-docker enabled, pods for a particular
    repository prefer to schedule on the same node in order to reuse the
    docker layer cache of previous builds.
    """
    resp = self.api.list_namespaced_pod(
        self.namespace,
        label_selector="component=dind,app=binder",
        _request_timeout=KUBE_REQUEST_TIMEOUT,
        _preload_content=False,
    )
    dind_pods = json.loads(resp.read())

    if self.sticky_builds and dind_pods:
        node_names = [pod["spec"]["nodeName"] for pod in dind_pods["items"]]
        ranked_nodes = rendezvous_rank(node_names, self.repo_url)
        best_node_name = ranked_nodes[0]

        affinity = client.V1Affinity(
            node_affinity=client.V1NodeAffinity(
                preferred_during_scheduling_ignored_during_execution=[
                    client.V1PreferredSchedulingTerm(
                        weight=100,
                        preference=client.V1NodeSelectorTerm(
                            match_expressions=[
                                client.V1NodeSelectorRequirement(
                                    key="kubernetes.io/hostname",
                                    operator="In",
                                    values=[best_node_name],
                                )
                            ]
                        ),
                    )
                ]
            )
        )
    else:
        affinity = client.V1Affinity(
            pod_anti_affinity=client.V1PodAntiAffinity(
                preferred_during_scheduling_ignored_during_execution=[
                    client.V1WeightedPodAffinityTerm(
                        weight=100,
                        pod_affinity_term=client.V1PodAffinityTerm(
                            topology_key="kubernetes.io/hostname",
                            label_selector=client.V1LabelSelector(
                                match_labels=dict(component=self._component_label)
                            ),
                        ),
                    )
                ]
            )
        )

    return affinity
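# Usage sketch (illustrative, not part of the original source): attach the
# affinity returned by get_affinity() to a build pod. The pod and container
# names and the image below are placeholder assumptions, not BinderHub's
# actual values.
def example_build_pod(self):
    return client.V1Pod(
        metadata=client.V1ObjectMeta(name="example-build-pod"),
        spec=client.V1PodSpec(
            restart_policy="Never",
            containers=[
                client.V1Container(name="builder", image="example-builder:latest")
            ],
            # sticky builds -> node affinity; default -> pod anti-affinity
            affinity=self.get_affinity(),
        ),
    )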
def _generate_affinity(self):
    return k8s_client.V1Affinity(
        node_affinity=k8s_client.V1NodeAffinity(
            preferred_during_scheduling_ignored_during_execution=[
                k8s_client.V1PreferredSchedulingTerm(
                    weight=1,
                    preference=k8s_client.V1NodeSelectorTerm(
                        match_expressions=[
                            k8s_client.V1NodeSelectorRequirement(
                                key="some_node_label",
                                operator="In",
                                values=[
                                    "possible-label-value-1",
                                    "possible-label-value-2",
                                ],
                            )
                        ]
                    ),
                )
            ],
            required_during_scheduling_ignored_during_execution=k8s_client.V1NodeSelector(
                node_selector_terms=[
                    k8s_client.V1NodeSelectorTerm(
                        match_expressions=[
                            k8s_client.V1NodeSelectorRequirement(
                                key="some_node_label",
                                operator="In",
                                values=[
                                    "required-label-value-1",
                                    "required-label-value-2",
                                ],
                            )
                        ]
                    ),
                ]
            ),
        ),
        pod_affinity=k8s_client.V1PodAffinity(
            required_during_scheduling_ignored_during_execution=[
                k8s_client.V1PodAffinityTerm(
                    label_selector=k8s_client.V1LabelSelector(
                        match_labels={"some-pod-label-key": "some-pod-label-value"}
                    ),
                    namespaces=["namespace-a", "namespace-b"],
                    topology_key="key-1",
                )
            ]
        ),
        pod_anti_affinity=k8s_client.V1PodAntiAffinity(
            preferred_during_scheduling_ignored_during_execution=[
                k8s_client.V1WeightedPodAffinityTerm(
                    weight=1,
                    pod_affinity_term=k8s_client.V1PodAffinityTerm(
                        label_selector=k8s_client.V1LabelSelector(
                            match_expressions=[
                                k8s_client.V1LabelSelectorRequirement(
                                    key="some_pod_label",
                                    operator="NotIn",
                                    values=[
                                        "forbidden-label-value-1",
                                        "forbidden-label-value-2",
                                    ],
                                )
                            ]
                        ),
                        namespaces=["namespace-c"],
                        topology_key="key-2",
                    ),
                )
            ]
        ),
    )
def export_deployment(self):
    # Configure the Pod template container
    volume_mounts = []
    containers = []
    volumes = []
    volume_mounts.append(
        client.V1VolumeMount(mount_path='/docker/logs', name='logs'))
    volumes.append(
        client.V1Volume(name='logs',
                        host_path=client.V1HostPathVolumeSource(
                            path='/opt/logs', type='DirectoryOrCreate')))
    if self.mounts:
        for path in self.mounts:
            volume_mounts.append(
                client.V1VolumeMount(mount_path=path, name=self.mounts[path]))
            volumes.append(
                client.V1Volume(name=self.mounts[path],
                                host_path=client.V1HostPathVolumeSource(
                                    path=path, type='DirectoryOrCreate')))
    liveness_probe = client.V1Probe(
        initial_delay_seconds=15,
        tcp_socket=client.V1TCPSocketAction(
            port=int(self.container_port[0])))
    readiness_probe = client.V1Probe(
        initial_delay_seconds=15,
        tcp_socket=client.V1TCPSocketAction(
            port=int(self.container_port[0])))
    if self.healthcheck:
        liveness_probe = client.V1Probe(
            initial_delay_seconds=15,
            http_get=client.V1HTTPGetAction(
                path=self.healthcheck,
                port=int(self.container_port[0])))
        readiness_probe = client.V1Probe(
            initial_delay_seconds=15,
            http_get=client.V1HTTPGetAction(
                path=self.healthcheck,
                port=int(self.container_port[0])))
    env = [
        client.V1EnvVar(name='LANG', value='en_US.UTF-8'),
        client.V1EnvVar(name='LC_ALL', value='en_US.UTF-8'),
        client.V1EnvVar(name='POD_NAME',
                        value_from=client.V1EnvVarSource(
                            field_ref=client.V1ObjectFieldSelector(
                                field_path='metadata.name'))),
        client.V1EnvVar(name='POD_IP',
                        value_from=client.V1EnvVarSource(
                            field_ref=client.V1ObjectFieldSelector(
                                field_path='status.podIP'))),
    ]
    container = client.V1Container(
        name=self.dm_name,
        image=self.image,
        ports=[
            client.V1ContainerPort(container_port=int(port))
            for port in self.container_port
        ],
        image_pull_policy='Always',
        env=env,
        resources=client.V1ResourceRequirements(
            limits=self.re_limits, requests=self.re_requests),
        volume_mounts=volume_mounts,
        liveness_probe=liveness_probe,
        readiness_probe=readiness_probe)
    containers.append(container)
    if self.sidecar:
        sidecar_container = client.V1Container(
            name='sidecar-%s' % self.dm_name,
            image=self.sidecar,
            image_pull_policy='Always',
            env=env,
            resources=client.V1ResourceRequirements(
                limits=self.re_limits, requests=self.re_requests),
            volume_mounts=volume_mounts)
        containers.append(sidecar_container)
    # Create and configure the spec section
    secrets = client.V1LocalObjectReference('registrysecret')
    template = client.V1PodTemplateSpec(
        metadata=client.V1ObjectMeta(labels={"project": self.dm_name}),
        spec=client.V1PodSpec(
            containers=containers,
            image_pull_secrets=[secrets],
            volumes=volumes,
            affinity=client.V1Affinity(node_affinity=client.V1NodeAffinity(
                preferred_during_scheduling_ignored_during_execution=[
                    client.V1PreferredSchedulingTerm(
                        preference=client.V1NodeSelectorTerm(
                            match_expressions=[
                                client.V1NodeSelectorRequirement(
                                    key='project',
                                    operator='In',
                                    values=['moji'])
                            ]),
                        weight=30),
                    client.V1PreferredSchedulingTerm(
                        preference=client.V1NodeSelectorTerm(
                            match_expressions=[
                                client.V1NodeSelectorRequirement(
                                    key='deploy',
                                    operator='In',
                                    values=[self.dm_name])
                            ]),
                        weight=70)
                ]))))
    selector = client.V1LabelSelector(match_labels={"project": self.dm_name})
    # Create the specification of the deployment
    spec = client.ExtensionsV1beta1DeploymentSpec(
        replicas=int(self.replicas),
        template=template,
        selector=selector,
        min_ready_seconds=3)
    # Instantiate the deployment object
    deployment = client.ExtensionsV1beta1Deployment(
        api_version="extensions/v1beta1",
        kind="Deployment",
        metadata=client.V1ObjectMeta(name=self.dm_name),
        spec=spec)
    return deployment
def create_deployment_old(config_file):
    """
    Create the IBM Spectrum Scale CSI Operator deployment object in the operator
    namespace, using the deployment_operator_image_for_crd and
    deployment_driver_image_for_crd parameters from the config.json file.

    Args:
        param1: config_file - configuration json file

    Returns:
        None

    Raises:
        Raises an exception on kubernetes client api failure and asserts
    """
    deployment_apps_api_instance = client.AppsV1Api()

    deployment_labels = {
        "app.kubernetes.io/instance": "ibm-spectrum-scale-csi-operator",
        "app.kubernetes.io/managed-by": "ibm-spectrum-scale-csi-operator",
        "app.kubernetes.io/name": "ibm-spectrum-scale-csi-operator",
        "product": "ibm-spectrum-scale-csi",
        "release": "ibm-spectrum-scale-csi-operator"
    }
    deployment_annotations = {
        "productID": "ibm-spectrum-scale-csi-operator",
        "productName": "IBM Spectrum Scale CSI Operator",
        "productVersion": "2.0.0"
    }
    deployment_metadata = client.V1ObjectMeta(
        name="ibm-spectrum-scale-csi-operator",
        labels=deployment_labels,
        namespace=namespace_value)
    deployment_selector = client.V1LabelSelector(
        match_labels={"app.kubernetes.io/name": "ibm-spectrum-scale-csi-operator"})
    podtemplate_metadata = client.V1ObjectMeta(
        labels=deployment_labels, annotations=deployment_annotations)

    pod_affinity = client.V1Affinity(
        node_affinity=client.V1NodeAffinity(
            required_during_scheduling_ignored_during_execution=client.V1NodeSelector(
                node_selector_terms=[
                    client.V1NodeSelectorTerm(match_expressions=[
                        client.V1NodeSelectorRequirement(
                            key="beta.kubernetes.io/arch", operator="Exists")
                    ])
                ])))

    ansible_pod_container = client.V1Container(
        image=config_file["deployment_operator_image_for_crd"],
        command=[
            "/usr/local/bin/ao-logs", "/tmp/ansible-operator/runner", "stdout"
        ],
        liveness_probe=client.V1Probe(
            _exec=client.V1ExecAction(command=["/health_check.sh"]),
            initial_delay_seconds=10,
            period_seconds=30),
        readiness_probe=client.V1Probe(
            _exec=client.V1ExecAction(command=["/health_check.sh"]),
            initial_delay_seconds=3,
            period_seconds=1),
        name="ansible",
        image_pull_policy="IfNotPresent",
        security_context=client.V1SecurityContext(
            capabilities=client.V1Capabilities(drop=["ALL"])),
        volume_mounts=[
            client.V1VolumeMount(mount_path="/tmp/ansible-operator/runner",
                                 name="runner",
                                 read_only=True)
        ],
        env=[
            client.V1EnvVar(
                name="CSI_DRIVER_IMAGE",
                value=config_file["deployment_driver_image_for_crd"])
        ])

    operator_pod_container = client.V1Container(
        image=config_file["deployment_operator_image_for_crd"],
        name="operator",
        image_pull_policy="IfNotPresent",
        liveness_probe=client.V1Probe(
            _exec=client.V1ExecAction(command=["/health_check.sh"]),
            initial_delay_seconds=10,
            period_seconds=30),
        readiness_probe=client.V1Probe(
            _exec=client.V1ExecAction(command=["/health_check.sh"]),
            initial_delay_seconds=3,
            period_seconds=1),
        security_context=client.V1SecurityContext(
            capabilities=client.V1Capabilities(drop=["ALL"])),
        env=[
            client.V1EnvVar(name="WATCH_NAMESPACE",
                            value_from=client.V1EnvVarSource(
                                field_ref=client.V1ObjectFieldSelector(
                                    field_path="metadata.namespace"))),
            client.V1EnvVar(name="POD_NAME",
                            value_from=client.V1EnvVarSource(
                                field_ref=client.V1ObjectFieldSelector(
                                    field_path="metadata.name"))),
            client.V1EnvVar(name="OPERATOR_NAME",
                            value="ibm-spectrum-scale-csi-operator"),
            client.V1EnvVar(
                name="CSI_DRIVER_IMAGE",
                value=config_file["deployment_driver_image_for_crd"])
        ],
        volume_mounts=[
            client.V1VolumeMount(mount_path="/tmp/ansible-operator/runner",
                                 name="runner")
        ])

    pod_spec = client.V1PodSpec(
        affinity=pod_affinity,
        containers=[ansible_pod_container, operator_pod_container],
        service_account_name="ibm-spectrum-scale-csi-operator",
        volumes=[
            client.V1Volume(
                empty_dir=client.V1EmptyDirVolumeSource(medium="Memory"),
                name="runner")
        ])

    podtemplate_spec = client.V1PodTemplateSpec(metadata=podtemplate_metadata,
                                                spec=pod_spec)
    deployment_spec = client.V1DeploymentSpec(replicas=1,
                                              selector=deployment_selector,
                                              template=podtemplate_spec)
    body_dep = client.V1Deployment(kind='Deployment',
                                   api_version='apps/v1',
                                   metadata=deployment_metadata,
                                   spec=deployment_spec)

    try:
        LOGGER.info("creating deployment for operator")
        deployment_apps_api_response = deployment_apps_api_instance.create_namespaced_deployment(
            namespace=namespace_value, body=body_dep)
        LOGGER.debug(str(deployment_apps_api_response))
    except ApiException as e:
        LOGGER.error(
            f"Exception when calling AppsV1Api->create_namespaced_deployment: {e}"
        )
        assert False
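# Illustrative sketch (not part of the original source): the two keys that
# create_deployment_old() reads from config.json. The image references below
# are placeholder assumptions, not the project's published images.
example_config_file = {
    "deployment_operator_image_for_crd": "example-registry/csi-operator:2.0.0",
    "deployment_driver_image_for_crd": "example-registry/csi-driver:2.0.0",
}
# create_deployment_old(example_config_file)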
def clean_pod_template(pod_template, match_node_purpose="prefer", pod_type="worker"):
    """ Normalize pod template and check for type errors """
    if isinstance(pod_template, str):
        msg = (
            "Expected a kubernetes.client.V1Pod object, got %s. "
            "If trying to pass a yaml filename then use "
            "KubeCluster.from_yaml"
        )
        raise TypeError(msg % pod_template)

    if isinstance(pod_template, dict):
        msg = (
            "Expected a kubernetes.client.V1Pod object, got %s. "
            "If trying to pass a dictionary specification then use "
            "KubeCluster.from_dict"
        )
        raise TypeError(msg % str(pod_template))

    pod_template = copy.deepcopy(pod_template)

    # Make sure metadata / labels / env objects exist, so they can be modified
    # later without a lot of `is None` checks
    if pod_template.metadata is None:
        pod_template.metadata = client.V1ObjectMeta()
    if pod_template.metadata.labels is None:
        pod_template.metadata.labels = {}

    if pod_template.spec.containers[0].env is None:
        pod_template.spec.containers[0].env = []

    # add default tolerations
    tolerations = [
        client.V1Toleration(
            key="k8s.dask.org/dedicated",
            operator="Equal",
            value=pod_type,
            effect="NoSchedule",
        ),
        # GKE currently does not permit creating taints on a node pool
        # with a `/` in the key field
        client.V1Toleration(
            key="k8s.dask.org_dedicated",
            operator="Equal",
            value=pod_type,
            effect="NoSchedule",
        ),
    ]

    if pod_template.spec.tolerations is None:
        pod_template.spec.tolerations = tolerations
    else:
        pod_template.spec.tolerations.extend(tolerations)

    # add default node affinity to k8s.dask.org/node-purpose=worker
    if match_node_purpose != "ignore":
        # for readability
        affinity = pod_template.spec.affinity

        if affinity is None:
            affinity = client.V1Affinity()
        if affinity.node_affinity is None:
            affinity.node_affinity = client.V1NodeAffinity()

        # a common object for both a preferred and a required node affinity
        node_selector_term = client.V1NodeSelectorTerm(
            match_expressions=[
                client.V1NodeSelectorRequirement(
                    key="k8s.dask.org/node-purpose", operator="In", values=[pod_type]
                )
            ]
        )

        if match_node_purpose == "require":
            if (
                affinity.node_affinity.required_during_scheduling_ignored_during_execution
                is None
            ):
                affinity.node_affinity.required_during_scheduling_ignored_during_execution = client.V1NodeSelector(
                    node_selector_terms=[]
                )
            affinity.node_affinity.required_during_scheduling_ignored_during_execution.node_selector_terms.append(
                node_selector_term
            )
        elif match_node_purpose == "prefer":
            if (
                affinity.node_affinity.preferred_during_scheduling_ignored_during_execution
                is None
            ):
                affinity.node_affinity.preferred_during_scheduling_ignored_during_execution = (
                    []
                )
            preferred_scheduling_terms = [
                client.V1PreferredSchedulingTerm(
                    preference=node_selector_term, weight=100
                )
            ]
            affinity.node_affinity.preferred_during_scheduling_ignored_during_execution.extend(
                preferred_scheduling_terms
            )
        else:
            raise ValueError(
                'Attribute must be one of "ignore", "prefer", or "require".'
            )
        pod_template.spec.affinity = affinity

    return pod_template
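# Usage sketch (illustrative, not part of the original source): normalize a
# minimal pod template. The image name below is a placeholder assumption.
base_pod = client.V1Pod(
    spec=client.V1PodSpec(
        containers=[client.V1Container(name="dask-worker", image="daskdev/dask:latest")]
    )
)
worker_pod = clean_pod_template(base_pod, match_node_purpose="require", pod_type="worker")
# worker_pod now carries the two k8s.dask.org tolerations, and its node
# affinity requires nodes labelled k8s.dask.org/node-purpose=worker.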
def update_deploy_v2():
    data = json.loads(request.get_data().decode('UTF-8'))
    current_app.logger.debug("Received data: {}".format(data))
    namespace = handle_input(data.get('namespace'))
    deploy_name = handle_input(data.get('deploy_name'))
    action = handle_input(data.get('action'))
    image = None
    replicas = None
    toleration = None
    pod_anti_affinity = None
    pod_affinity = None
    node_affinity = None
    labels = None
    if action == "add_pod_anti_affinity":
        print("Running action {}".format(action))
        affinity = handle_input(data.get('pod_anti_affinity'))
        affinity_type = handle_input(affinity.get('type'))
        labelSelector = handle_input(affinity.get('labelSelector'))
        key = handle_input(affinity.get('key'))
        value = handle_input(affinity.get('value'))
        topologyKey = handle_input(affinity.get('topologyKey'))
        if affinity_type == "required":
            if labelSelector == "matchExpressions":
                if not isinstance(value, list):
                    value = [value]
                operator = handle_input(affinity.get('operator'))
                if operator != 'In' and operator != 'NotIn':
                    value = None
                print(value)
                label_selector = client.V1LabelSelector(match_expressions=[
                    client.V1LabelSelectorRequirement(
                        key=key, operator=operator, values=value)
                ])
            elif labelSelector == "matchLabels":
                if isinstance(value, list):
                    return jsonify({
                        "error": "values may not be a list when using {}".format(labelSelector)
                    })
                label_selector = client.V1LabelSelector(match_labels={key: value})
            else:
                return jsonify({
                    "error": "labelSelector {} is not supported".format(labelSelector)
                })
            pod_anti_affinity = client.V1PodAntiAffinity(
                required_during_scheduling_ignored_during_execution=[
                    client.V1PodAffinityTerm(label_selector=label_selector,
                                             topology_key=topologyKey)
                ])
            print("Added pod anti-affinity: {}".format(pod_anti_affinity))
        elif affinity_type == "preferred":
            weight = string_to_int(handle_input(affinity.get('weight')))
            if weight is None:
                return jsonify({
                    "error": "weight is required for the {} type".format(affinity_type)
                })
            if labelSelector == "matchExpressions":
                if not isinstance(value, list):
                    value = [value]
                operator = handle_input(affinity.get('operator'))
                if operator != 'In' and operator != 'NotIn':
                    value = None
                label_selector = client.V1LabelSelector(match_expressions=[
                    client.V1LabelSelectorRequirement(
                        key=key, operator=operator, values=value)
                ])
            elif labelSelector == "matchLabels":
                if isinstance(value, list):
                    return jsonify({
                        "error": "values may not be a list when using {}".format(labelSelector)
                    })
                label_selector = client.V1LabelSelector(match_labels={key: value})
            else:
                return jsonify({
                    "error": "labelSelector {} is not supported".format(labelSelector)
                })
            pod_anti_affinity = client.V1PodAntiAffinity(
                preferred_during_scheduling_ignored_during_execution=[
                    client.V1WeightedPodAffinityTerm(
                        pod_affinity_term=client.V1PodAffinityTerm(
                            label_selector=label_selector,
                            topology_key=topologyKey),
                        weight=weight)
                ])
            print("Added pod anti-affinity: {}".format(pod_anti_affinity))
        else:
            return jsonify({
                "error": "affinity type {} is not supported".format(affinity_type)
            })
    elif action == "delete_pod_anti_affinity":
        print("Running action {}".format(action))
        pass
    elif action == "add_node_affinity":
        current_app.logger.debug("Running action {}".format(action))
        affinity = handle_input(data.get('node_affinity'))
        node_affinity_type = handle_input(affinity.get('type'))
        nodeSelector = handle_input(affinity.get('nodeSelector'))
        key = handle_input(affinity.get('key'))
        value = handle_input(affinity.get('value'))
        operator = handle_input(affinity.get('operator'))
        values = []
        if operator == 'Exists' or operator == 'DoesNotExist':
            values = None
        else:
            if not isinstance(value, list):
                values.append(value)
            else:
                values = value
        if node_affinity_type == "preferred":
            weight = string_to_int(handle_input(affinity.get('weight')))
            if weight is None:
                return simple_error_handle(
                    "weight is required for the {} type".format(node_affinity_type))
            preferred_term = []
            if nodeSelector == "matchExpressions":
                match_expressions = []
                expression = client.V1NodeSelectorRequirement(
                    key=key,
                    operator=operator,
                    values=values,
                )
                match_expressions.append(expression)
                preference = client.V1NodeSelectorTerm(
                    match_expressions=match_expressions)
            # nodeSelector == "matchFields"
            else:
                match_fields = []
                field = client.V1NodeSelectorRequirement(
                    key=key,
                    operator=operator,
                    values=values,
                )
                match_fields.append(field)
                preference = client.V1NodeSelectorTerm(match_fields=match_fields)
            term = client.V1PreferredSchedulingTerm(
                weight=weight,
                preference=preference,
            )
            preferred_term.append(term)
            node_affinity = client.V1NodeAffinity(
                # append directly
                preferred_during_scheduling_ignored_during_execution=preferred_term)
        elif node_affinity_type == "required":
            current_app.logger.debug(
                "node_affinity_type: {}".format(node_affinity_type))
            node_selector_terms = []
            if nodeSelector == "matchExpressions":
                match_expressions = []
                expression = client.V1NodeSelectorRequirement(
                    key=key,
                    operator=operator,
                    values=values,
                )
                match_expressions.append(expression)
                term = client.V1NodeSelectorTerm(
                    match_expressions=match_expressions)
            else:
                match_fields = []
                field = client.V1NodeSelectorRequirement(
                    key=key,
                    operator=operator,
                    values=values,
                )
                match_fields.append(field)
                term = client.V1NodeSelectorTerm(match_fields=match_fields)
            node_selector_terms.append(term)
            node_affinity = client.V1NodeAffinity(
                required_during_scheduling_ignored_during_execution=client.V1NodeSelector(
                    node_selector_terms=node_selector_terms))
        else:
            return simple_error_handle(
                "scheduling type {} is not supported".format(node_affinity_type))
    elif action == "delete_node_affinity":
        print("Running action {}".format(action))
        pass
    elif action == "add_toleration":
        print("Running action {}".format(action))
        t = handle_input(data.get("toleration"))
        print(type(t), t)
        effect = t.get('effect')
        key = t.get('key')
        operator = t.get('operator')
        value = t.get('value')
        toleration_seconds = handle_toleraion_seconds(
            t.get('toleration_seconds'))
        print("toleration_seconds: {}".format(toleration_seconds))
        toleration = client.V1Toleration(effect=effect,
                                         key=key,
                                         operator=operator,
                                         toleration_seconds=toleration_seconds,
                                         value=value)
        print(toleration)
        if not toleration:
            msg = "{} requires a toleration (effect, key, operator, value)".format(action)
            return jsonify({"error": msg})
    elif action == "delete_toleration":
        print("Running action {}".format(action))
        t = handle_input(data.get("toleration"))
        effect = handle_toleration_item(t.get('effect'))
        key = handle_toleration_item(t.get('key'))
        operator = handle_toleration_item(t.get('operator'))
        value = handle_toleration_item(t.get('value'))
        toleration_seconds = handle_toleraion_seconds(
            t.get('toleration_seconds'))
        print("toleration_seconds: {}".format(toleration_seconds))
        # if (effect != None and key != None and operator != None):
        toleration = client.V1Toleration(effect=effect,
                                         key=key,
                                         operator=operator,
                                         toleration_seconds=toleration_seconds,
                                         value=value)
        if not toleration:
            msg = "{} requires a toleration (effect, key, operator, value)".format(action)
            return jsonify({"error": msg})
    elif action == "add_pod_affinity":
        pass
    elif action == "delete_pod_affinity":
        pass
    elif action == "update_replicas":
        replicas = handle_input(data.get('replicas'))
        if not replicas:
            msg = "{} requires replicas".format(action)
            return jsonify({"error": msg})
    elif action == "update_image":
        project = handle_input(data.get('project'))
        env = handle_input(data.get('env'))
        imageRepo = handle_input(data.get('imageRepo'))
        imageName = handle_input(data.get('imageName'))
        imageTag = handle_input(data.get('imageTag'))
        if (imageRepo is not None and project is not None and env is not None
                and imageName is not None and imageTag is not None):
            image = "{}/{}-{}/{}:{}".format(imageRepo, project, env,
                                            imageName, imageTag)
            print("image value: {}".format(image))
        if not image:
            msg = "{} requires image".format(action)
            return jsonify({"error": msg})
    elif action == "add_labels":
        pass
    elif action == "delete_labels":
        pass
    else:
        msg = "action {} is not supported yet".format(action)
        print(msg)
        return jsonify({"error": msg})
    return update_deployment_v2(deploy_name=deploy_name,
                                namespace=namespace,
                                action=action,
                                image=image,
                                replicas=replicas,
                                toleration=toleration,
                                node_affinity=node_affinity,
                                pod_anti_affinity=pod_anti_affinity,
                                pod_affinity=pod_affinity,
                                labels=labels)
def export_deployment(self):
    # Configure the Pod template container
    volume_mounts = []
    containers = []
    volumes = []
    ports = []
    liveness_probe = None
    readiness_probe = None
    volume_mounts.append(
        client.V1VolumeMount(mount_path='/docker/logs', name='logs'))
    volumes.append(
        client.V1Volume(name='logs',
                        host_path=client.V1HostPathVolumeSource(
                            path='/opt/logs', type='DirectoryOrCreate')))
    if self.mounts:
        for path in self.mounts:
            volume_mounts.append(
                client.V1VolumeMount(mount_path=path, name=self.mounts[path]))
            volumes.append(
                client.V1Volume(name=self.mounts[path],
                                host_path=client.V1HostPathVolumeSource(
                                    path=path, type='DirectoryOrCreate')))
    if self.container_port:
        ports = [
            client.V1ContainerPort(container_port=int(port))
            for port in self.container_port
        ]
        liveness_probe = client.V1Probe(
            initial_delay_seconds=15,
            tcp_socket=client.V1TCPSocketAction(
                port=int(self.container_port[0])))
        readiness_probe = client.V1Probe(
            initial_delay_seconds=15,
            tcp_socket=client.V1TCPSocketAction(
                port=int(self.container_port[0])))
        if self.healthcheck:
            liveness_probe = client.V1Probe(
                initial_delay_seconds=15,
                http_get=client.V1HTTPGetAction(
                    path=self.healthcheck,
                    port=int(self.container_port[0])))
            readiness_probe = client.V1Probe(
                initial_delay_seconds=15,
                http_get=client.V1HTTPGetAction(
                    path=self.healthcheck,
                    port=int(self.container_port[0])))
    env = [
        client.V1EnvVar(name='LANG', value='en_US.UTF-8'),
        client.V1EnvVar(name='LC_ALL', value='en_US.UTF-8'),
        client.V1EnvVar(name='POD_NAME',
                        value_from=client.V1EnvVarSource(
                            field_ref=client.V1ObjectFieldSelector(
                                field_path='metadata.name'))),
        client.V1EnvVar(name='POD_IP',
                        value_from=client.V1EnvVarSource(
                            field_ref=client.V1ObjectFieldSelector(
                                field_path='status.podIP'))),
    ]
    container = client.V1Container(
        name=self.dm_name,
        image=self.image,
        ports=ports,
        image_pull_policy='Always',
        env=env,
        resources=client.V1ResourceRequirements(
            limits=self.re_limits, requests=self.re_requests),
        volume_mounts=volume_mounts)
    if liveness_probe and readiness_probe:
        container = client.V1Container(
            name=self.dm_name,
            image=self.image,
            ports=ports,
            image_pull_policy='Always',
            env=env,
            resources=client.V1ResourceRequirements(
                limits=self.re_limits, requests=self.re_requests),
            volume_mounts=volume_mounts,
            liveness_probe=liveness_probe,
            readiness_probe=readiness_probe)
    containers.append(container)
    if self.sidecar:
        sidecar_container = client.V1Container(
            name='sidecar-%s' % self.dm_name,
            image=self.sidecar,
            image_pull_policy='Always',
            env=env,
            resources=client.V1ResourceRequirements(
                limits=self.re_limits, requests=self.re_requests),
            volume_mounts=volume_mounts)
        containers.append(sidecar_container)
    # Create and configure the spec section
    secrets = client.V1LocalObjectReference('registrysecret')
    preference_key = self.dm_name
    project_values = ['xxxx']
    host_aliases = []
    db_docker_hosts = db_op.docker_hosts
    values = db_docker_hosts.query.with_entities(
        db_docker_hosts.ip, db_docker_hosts.hostname).filter(
            and_(db_docker_hosts.deployment == self.dm_name,
                 db_docker_hosts.context == self.context)).all()
    db_op.DB.session.remove()
    if values:
        ips = []
        for value in values:
            try:
                ip, hostname = value
                key = "op_docker_hosts_%s" % ip
                Redis.lpush(key, hostname)
                ips.append(ip)
            except Exception as e:
                logging.error(e)
        for ip in set(ips):
            try:
                key = "op_docker_hosts_%s" % ip
                if Redis.exists(key):
                    hostnames = Redis.lrange(key, 0, -1)
                    if hostnames:
                        host_aliases.append(
                            client.V1HostAlias(hostnames=hostnames, ip=ip))
                    Redis.delete(key)
            except Exception as e:
                logging.error(e)
    if self.labels:
        if 'deploy' in self.labels:
            preference_key = self.labels['deploy']
        if 'project' in self.labels:
            project_values = [self.labels['project']]
    template = client.V1PodTemplateSpec(
        metadata=client.V1ObjectMeta(labels={"project": self.dm_name}),
        spec=client.V1PodSpec(
            containers=containers,
            image_pull_secrets=[secrets],
            volumes=volumes,
            host_aliases=host_aliases,
            affinity=client.V1Affinity(node_affinity=client.V1NodeAffinity(
                preferred_during_scheduling_ignored_during_execution=[
                    client.V1PreferredSchedulingTerm(
                        preference=client.V1NodeSelectorTerm(
                            match_expressions=[
                                client.V1NodeSelectorRequirement(
                                    key=preference_key,
                                    operator='In',
                                    values=['mark'])
                            ]),
                        weight=100)
                ],
                required_during_scheduling_ignored_during_execution=client.V1NodeSelector(
                    node_selector_terms=[
                        client.V1NodeSelectorTerm(match_expressions=[
                            client.V1NodeSelectorRequirement(
                                key='project',
                                operator='In',
                                values=project_values)
                        ])
                    ])))))
    selector = client.V1LabelSelector(match_labels={"project": self.dm_name})
    # Create the specification of the deployment
    spec = client.ExtensionsV1beta1DeploymentSpec(
        replicas=int(self.replicas),
        template=template,
        selector=selector,
        min_ready_seconds=3)
    # Instantiate the deployment object
    deployment = client.ExtensionsV1beta1Deployment(
        api_version="extensions/v1beta1",
        kind="Deployment",
        metadata=client.V1ObjectMeta(name=self.dm_name),
        spec=spec)
    return deployment
def add_servers():
    if count_servers():
        return "You can't have more than 2 servers", 403
    game_id = request.json['game_id']
    params = request.json['parms']
    u_ip = request.remote_addr
    game = get_game_by_id(game_id)
    try:
        game.validate_params(params, game)
    except Exception as e:
        return str(e), 404
    uid = uuid.uuid4().hex[:12]
    name = "gaas-{}".format(uid)
    labels = {
        "app": "gaas",
        "game": game_id,
        "server": uid,
        "creator": u_ip,
    }
    metadata = client.V1ObjectMeta(
        labels=labels,
        name=name,
    )
    ip_ext = alloc_ip()
    extra_env = [
        client.V1EnvVar(name="IP_ALLOC", value=ip_ext),
        client.V1EnvVar(name="IP_CREATOR", value=u_ip)
    ]
    containers = game.make_deployment(params)
    for container in containers:
        if container.env:
            container.env.extend(extra_env)
        else:
            container.env = extra_env
        if not container.resources:
            container.resources = client.V1ResourceRequirements(
                limits={"cpu": "2", "memory": "1G"},
                requests={"cpu": "1", "memory": "1G"})
    deployment = client.V1Deployment(spec=client.V1DeploymentSpec(
        replicas=1,
        strategy=client.AppsV1beta1DeploymentStrategy(
            rolling_update=client.AppsV1beta1RollingUpdateDeployment(
                max_surge=0, max_unavailable=1)),
        selector=client.V1LabelSelector(match_labels=labels),
        template=client.V1PodTemplateSpec(spec=client.V1PodSpec(
            containers=containers,
            termination_grace_period_seconds=0,
            affinity=client.V1Affinity(node_affinity=client.V1NodeAffinity(
                required_during_scheduling_ignored_during_execution=client.V1NodeSelector(
                    node_selector_terms=[
                        client.V1NodeSelectorTerm(match_expressions=[
                            client.V1NodeSelectorRequirement(
                                key="kubernetes.io/role",
                                operator="NotIn",
                                values=["shared"])
                        ])
                    ])))))))
    service = client.V1Service(spec=client.V1ServiceSpec(
        type="ClusterIP",
        selector=labels,
        ports=game.make_service(params),
    ))
    deployment.metadata = metadata
    deployment.spec.template.metadata = metadata
    service.metadata = metadata
    client.AppsV1Api().create_namespaced_deployment(
        namespace="gaas",
        body=deployment,
    )
    service_resp = client.CoreV1Api().create_namespaced_service(
        namespace="gaas",
        body=service,
    )
    return {"uid": uid, "ip": u_ip}
def export_deployment(self):
    # Configure the Pod template container
    volume_mounts = []
    volume_mounts.append(
        client.V1VolumeMount(mount_path='/opt/logs', name='logs'))
    if self.dm_name == 'launch':
        volume_mounts.append(
            client.V1VolumeMount(mount_path='/opt/%s/conf' % self.dm_name,
                                 name=self.dm_name))
    container = client.V1Container(
        name=self.dm_name,
        image=self.image,
        ports=[
            client.V1ContainerPort(container_port=int(port))
            for port in self.container_port
        ],
        image_pull_policy='Always',
        env=[
            client.V1EnvVar(name='LANG', value='en_US.UTF-8'),
            client.V1EnvVar(name='LC_ALL', value='en_US.UTF-8')
        ],
        resources=client.V1ResourceRequirements(
            limits=self.re_limits, requests=self.re_requests),
        volume_mounts=volume_mounts,
        liveness_probe=client.V1Probe(
            initial_delay_seconds=30,
            tcp_socket=client.V1TCPSocketAction(
                port=int(self.container_port[0]))),
        readiness_probe=client.V1Probe(
            initial_delay_seconds=30,
            tcp_socket=client.V1TCPSocketAction(
                port=int(self.container_port[0]))))
    # Create and configure the spec section
    secrets = client.V1LocalObjectReference('registrysecret')
    volumes = []
    volume = client.V1Volume(
        name='logs',
        host_path=client.V1HostPathVolumeSource(path='/opt/logs'))
    volumes.append(volume)
    template = client.V1PodTemplateSpec(
        metadata=client.V1ObjectMeta(labels={"project": self.dm_name}),
        spec=client.V1PodSpec(
            containers=[container],
            image_pull_secrets=[secrets],
            volumes=volumes,
            affinity=client.V1Affinity(node_affinity=client.V1NodeAffinity(
                preferred_during_scheduling_ignored_during_execution=[
                    client.V1PreferredSchedulingTerm(
                        preference=client.V1NodeSelectorTerm(
                            match_expressions=[
                                client.V1NodeSelectorRequirement(
                                    key='project',
                                    operator='In',
                                    values=['moji'])
                            ]),
                        weight=30),
                    client.V1PreferredSchedulingTerm(
                        preference=client.V1NodeSelectorTerm(
                            match_expressions=[
                                client.V1NodeSelectorRequirement(
                                    key='deploy',
                                    operator='In',
                                    values=[self.dm_name])
                            ]),
                        weight=70)
                ]))))
    selector = client.V1LabelSelector(match_labels={"project": self.dm_name})
    # Create the specification of the deployment
    spec = client.ExtensionsV1beta1DeploymentSpec(
        replicas=int(self.replicas),
        template=template,
        selector=selector,
        min_ready_seconds=3)
    # Instantiate the deployment object
    deployment = client.ExtensionsV1beta1Deployment(
        api_version="extensions/v1beta1",
        kind="Deployment",
        metadata=client.V1ObjectMeta(name=self.dm_name),
        spec=spec)
    return deployment
def submit_job(args, command=None):
    container_image = args.container
    container_name = args.name
    body = client.V1Job(api_version="batch/v1",
                        kind="Job",
                        metadata=client.V1ObjectMeta(name=container_name))
    body.status = client.V1JobStatus()
    template = client.V1PodTemplate()
    labels = {
        'hugin-job': "1",
        'hugin-job-name': f'{container_name}'
    }
    template.template = client.V1PodTemplateSpec(
        metadata=client.V1ObjectMeta(labels=labels)
    )
    tolerations = []
    env = []
    if args.environment:
        for env_spec in args.environment:
            env_name, env_value = env_spec.split("=", 1)
            env.append(client.V1EnvVar(name=env_name, value=env_value))

    container_args = dict(
        name=f"container-{container_name}",
        image=container_image,
        env=env,
    )
    if args.gpu:
        tolerations.append(client.V1Toleration(
            key='nvidia.com/gpu', operator='Exists', effect='NoSchedule'))
        container_args['resources'] = client.V1ResourceRequirements(
            limits={"nvidia.com/gpu": 1})
    if command or args.command:
        container_args['command'] = command if command else args.command

    container = client.V1Container(**container_args)
    pull_secrets = []
    if args.pull_secret is not None:
        pull_secrets.append(client.V1LocalObjectReference(name=args.pull_secret))
    pod_args = dict(containers=[container],
                    restart_policy='Never',
                    image_pull_secrets=pull_secrets)
    if tolerations:
        pod_args['tolerations'] = tolerations

    if args.node_selector is not None:
        parts = args.node_selector.split("=", 1)
        if len(parts) == 2:
            affinity = client.V1Affinity(
                node_affinity=client.V1NodeAffinity(
                    required_during_scheduling_ignored_during_execution=client.V1NodeSelector(
                        node_selector_terms=[client.V1NodeSelectorTerm(
                            match_expressions=[client.V1NodeSelectorRequirement(
                                key=parts[0], operator='In', values=[parts[1]])]
                        )]
                    )
                )
            )
            pod_args['affinity'] = affinity

    template.template.spec = client.V1PodSpec(**pod_args)
    body.spec = client.V1JobSpec(ttl_seconds_after_finished=1800,
                                 template=template.template)
    try:
        api_response = batch_v1.create_namespaced_job("default", body, pretty=True)
        # print(api_response)
    except client.exceptions.ApiException as e:
        logging.critical(f"Failed to start job: {e.reason}")
def create_run_pod(k8s_settings, run_context):
    run_id = run_context.id
    run_name = run_context.run.to_json()["name"]
    labels = {
        "run-name": run_name,
        "run": run_id,
    }
    env = get_run_pod_env_vars(run_context)
    node_topology_key = "kubernetes.io/hostname"
    # NOTE(taylor): preference to run on nodes with other runs
    pod_affinities = [
        k8s_client.V1WeightedPodAffinityTerm(
            weight=50,
            pod_affinity_term=k8s_client.V1PodAffinityTerm(
                label_selector=k8s_client.V1LabelSelector(
                    match_labels={"type": "run"},
                ),
                topology_key=node_topology_key,
            ),
        ),
    ]
    volumes = []
    volume_mounts = []
    experiment_id = run_context.experiment
    if experiment_id:
        labels.update({"experiment": experiment_id})
        # NOTE(taylor): highest preference to run on nodes with runs in the same experiment
        pod_affinities.append(
            k8s_client.V1WeightedPodAffinityTerm(
                weight=100,
                pod_affinity_term=k8s_client.V1PodAffinityTerm(
                    label_selector=k8s_client.V1LabelSelector(
                        match_labels={
                            "type": "run",
                            "experiment": experiment_id,
                        },
                    ),
                    topology_key=node_topology_key,
                ),
            ))
    unacceptable_node_group_types = ["system"]
    requests = k8s_settings.resources.get("requests") or {}
    limits = k8s_settings.resources.get("limits") or {}
    # NOTE(taylor): Preventing GPU-less jobs from running on GPU nodes forces the cluster autoscaler to scale up
    # CPU nodes. This prevents a situation where the GPU nodes are not scaled down because they are occupied by
    # CPU workloads. The cluster autoscaler does not know that it should create CPU nodes when the GPUs are unused.
    # TODO(taylor): This could cause unexpected behavior if the cluster has no CPU nodes. Running CPU jobs on GPU
    # nodes could also be an opportunity for more efficient resource utilization, but is avoided for now because the
    # workloads cannot be migrated onto CPU nodes by the cluster autoscaler as mentioned above.
    # NOTE(taylor): Applying a NoSchedule taint to GPU nodes is another way to achieve this behavior, but it does
    # not work as well out of the box on clusters that orchestrate did not provision. Applying a PreferNoSchedule
    # taint to GPU nodes does not resolve the workload migration issue when there are no CPU nodes.
    if all(
            float(group.get("nvidia.com/gpu", 0)) == 0
            for group in (requests, limits)):
        unacceptable_node_group_types.append("gpu")
    node_affinity = k8s_client.V1NodeAffinity(
        required_during_scheduling_ignored_during_execution=k8s_client.V1NodeSelector(
            node_selector_terms=[
                k8s_client.V1NodeSelectorTerm(match_expressions=[
                    k8s_client.V1NodeSelectorRequirement(
                        key="orchestrate.sigopt.com/node-group-type",
                        operator="NotIn",
                        values=unacceptable_node_group_types,
                    )
                ])
            ],
        ),
    )
    pod_affinity = k8s_client.V1PodAffinity(
        preferred_during_scheduling_ignored_during_execution=pod_affinities,
    )
    pod = k8s_client.V1Pod(
        metadata=k8s_client.V1ObjectMeta(
            owner_references=k8s_settings.owner_references,
            labels={
                "type": "run",
                **labels,
            },
            name=run_name,
        ),
        spec=k8s_client.V1PodSpec(
            affinity=k8s_client.V1Affinity(
                node_affinity=node_affinity,
                pod_affinity=pod_affinity,
            ),
            containers=[
                k8s_client.V1Container(
                    name="model-runner",
                    image=k8s_settings.image,
                    resources=k8s_client.V1ResourceRequirements(
                        **k8s_settings.resources),
                    image_pull_policy="Always",
                    command=[],
                    args=k8s_settings.args,
                    env=env,
                    volume_mounts=volume_mounts,
                    tty=True,
                ),
            ],
            volumes=volumes,
            restart_policy="Never",
        ),
    )
    k8s_settings.api.create_namespaced_pod(k8s_settings.namespace, pod)
    return pod
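# Illustration (not part of the original source) of the GPU check above: a run
# that requests no GPUs is kept off "gpu" node groups. The resources dict is a
# hypothetical example.
_resources_cpu_only = {"requests": {"cpu": "1"}, "limits": {}}
_requests = _resources_cpu_only.get("requests") or {}
_limits = _resources_cpu_only.get("limits") or {}
assert all(
    float(group.get("nvidia.com/gpu", 0)) == 0
    for group in (_requests, _limits)
)  # True, so "gpu" would join unacceptable_node_group_types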
def add(ip, game_id, params):
    game = get_game_by_id(game_id)
    game.validate_params(params)
    uid = uuid.uuid4().hex[:12]
    name = "gaas-{}".format(uid)
    labels = {
        "app": "gaas",
        "game": game_id,
        "server": uid,
        "creator": ip,
    }
    metadata = client.V1ObjectMeta(
        labels=labels,
        name=name,
    )
    ip_ext = alloc_ip()
    extra_env = [
        client.V1EnvVar(name="IP_ALLOC", value=ip_ext),
        client.V1EnvVar(name="IP_CREATOR", value=ip)
    ]
    containers = game.make_deployment(params)
    generic_ports = []
    # TODO(bluecmd): Hack to work around that not all
    # ports are routed to the VIP by default. This allows
    # outgoing connections from inside the pod on the VIP.
    for p in range(50000, 50016):
        generic_ports.append(client.V1ServicePort(
            name="internal-tcp-" + str(p),
            port=p,
            target_port=p,
            protocol="TCP"))
        generic_ports.append(client.V1ServicePort(
            name="internal-udp-" + str(p),
            port=p,
            target_port=p,
            protocol="UDP"))
    for container in containers:
        if container.env:
            container.env.extend(extra_env)
        else:
            container.env = extra_env
        if not container.resources:
            container.resources = client.V1ResourceRequirements(
                limits={"cpu": "4", "memory": "32G"},
                requests={"cpu": "2", "memory": "16G"})
    deployment = client.V1Deployment(
        spec=client.V1DeploymentSpec(
            replicas=1,
            strategy=client.AppsV1beta1DeploymentStrategy(
                rolling_update=client.AppsV1beta1RollingUpdateDeployment(
                    max_surge=0,
                    max_unavailable=1
                )
            ),
            selector=client.V1LabelSelector(
                match_labels=labels,
            ),
            template=client.V1PodTemplateSpec(
                spec=client.V1PodSpec(
                    containers=containers,
                    termination_grace_period_seconds=0,
                    # TODO(bluecmd): Hack to work around that not all
                    # ports are routed to the VIP by default. This allows
                    # outgoing connections from inside the pod on the VIP.
                    security_context=client.V1PodSecurityContext(
                        sysctls=[client.V1Sysctl(
                            name='net.ipv4.ip_local_port_range',
                            value='50000 50015')]),
                    affinity=client.V1Affinity(
                        node_affinity=client.V1NodeAffinity(
                            required_during_scheduling_ignored_during_execution=client.V1NodeSelector(
                                node_selector_terms=[
                                    client.V1NodeSelectorTerm(
                                        match_expressions=[
                                            client.V1NodeSelectorRequirement(
                                                key="kubernetes.io/role",
                                                operator="NotIn",
                                                values=["shared"]
                                            )
                                        ]
                                    )
                                ]
                            )
                        )
                    )
                )
            )
        )
    )
    service = client.V1Service(
        spec=client.V1ServiceSpec(
            type="ClusterIP",
            selector=labels,
            ports=game.make_service(params) + generic_ports,
            external_i_ps=[ip_ext],
        )
    )
    deployment.metadata = metadata
    deployment.spec.template.metadata = metadata
    service.metadata = metadata
    service.metadata.annotations = {"kube-router.io/service.dsr": "tunnel"}
    client.AppsV1Api().create_namespaced_deployment(
        namespace=NAMESPACE,
        body=deployment,
    )
    service_resp = client.CoreV1Api().create_namespaced_service(
        namespace=NAMESPACE,
        body=service,
    )
    return {"uid": uid, "ip": ip}
def from_runs(cls, id: str, runs: List[Run]):
    k8s_name = 'tensorboard-' + id
    run_names_hash = K8STensorboardInstance.generate_run_names_hash(runs)
    volume_mounts = []
    for run in runs:
        mount = k8s.V1VolumeMount(
            name=cls.EXPERIMENTS_OUTPUT_VOLUME_NAME,
            mount_path=os.path.join(
                cls.TENSORBOARD_CONTAINER_MOUNT_PATH_PREFIX,
                run.owner, run.name),
            sub_path=os.path.join(run.owner, run.name))
        volume_mounts.append(mount)
    deployment_labels = {
        'name': k8s_name,
        'type': 'nauta-tensorboard',
        'nauta_app_name': 'tensorboard',
        'id': id,
        'runs-hash': run_names_hash
    }
    tensorboard_command = [
        "tensorboard",
        "--logdir", cls.TENSORBOARD_CONTAINER_MOUNT_PATH_PREFIX,
        "--port", "6006",
        "--host", "127.0.0.1"
    ]
    nauta_config = NautaPlatformConfig.incluster_init()
    tensorboard_image = nauta_config.get_tensorboard_image()
    tensorboard_proxy_image = nauta_config.get_activity_proxy_image()
    deployment = k8s.V1Deployment(
        api_version='apps/v1',
        kind='Deployment',
        metadata=k8s.V1ObjectMeta(name=k8s_name, labels=deployment_labels),
        spec=k8s.V1DeploymentSpec(
            replicas=1,
            selector=k8s.V1LabelSelector(match_labels=deployment_labels),
            template=k8s.V1PodTemplateSpec(
                metadata=k8s.V1ObjectMeta(labels=deployment_labels),
                spec=k8s.V1PodSpec(
                    tolerations=[
                        k8s.V1Toleration(key='master',
                                         operator='Exists',
                                         effect='NoSchedule')
                    ],
                    affinity=k8s.V1Affinity(
                        node_affinity=k8s.V1NodeAffinity(
                            required_during_scheduling_ignored_during_execution=k8s.V1NodeSelector(
                                node_selector_terms=[
                                    k8s.V1NodeSelectorTerm(match_expressions=[
                                        k8s.V1NodeSelectorRequirement(
                                            key="master",
                                            operator="In",
                                            values=["True"])
                                    ])
                                ]))),
                    containers=[
                        k8s.V1Container(name='app',
                                        image=tensorboard_image,
                                        command=tensorboard_command,
                                        volume_mounts=volume_mounts),
                        k8s.V1Container(
                            name='proxy',
                            image=tensorboard_proxy_image,
                            ports=[k8s.V1ContainerPort(container_port=80)],
                            readiness_probe=k8s.V1Probe(
                                period_seconds=5,
                                http_get=k8s.V1HTTPGetAction(
                                    path='/healthz', port=80)))
                    ],
                    volumes=[
                        k8s.V1Volume(
                            name=cls.EXPERIMENTS_OUTPUT_VOLUME_NAME,
                            persistent_volume_claim=  # noqa
                            k8s.V1PersistentVolumeClaimVolumeSource(
                                claim_name=cls.EXPERIMENTS_OUTPUT_VOLUME_NAME,
                                read_only=True))
                    ]))))
    service = k8s.V1Service(
        api_version='v1',
        kind='Service',
        metadata=k8s.V1ObjectMeta(
            name=k8s_name,
            labels={
                'name': k8s_name,
                'type': 'nauta-tensorboard',
                'nauta_app_name': 'tensorboard',
                'id': id
            }),
        spec=k8s.V1ServiceSpec(
            type='ClusterIP',
            ports=[k8s.V1ServicePort(name='web', port=80, target_port=80)],
            selector={
                'name': k8s_name,
                'type': 'nauta-tensorboard',
                'nauta_app_name': 'tensorboard',
                'id': id
            }))
    ingress = k8s.V1beta1Ingress(
        api_version='extensions/v1beta1',
        kind='Ingress',
        metadata=k8s.V1ObjectMeta(
            name=k8s_name,
            labels={
                'name': k8s_name,
                'type': 'nauta-tensorboard',
                'nauta_app_name': 'tensorboard',
                'id': id
            },
            annotations={
                'nauta.ingress.kubernetes.io/rewrite-target': '/',
                'kubernetes.io/ingress.class': 'nauta-ingress'
            }),
        spec=k8s.V1beta1IngressSpec(rules=[
            k8s.V1beta1IngressRule(
                host='localhost',
                http=k8s.V1beta1HTTPIngressRuleValue(paths=[
                    k8s.V1beta1HTTPIngressPath(
                        path='/tb/' + id + "/",
                        backend=k8s.V1beta1IngressBackend(
                            service_name=k8s_name, service_port=80))
                ]))
        ]))
    return cls(deployment=deployment, service=service, ingress=ingress)