def test_pod_spec(self):
    cluster_spec = ClusterSpec(cluster_spec_json=test_spec)
    pod = create_test_pod("test_spec")
    pod = cluster_spec.patch_pod(pod, "other")
    self.assertEqual(
        pod.metadata.labels["elasticdl.org/app-name"], "elasticdl"
    )
    self.assertEqual(pod.metadata.labels["elasticdl.org/site"], "hangzhou")
    self.assertEqual(
        pod.metadata.annotations["tag.elasticdl.org/optimization"],
        "enabled",
    )
    expected_tolerations = [
        client.V1Toleration(
            effect="NoSchedule",
            key="elasticdl.org/logic-pool",
            operator="Equal",
            value="ElasticDL",
        )
    ]
    self.assertEqual(pod.spec.tolerations, expected_tolerations)
    match_expressions = [
        client.V1NodeSelectorRequirement(
            key="elasticdl.org/logic-pool",
            operator="In",
            values=["ElasticDL"],
        )
    ]
    expected_affinity = client.V1Affinity(
        node_affinity=client.V1NodeAffinity(
            required_during_scheduling_ignored_during_execution=(
                client.V1NodeSelector(
                    node_selector_terms=[
                        client.V1NodeSelectorTerm(
                            match_expressions=match_expressions
                        )
                    ]
                )
            )
        )
    )
    self.assertEqual(pod.spec.affinity, expected_affinity)
    expected_env = [client.V1EnvVar(name="LOG_ENABLED", value="true")]
    self.assertEqual(pod.spec.containers[0].env, expected_env)

    pod = create_test_pod("test_spec")
    pod = cluster_spec.patch_pod(pod, PodType.MASTER)
    self.assertEqual(pod.metadata.labels["elasticdl.org/xyz"], "Sun")

    pod = create_test_pod("test_spec")
    pod = cluster_spec.patch_pod(pod, PodType.WORKER)
    self.assertEqual(pod.metadata.labels["elasticdl.org/xyz"], "Earth")

    pod = create_test_pod("test_spec")
    pod = cluster_spec.patch_pod(pod, PodType.PS)
    self.assertEqual(pod.metadata.labels["elasticdl.org/xyz"], "Moon")
def pv_create(core_v1_api, pv_name):
    body = client.V1PersistentVolume(
        api_version="v1",
        kind="PersistentVolume",
        metadata=client.V1ObjectMeta(name=pv_name, labels={"key": "localpvs"}),
        spec=client.V1PersistentVolumeSpec(
            capacity={"storage": "0.5Gi"},
            volume_mode="Filesystem",
            access_modes=["ReadWriteOnce"],
            persistent_volume_reclaim_policy="Recycle",
            local={
                "path": "/home/damu/Documents/kubernet/project/CDN_project/volumes/{_name}"
                .format(_name=pv_name)
            },
            node_affinity=client.V1VolumeNodeAffinity(
                required=client.V1NodeSelector(
                    node_selector_terms=[
                        client.V1NodeSelectorTerm(
                            match_expressions=[
                                client.V1NodeSelectorRequirement(
                                    key="kubernetes.io/hostname",
                                    operator="In",
                                    values=["minikube"],
                                )
                            ]
                        )
                    ]
                )
            ),
        ),
    )
    core_v1_api.create_persistent_volume(body=body)
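# Usage sketch for pv_create(), assuming a reachable cluster and a local
# kubeconfig; the PV name "pv-demo-0" is a hypothetical example.
from kubernetes import client, config

config.load_kube_config()  # or config.load_incluster_config() inside a pod
core_v1 = client.CoreV1Api()
pv_create(core_v1, "pv-demo-0")  # creates a 0.5Gi local PV pinned to the "minikube" node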
def template(context):
    labels = {"app": context["name"]}
    template_spec = client.V1PodSpec(containers=[
        client.V1Container(name=context["name"], image=context["image"])
    ])
    if "nodeSelector" in context:
        # V1PodSpec.node_selector expects a plain {label: value} dict, not a
        # V1NodeSelector object (that type belongs to node affinity).
        template_spec.node_selector = context["nodeSelector"]
    return client.V1Deployment(
        # V1Deployment maps to apps/v1; extensions/v1beta1 Deployments were
        # removed in Kubernetes 1.16.
        api_version="apps/v1",
        kind="Deployment",
        metadata=client.V1ObjectMeta(name=context["name"]),
        spec=client.V1DeploymentSpec(
            replicas=context["replicas"],
            selector=client.V1LabelSelector(match_labels=labels),
            template=client.V1PodTemplateSpec(
                metadata=client.V1ObjectMeta(labels=labels),
                spec=template_spec),
        ),
    )
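# Usage sketch for template(); the context keys mirror the ones the function
# reads ("name", "image", "replicas", optional "nodeSelector"). The values
# here are hypothetical examples.
deployment = template({
    "name": "nginx-demo",
    "image": "nginx:1.25",
    "replicas": 2,
    "nodeSelector": {"disktype": "ssd"},
})
# The resulting V1Deployment can then be submitted with AppsV1Api:
# client.AppsV1Api().create_namespaced_deployment(namespace="default", body=deployment)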
def clean_pod_template(pod_template, match_node_purpose="prefer", pod_type="worker"):
    """Normalize pod template and check for type errors."""
    if isinstance(pod_template, str):
        msg = (
            "Expected a kubernetes.client.V1Pod object, got %s. "
            "If trying to pass a yaml filename then use "
            "KubeCluster.from_yaml"
        )
        raise TypeError(msg % pod_template)
    if isinstance(pod_template, dict):
        msg = (
            "Expected a kubernetes.client.V1Pod object, got %s. "
            "If trying to pass a dictionary specification then use "
            "KubeCluster.from_dict"
        )
        raise TypeError(msg % str(pod_template))

    pod_template = copy.deepcopy(pod_template)

    # Make sure metadata / labels / env objects exist, so they can be modified
    # later without a lot of `is None` checks
    if pod_template.metadata is None:
        pod_template.metadata = client.V1ObjectMeta()
    if pod_template.metadata.labels is None:
        pod_template.metadata.labels = {}
    if pod_template.spec.containers[0].env is None:
        pod_template.spec.containers[0].env = []

    # Add default tolerations
    tolerations = [
        client.V1Toleration(
            key="k8s.dask.org/dedicated",
            operator="Equal",
            value=pod_type,
            effect="NoSchedule",
        ),
        # GKE currently does not permit creating taints on a node pool
        # with a `/` in the key field
        client.V1Toleration(
            key="k8s.dask.org_dedicated",
            operator="Equal",
            value=pod_type,
            effect="NoSchedule",
        ),
    ]
    if pod_template.spec.tolerations is None:
        pod_template.spec.tolerations = tolerations
    else:
        pod_template.spec.tolerations.extend(tolerations)

    # Add default node affinity to k8s.dask.org/node-purpose=worker
    if match_node_purpose != "ignore":
        # For readability
        affinity = pod_template.spec.affinity
        if affinity is None:
            affinity = client.V1Affinity()
        if affinity.node_affinity is None:
            affinity.node_affinity = client.V1NodeAffinity()

        # A common object for both a preferred and a required node affinity
        node_selector_term = client.V1NodeSelectorTerm(
            match_expressions=[
                client.V1NodeSelectorRequirement(
                    key="k8s.dask.org/node-purpose", operator="In", values=[pod_type]
                )
            ]
        )

        if match_node_purpose == "require":
            if (
                affinity.node_affinity.required_during_scheduling_ignored_during_execution
                is None
            ):
                affinity.node_affinity.required_during_scheduling_ignored_during_execution = client.V1NodeSelector(
                    node_selector_terms=[]
                )
            affinity.node_affinity.required_during_scheduling_ignored_during_execution.node_selector_terms.append(
                node_selector_term
            )
        elif match_node_purpose == "prefer":
            if (
                affinity.node_affinity.preferred_during_scheduling_ignored_during_execution
                is None
            ):
                affinity.node_affinity.preferred_during_scheduling_ignored_during_execution = (
                    []
                )
            preferred_scheduling_terms = [
                client.V1PreferredSchedulingTerm(
                    preference=node_selector_term, weight=100
                )
            ]
            affinity.node_affinity.preferred_during_scheduling_ignored_during_execution.extend(
                preferred_scheduling_terms
            )
        else:
            raise ValueError(
                'match_node_purpose must be one of "ignore", "prefer", or "require".'
            )
        pod_template.spec.affinity = affinity

    return pod_template
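# Minimal sketch for clean_pod_template() on a bare pod; "require" turns the
# k8s.dask.org/node-purpose affinity into a hard scheduling constraint instead
# of a preference. The container name and image are hypothetical examples.
from kubernetes import client

pod = client.V1Pod(
    spec=client.V1PodSpec(
        containers=[client.V1Container(name="worker", image="daskdev/dask")]
    )
)
pod = clean_pod_template(pod, match_node_purpose="require", pod_type="worker")
assert len(pod.spec.tolerations) == 2  # the two default dask tolerations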
def _generate_affinity(self):
    return k8s_client.V1Affinity(
        node_affinity=k8s_client.V1NodeAffinity(
            preferred_during_scheduling_ignored_during_execution=[
                k8s_client.V1PreferredSchedulingTerm(
                    weight=1,
                    preference=k8s_client.V1NodeSelectorTerm(
                        match_expressions=[
                            k8s_client.V1NodeSelectorRequirement(
                                key="some_node_label",
                                operator="In",
                                values=[
                                    "possible-label-value-1",
                                    "possible-label-value-2",
                                ],
                            )
                        ]
                    ),
                )
            ],
            required_during_scheduling_ignored_during_execution=k8s_client.V1NodeSelector(
                node_selector_terms=[
                    k8s_client.V1NodeSelectorTerm(
                        match_expressions=[
                            k8s_client.V1NodeSelectorRequirement(
                                key="some_node_label",
                                operator="In",
                                values=[
                                    "required-label-value-1",
                                    "required-label-value-2",
                                ],
                            )
                        ]
                    ),
                ]
            ),
        ),
        pod_affinity=k8s_client.V1PodAffinity(
            required_during_scheduling_ignored_during_execution=[
                k8s_client.V1PodAffinityTerm(
                    label_selector=k8s_client.V1LabelSelector(
                        match_labels={"some-pod-label-key": "some-pod-label-value"}
                    ),
                    namespaces=["namespace-a", "namespace-b"],
                    topology_key="key-1",
                )
            ]
        ),
        pod_anti_affinity=k8s_client.V1PodAntiAffinity(
            preferred_during_scheduling_ignored_during_execution=[
                k8s_client.V1WeightedPodAffinityTerm(
                    weight=1,
                    pod_affinity_term=k8s_client.V1PodAffinityTerm(
                        label_selector=k8s_client.V1LabelSelector(
                            match_expressions=[
                                k8s_client.V1LabelSelectorRequirement(
                                    key="some_pod_label",
                                    operator="NotIn",
                                    values=[
                                        "forbidden-label-value-1",
                                        "forbidden-label-value-2",
                                    ],
                                )
                            ]
                        ),
                        namespaces=["namespace-c"],
                        topology_key="key-2",
                    ),
                )
            ]
        ),
    )
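# Sketch: the returned V1Affinity plugs directly into a V1PodSpec. `builder`
# stands in for an instance of the (unshown) class that defines
# _generate_affinity(); the container name and image are hypothetical.
pod_spec = k8s_client.V1PodSpec(
    containers=[k8s_client.V1Container(name="app", image="busybox:1.36")],
    affinity=builder._generate_affinity(),
)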
def submit_job(args, command=None):
    container_image = args.container
    container_name = args.name
    body = client.V1Job(
        api_version="batch/v1",
        kind="Job",
        metadata=client.V1ObjectMeta(name=container_name),
    )
    body.status = client.V1JobStatus()
    template = client.V1PodTemplate()
    labels = {
        'hugin-job': "1",
        'hugin-job-name': f'{container_name}'
    }
    template.template = client.V1PodTemplateSpec(
        metadata=client.V1ObjectMeta(labels=labels)
    )
    tolerations = []
    env = []
    if args.environment:
        for env_spec in args.environment:
            env_name, env_value = env_spec.split("=", 1)
            env.append(client.V1EnvVar(name=env_name, value=env_value))

    container_args = dict(
        name=f"container-{container_name}",
        image=container_image,
        env=env,
    )
    if args.gpu:
        tolerations.append(client.V1Toleration(
            key='nvidia.com/gpu', operator='Exists', effect='NoSchedule'))
        container_args['resources'] = client.V1ResourceRequirements(
            limits={"nvidia.com/gpu": 1})
    if command or args.command:
        container_args['command'] = command if command else args.command

    container = client.V1Container(**container_args)
    pull_secrets = []
    if args.pull_secret is not None:
        pull_secrets.append(client.V1LocalObjectReference(name=args.pull_secret))
    pod_args = dict(containers=[container],
                    restart_policy='Never',
                    image_pull_secrets=pull_secrets)

    if tolerations:
        pod_args['tolerations'] = tolerations

    if args.node_selector is not None:
        parts = args.node_selector.split("=", 1)
        if len(parts) == 2:
            affinity = client.V1Affinity(
                node_affinity=client.V1NodeAffinity(
                    required_during_scheduling_ignored_during_execution=client.V1NodeSelector(
                        node_selector_terms=[client.V1NodeSelectorTerm(
                            match_expressions=[client.V1NodeSelectorRequirement(
                                key=parts[0], operator='In', values=[parts[1]])]
                        )]
                    )
                )
            )
            pod_args['affinity'] = affinity

    template.template.spec = client.V1PodSpec(**pod_args)
    body.spec = client.V1JobSpec(
        ttl_seconds_after_finished=1800, template=template.template)

    try:
        batch_v1.create_namespaced_job("default", body, pretty=True)
    except client.exceptions.ApiException as e:
        logging.critical(f"Failed to start job: {e.reason}")
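# Usage sketch for submit_job(); `args` mirrors the attributes the function
# reads, built from a plain Namespace rather than argparse parsing. All values
# are hypothetical, and batch_v1 must already be a configured BatchV1Api.
from argparse import Namespace

args = Namespace(
    container="busybox:1.36",
    name="demo-job",
    environment=["LOG_LEVEL=debug"],
    gpu=False,
    command=None,
    pull_secret=None,
    node_selector="kubernetes.io/hostname=minikube",
)
submit_job(args, command=["sh", "-c", "echo hello"])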
def add(ip, game_id, params):
    game = get_game_by_id(game_id)
    game.validate_params(params)
    uid = uuid.uuid4().hex[:12]
    name = "gaas-{}".format(uid)
    labels = {
        "app": "gaas",
        "game": game_id,
        "server": uid,
        "creator": ip,
    }
    metadata = client.V1ObjectMeta(
        labels=labels,
        name=name,
    )
    ip_ext = alloc_ip()
    extra_env = [
        client.V1EnvVar(name="IP_ALLOC", value=ip_ext),
        client.V1EnvVar(name="IP_CREATOR", value=ip),
    ]
    containers = game.make_deployment(params)
    generic_ports = []
    # TODO(bluecmd): Hack to work around that not all
    # ports are routed to the VIP by default. This allows
    # outgoing connections from inside the pod on the VIP.
    for p in range(50000, 50016):
        generic_ports.append(client.V1ServicePort(
            name="internal-tcp-" + str(p),
            port=p,
            target_port=p,
            protocol="TCP"))
        generic_ports.append(client.V1ServicePort(
            name="internal-udp-" + str(p),
            port=p,
            target_port=p,
            protocol="UDP"))
    for container in containers:
        if container.env:
            container.env.extend(extra_env)
        else:
            container.env = extra_env
        if not container.resources:
            container.resources = client.V1ResourceRequirements(
                limits={"cpu": "4", "memory": "32G"},
                requests={"cpu": "2", "memory": "16G"},
            )
    deployment = client.V1Deployment(
        spec=client.V1DeploymentSpec(
            replicas=1,
            # V1DeploymentSpec (apps/v1) takes V1DeploymentStrategy, not the
            # removed AppsV1beta1 strategy types.
            strategy=client.V1DeploymentStrategy(
                rolling_update=client.V1RollingUpdateDeployment(
                    max_surge=0,
                    max_unavailable=1,
                )
            ),
            selector=client.V1LabelSelector(
                match_labels=labels,
            ),
            template=client.V1PodTemplateSpec(
                spec=client.V1PodSpec(
                    containers=containers,
                    termination_grace_period_seconds=0,
                    # TODO(bluecmd): Hack to work around that not all
                    # ports are routed to the VIP by default. This allows
                    # outgoing connections from inside the pod on the VIP.
                    security_context=client.V1PodSecurityContext(
                        sysctls=[client.V1Sysctl(
                            name='net.ipv4.ip_local_port_range',
                            value='50000 50015')]),
                    affinity=client.V1Affinity(
                        node_affinity=client.V1NodeAffinity(
                            required_during_scheduling_ignored_during_execution=client.V1NodeSelector(
                                node_selector_terms=[
                                    client.V1NodeSelectorTerm(
                                        match_expressions=[
                                            client.V1NodeSelectorRequirement(
                                                key="kubernetes.io/role",
                                                operator="NotIn",
                                                values=["shared"],
                                            )
                                        ]
                                    )
                                ]
                            )
                        )
                    ),
                )
            ),
        )
    )
    service = client.V1Service(
        spec=client.V1ServiceSpec(
            type="ClusterIP",
            selector=labels,
            ports=game.make_service(params) + generic_ports,
            external_i_ps=[ip_ext],
        )
    )
    deployment.metadata = metadata
    deployment.spec.template.metadata = metadata
    service.metadata = metadata
    service.metadata.annotations = {"kube-router.io/service.dsr": "tunnel"}
    client.AppsV1Api().create_namespaced_deployment(
        namespace=NAMESPACE,
        body=deployment,
    )
    client.CoreV1Api().create_namespaced_service(
        namespace=NAMESPACE,
        body=service,
    )
    return {"uid": uid, "ip": ip}
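# Usage sketch for add(); get_game_by_id, alloc_ip and NAMESPACE come from the
# surrounding module, and the creator IP, game id, and params below are
# hypothetical examples.
result = add(ip="203.0.113.7", game_id="some-game", params={})
print(result["uid"], result["ip"])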
@classmethod
def from_runs(cls, id: str, runs: List[Run]):
    k8s_name = 'tensorboard-' + id
    run_names_hash = K8STensorboardInstance.generate_run_names_hash(runs)

    volume_mounts = []
    for run in runs:
        mount = k8s.V1VolumeMount(
            name=cls.EXPERIMENTS_OUTPUT_VOLUME_NAME,
            mount_path=os.path.join(
                cls.TENSORBOARD_CONTAINER_MOUNT_PATH_PREFIX,
                run.owner, run.name),
            sub_path=os.path.join(run.owner, run.name))
        volume_mounts.append(mount)

    deployment_labels = {
        'name': k8s_name,
        'type': 'nauta-tensorboard',
        'nauta_app_name': 'tensorboard',
        'id': id,
        'runs-hash': run_names_hash
    }

    tensorboard_command = [
        "tensorboard",
        "--logdir", cls.TENSORBOARD_CONTAINER_MOUNT_PATH_PREFIX,
        "--port", "6006",
        "--host", "127.0.0.1"
    ]

    nauta_config = NautaPlatformConfig.incluster_init()
    tensorboard_image = nauta_config.get_tensorboard_image()
    tensorboard_proxy_image = nauta_config.get_activity_proxy_image()

    deployment = k8s.V1Deployment(
        api_version='apps/v1',
        kind='Deployment',
        metadata=k8s.V1ObjectMeta(name=k8s_name, labels=deployment_labels),
        spec=k8s.V1DeploymentSpec(
            replicas=1,
            selector=k8s.V1LabelSelector(match_labels=deployment_labels),
            template=k8s.V1PodTemplateSpec(
                metadata=k8s.V1ObjectMeta(labels=deployment_labels),
                spec=k8s.V1PodSpec(
                    tolerations=[
                        k8s.V1Toleration(key='master',
                                         operator='Exists',
                                         effect='NoSchedule')
                    ],
                    affinity=k8s.V1Affinity(
                        node_affinity=k8s.V1NodeAffinity(
                            required_during_scheduling_ignored_during_execution=k8s.V1NodeSelector(
                                node_selector_terms=[
                                    k8s.V1NodeSelectorTerm(match_expressions=[
                                        k8s.V1NodeSelectorRequirement(
                                            key="master",
                                            operator="In",
                                            values=["True"])
                                    ])
                                ]))),
                    containers=[
                        k8s.V1Container(name='app',
                                        image=tensorboard_image,
                                        command=tensorboard_command,
                                        volume_mounts=volume_mounts),
                        k8s.V1Container(
                            name='proxy',
                            image=tensorboard_proxy_image,
                            ports=[k8s.V1ContainerPort(container_port=80)],
                            readiness_probe=k8s.V1Probe(
                                period_seconds=5,
                                http_get=k8s.V1HTTPGetAction(
                                    path='/healthz', port=80)))
                    ],
                    volumes=[
                        k8s.V1Volume(
                            name=cls.EXPERIMENTS_OUTPUT_VOLUME_NAME,
                            persistent_volume_claim=k8s.V1PersistentVolumeClaimVolumeSource(
                                claim_name=cls.EXPERIMENTS_OUTPUT_VOLUME_NAME,
                                read_only=True))
                    ]))))

    service = k8s.V1Service(
        api_version='v1',
        kind='Service',
        metadata=k8s.V1ObjectMeta(name=k8s_name,
                                  labels={
                                      'name': k8s_name,
                                      'type': 'nauta-tensorboard',
                                      'nauta_app_name': 'tensorboard',
                                      'id': id
                                  }),
        spec=k8s.V1ServiceSpec(
            type='ClusterIP',
            ports=[k8s.V1ServicePort(name='web', port=80, target_port=80)],
            selector={
                'name': k8s_name,
                'type': 'nauta-tensorboard',
                'nauta_app_name': 'tensorboard',
                'id': id
            }))

    ingress = k8s.V1beta1Ingress(
        api_version='extensions/v1beta1',
        kind='Ingress',
        metadata=k8s.V1ObjectMeta(
            name=k8s_name,
            labels={
                'name': k8s_name,
                'type': 'nauta-tensorboard',
                'nauta_app_name': 'tensorboard',
                'id': id
            },
            annotations={
                'nauta.ingress.kubernetes.io/rewrite-target': '/',
                'kubernetes.io/ingress.class': 'nauta-ingress'
            }),
        spec=k8s.V1beta1IngressSpec(rules=[
            k8s.V1beta1IngressRule(
                host='localhost',
                http=k8s.V1beta1HTTPIngressRuleValue(paths=[
                    k8s.V1beta1HTTPIngressPath(
                        path='/tb/' + id + "/",
                        backend=k8s.V1beta1IngressBackend(
                            service_name=k8s_name, service_port=80))
                ]))
        ]))

    return cls(deployment=deployment, service=service, ingress=ingress)
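# Sketch: building (not submitting) the tensorboard Kubernetes objects. Run is
# assumed to expose `owner` and `name` attributes; the constructor call below
# is a hypothetical stand-in for however Run instances are actually created.
instance = K8STensorboardInstance.from_runs(
    id="abc123",
    runs=[Run(owner="alice", name="exp-1")],
)
# instance.deployment / instance.service / instance.ingress can then be passed
# to the AppsV1Api, CoreV1Api and ExtensionsV1beta1Api create calls respectively.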