def start(self):
    """Launch graphscope instance on kubernetes cluster.

    Calls, in order, ``_create_namespace``, ``_create_role_and_binding``,
    ``_create_services`` and ``_waiting_for_services_ready``, then stores the
    value of ``_get_coordinator_endpoint()`` in ``self._coordinator_endpoint``
    and logs it. Nothing is returned; callers read the endpoint from the
    attribute.

    If any step raises, the handler sleeps one second, calls
    ``_dump_coordinator_failed_status()`` and ``stop()`` (each best-effort),
    and re-raises as ``K8sError``.

    Raises:
        K8sError: If any launch step failed. The triggering exception is
            chained as ``__cause__``.
    """
    try:
        self._create_namespace()
        self._create_role_and_binding()
        self._create_services()
        # NOTE(review): presumably gives the cluster a moment to register the
        # freshly created resources before polling them -- confirm.
        time.sleep(1)
        self._waiting_for_services_ready()
        self._coordinator_endpoint = self._get_coordinator_endpoint()
        logger.info(
            "Coordinator pod start successful with address %s, connecting to service ...",
            self._coordinator_endpoint,
        )
    except Exception as e:
        time.sleep(1)
        # Diagnostics and teardown are best-effort: a failure while dumping
        # the status must not prevent stop() from running, and neither may
        # replace the K8sError that reports the real launch failure.
        for cleanup in (self._dump_coordinator_failed_status, self.stop):
            try:
                cleanup()
            except Exception:
                logger.exception(
                    "Cleanup step %s failed after coordinator launch error",
                    getattr(cleanup, "__name__", repr(cleanup)),
                )
        raise K8sError(
            "Error when launching Coordinator on kubernetes cluster"
        ) from e
def wait_for_deployment_complete(api_client, namespace, name, timeout_seconds=60):
    """Poll a deployment every two seconds until it is fully rolled out.

    The deployment counts as complete when its status reports
    ``updated_replicas``, ``replicas`` and ``available_replicas`` all equal to
    ``spec.replicas`` and ``observed_generation`` is at least
    ``metadata.generation``.

    While it is not complete, the pods matching the deployment's
    ``match_labels`` selector are listed; any container that is not ready and
    has a restart count above zero is treated as a failed start.

    Args:
        api_client: Client object handed to ``kube_client.CoreV1Api`` and
            ``kube_client.AppsV1Api``.
        namespace: Namespace of the deployment.
        name: Name of the deployment.
        timeout_seconds: Polling stops once this many seconds have elapsed.

    Returns:
        True once the deployment is complete.

    Raises:
        K8sError: A container of the deployment is not ready and has restarted.
        TimeoutError: The deployment did not complete within the timeout.
    """
    pods_api = kube_client.CoreV1Api(api_client)
    deployments_api = kube_client.AppsV1Api(api_client)
    began = time.time()

    while time.time() - began < timeout_seconds:
        time.sleep(2)
        deployment = deployments_api.read_namespaced_deployment_status(
            namespace=namespace, name=name
        )
        status = deployment.status
        desired = deployment.spec.replicas

        rolled_out = (
            status.updated_replicas == desired
            and status.replicas == desired
            and status.available_replicas == desired
            and status.observed_generation >= deployment.metadata.generation
        )
        if rolled_out:
            return True

        # Not complete yet: look for containers that already crashed.
        selector = ",".join(
            label + "=" + value
            for label, value in deployment.spec.selector.match_labels.items()
        )
        pod_list = pods_api.list_namespaced_pod(
            namespace=namespace, label_selector=selector
        )
        for pod in pod_list.items:
            container_statuses = pod.status.container_statuses
            if container_statuses is None:
                continue
            if any(
                not container.ready and container.restart_count > 0
                for container in container_statuses
            ):
                raise K8sError("Deployment {} start failed.".format(name))

    raise TimeoutError("Waiting timeout for deployment {}".format(name))
def start(self):
    """Launch graphscope instance on kubernetes cluster.

    Calls, in order, ``_create_namespace``, ``_create_role_and_binding``,
    ``_create_services`` and ``_waiting_for_services_ready``, then returns the
    value of ``_get_coordinator_endpoint()``.

    If any step raises, the handler sleeps one second, logs an error, calls
    ``_dump_coordinator_status()`` and ``stop()`` (each best-effort), and
    re-raises as ``K8sError``.

    Raises:
        K8sError: If any launch step failed. The triggering exception is
            chained as ``__cause__``.

    Returns:
        The value returned by ``_get_coordinator_endpoint()``.
    """
    try:
        self._create_namespace()
        self._create_role_and_binding()
        self._create_services()
        self._waiting_for_services_ready()
        return self._get_coordinator_endpoint()
    except Exception as e:
        time.sleep(1)
        logger.error(
            "Error when launching Coordinator on kubernetes cluster.")
        # Diagnostics and teardown are best-effort: a failure while dumping
        # the status must not prevent stop() from running, and neither may
        # replace the K8sError that reports the real launch failure.
        for cleanup in (self._dump_coordinator_status, self.stop):
            try:
                cleanup()
            except Exception:
                logger.exception(
                    "Cleanup step %s failed after coordinator launch error",
                    getattr(cleanup, "__name__", repr(cleanup)),
                )
        raise K8sError(
            "Error when launching Coordinator on kubernetes cluster"
        ) from e
def _stream_event_impl(self, simple=False):
    """Poll events of ``self._pod_name`` and forward each new message.

    Once per second, until ``self._stopped`` is truthy, the events whose
    ``involvedObject.name`` equals the pod name are listed in
    ``self._namespace``. Every message not seen before is put on
    ``self._lines`` and logged at info level.

    Args:
        simple: Passed to the logger as ``extra={"simple": simple}``.

    Raises:
        K8sError: A newly seen event has the reason ``"Failed"``.
    """
    selector = "involvedObject.name=" + self._pod_name
    seen = set()

    while not self._stopped:
        time.sleep(1)
        try:
            listing = self._core_api.list_namespaced_event(
                namespace=self._namespace,
                field_selector=selector,
                timeout_seconds=2,
            )
        except K8SApiException:
            # API errors are ignored; the next iteration simply polls again.
            continue

        for event in listing.items:
            text = "{0}: {1}".format(self._pod_name, event.message)
            if not text or text in seen:
                continue
            seen.add(text)
            self._lines.put(text)
            logger.info(text, extra={"simple": simple})
            if event.reason == "Failed":
                raise K8sError("Kubernetes event error: {}".format(text))
def _connect(self):
    """Resolve a coordinator endpoint and connect the gRPC client to it.

    The endpoint is chosen from ``self._config_params``; the first matching
    case wins:

    1. ``addr`` is not None: it is used as the endpoint directly.
    2. ``enable_k8s`` is truthy: a ``KubernetesCluster`` is built from the
       ``k8s_*`` options and the endpoint is whatever its ``start()``
       returns. If ``k8s_namespace`` was None, it is filled in from the
       cluster's ``get_namespace()``.
    3. ``hosts`` is a non-empty list and ``num_workers`` is positive: the
       coordinator is started through ``_launch_coordinator_on_local``.

    Afterwards a ``GRPCClient`` is created for the endpoint, the method waits
    for the service to be ready, connects, stores the session id, engine
    config and pod name list on ``self`` and registers the session in
    ``_session_dict``.

    Returns:
        tuple: ``(proc, endpoint)``. ``proc`` is None unless case 3 applied.

    Raises:
        K8sError: ``enable_k8s`` is set but ``k8s_etcd_image`` or
            ``k8s_gs_image`` is None.
        RuntimeError: None of the three cases applies.
    """
    params = self._config_params
    proc = None

    if params["addr"] is not None:
        # try connect to exist coordinator
        self._session_type = types_pb2.HOSTS
        endpoint = params["addr"]
    elif params["enable_k8s"]:
        required_images = ("k8s_etcd_image", "k8s_gs_image")
        if any(params[key] is None for key in required_images):
            raise K8sError("None image found.")
        api_client = kube_config.new_client_from_config(
            **params["k8s_client_config"]
        )
        self._session_type = types_pb2.K8S
        self._k8s_cluster = KubernetesCluster(
            api_client=api_client,
            namespace=params["k8s_namespace"],
            service_type=params["k8s_service_type"],
            num_workers=params["num_workers"],
            gs_image=params["k8s_gs_image"],
            etcd_image=params["k8s_etcd_image"],
            gie_graph_manager_image=params["k8s_gie_graph_manager_image"],
            zookeeper_image=params["k8s_zookeeper_image"],
            image_pull_policy=params["k8s_image_pull_policy"],
            image_pull_secrets=params["k8s_image_pull_secrets"],
            vineyard_cpu=params["k8s_vineyard_cpu"],
            vineyard_mem=params["k8s_vineyard_mem"],
            vineyard_shared_mem=params["k8s_vineyard_shared_mem"],
            etcd_cpu=params["k8s_etcd_cpu"],
            etcd_mem=params["k8s_etcd_mem"],
            zookeeper_cpu=params["k8s_zookeeper_cpu"],
            zookeeper_mem=params["k8s_zookeeper_mem"],
            gie_graph_manager_cpu=params["k8s_gie_graph_manager_cpu"],
            gie_graph_manager_mem=params["k8s_gie_graph_manager_mem"],
            engine_cpu=params["k8s_engine_cpu"],
            engine_mem=params["k8s_engine_mem"],
            coordinator_cpu=float(params["k8s_coordinator_cpu"]),
            coordinator_mem=params["k8s_coordinator_mem"],
            volumes=params["k8s_volumes"],
            waiting_for_delete=params["k8s_waiting_for_delete"],
            timeout_seconds=params["timeout_seconds"],
        )
        endpoint = self._k8s_cluster.start()
        if params["k8s_namespace"] is None:
            params["k8s_namespace"] = self._k8s_cluster.get_namespace()
    elif (
        isinstance(params["hosts"], list)
        and len(params["hosts"]) != 0
        and params["num_workers"] > 0
    ):
        # launch coordinator with hosts
        proc, endpoint = _launch_coordinator_on_local(params)
        self._session_type = types_pb2.HOSTS
    else:
        raise RuntimeError("Session initialize failed.")

    # waiting service ready
    self._grpc_client = GRPCClient(endpoint)
    self._grpc_client.waiting_service_ready(
        timeout_seconds=params["timeout_seconds"],
        enable_k8s=params["enable_k8s"],
    )

    # connect to rpc server
    try:
        session_info = self._grpc_client.connect()
        self._session_id, self._engine_config, self._pod_name_list = session_info
        _session_dict[self._session_id] = self
    except Exception:
        # Terminate a coordinator process that is still running before
        # re-raising; termination itself is best-effort.
        still_running = proc is not None and proc.poll() is None
        if still_running:
            try:
                proc.terminate()
            except:  # noqa: E722
                pass
        raise
    return proc, endpoint
def get_service_endpoints(api_client, namespace, name, type, timeout_seconds=60):
    """Build the ``<ip>:<port>`` endpoints of a service.

    Args:
        api_client: ApiClient
            A kubernetes ApiClient object, initialized with the client args.
        namespace: str
            Namespace the service belongs to.
        name: str
            Service name.
        type: str
            Service type. Valid options are NodePort, LoadBalancer and
            ClusterIP.
        timeout_seconds: int
            Only used for the LoadBalancer type: how long to keep polling
            for an ingress before raising TimeoutError.

    Raises:
        TimeoutError: The LoadBalancer service reported no ingress within
            ``timeout_seconds``.
        K8sError: The service type is not one of (NodePort, LoadBalancer,
            ClusterIP), or no ip or no port was found for the service.

    Returns:
        A list with one endpoint per (ip, port) combination.
        NodePort: ``<host_ip>:<node_port>``, host ips taken from the pods
        matching the service selector.
        LoadBalancer: ``<ingress hostname or ip>:<port>``.
        ClusterIP: ``<cluster_ip>:<port>``.
    """
    start_time = time.time()
    core_api = kube_client.CoreV1Api(api_client)
    svc = core_api.read_namespaced_service(name=name, namespace=namespace)

    # The pods backing the service are looked up through its label selector.
    selector = ",".join(
        label + "=" + value for label, value in svc.spec.selector.items()
    )
    pods = core_api.list_namespaced_pod(namespace=namespace, label_selector=selector)

    ips = []
    ports = []
    if type == "NodePort":
        ips.extend(pod.status.host_ip for pod in pods.items)
        ports.extend(port.node_port for port in svc.spec.ports)
    elif type == "LoadBalancer":
        # Re-read the service once per second until an ingress is reported.
        while True:
            svc = core_api.read_namespaced_service(name=name, namespace=namespace)
            ingresses = svc.status.load_balancer.ingress
            if ingresses is not None:
                ips.extend(
                    ingress.ip if ingress.hostname is None else ingress.hostname
                    for ingress in ingresses
                )
                ports.extend(port.port for port in svc.spec.ports)
                break
            time.sleep(1)
            if time.time() - start_time > timeout_seconds:
                raise TimeoutError("LoadBalancer service type is not supported yet.")
    elif type == "ClusterIP":
        ips.append(svc.spec.cluster_ip)
        ports.extend(port.port for port in svc.spec.ports)
    else:
        raise K8sError("Service type {0} is not supported yet".format(type))

    # generate endpoint
    if not (ips and ports):
        raise K8sError("Get {0} service {1} failed.".format(type, name))
    return ["{0}:{1}".format(ip, port) for ip in ips for port in ports]