def __init__(self):
    """Create the failed-PVC queue and Kubernetes client, then cache AWS identity.

    The instance id and region are each read once from a single
    ``AWSMetaData`` helper and stored on the instance.
    """
    self.failed_pvc_q = Q.Queue()
    self.client = KubernetesApiClient()

    # One metadata helper serves both lookups.
    metadata = AWSMetaData()
    self.my_instance_id = metadata.get_instance_id()
    self.my_region = metadata.get_region()
def exec_commands(self, commands):
    """
    Run ``commands`` inside this pod's main container.

    :param commands: passed through unchanged to ``exec_cmd``.
    :return: whatever ``KubernetesApiClient.exec_cmd`` returns; per the
        original documentation this is a generator over the command output.
    """
    # Resolve the container first, then build a fresh client for this call.
    container_name = self.get_main_container_name()
    return KubernetesApiClient().exec_cmd(
        self.namespace, self.name, commands, container=container_name)
def __init__(self, kubectl=None, config_file=None):
    """
    Choose the Kubernetes client to use and reset the bookkeeping state.

    :param kubectl: ready-made client; used as-is when truthy.
    :param config_file: kube config path used to build a client when no
        ``kubectl`` is given. With neither argument, a client going through
        the proxy at localhost:8001 is created.
    """
    if not kubectl:
        if config_file:
            kubectl = KubernetesApiClient(config_file=config_file)
        else:
            kubectl = KubernetesApiClient(host="localhost", port="8001", use_proxy=True)
    self.kubectl = kubectl

    self.monitors = []
    self.record = AXKubeWaitingRecord()
    self.started = False
def _wait_for_deregistering_minions(self):
    """
    Block until the Kubernetes master lists at most one node.

    The kube config is downloaded first, then the node list is polled every
    15 seconds. Any exception raised while listing is logged and the poll
    is retried; there is no upper bound on the number of attempts.

    :return: None
    """
    logger.info("Waiting for Kubernetes master to de-register all existing minions")
    self._cluster_info.download_kube_config()
    config_path = self._cluster_info.get_kube_config_file_path()
    kube_client = KubernetesApiClient(config_file=config_path)

    while True:
        try:
            remaining = kube_client.api.list_node().items
            if len(remaining) <= 1:
                # A single remaining node is taken to be the master; no
                # further check is made on which node it actually is.
                logger.info("%sAll minions de-registered from master%s", COLOR_GREEN, COLOR_NORM)
                break
            logger.info("Remaining Kubernetes minions: %s",
                        [node.metadata.name for node in remaining])
        except Exception as e:
            logger.warning("Caught exception when listing nodes: %s", e)
        time.sleep(15)
def __init__(self, name, namespace="axuser"):
    """
    Remember the object's name and namespace and grab the core API handle.

    :param name: object name.
    :param namespace: namespace the object lives in (default "axuser").
    """
    self.name = name
    self.namespace = namespace
    self.size = None
    # Only the ``api`` attribute of the proxy-backed client is kept.
    self.client = KubernetesApiClient(use_proxy=True).api
def delete(self):
    """
    Delete the deployment, its replica sets and pods, then its service endpoint.

    The three Kubernetes deletions run inside one retried helper. An API
    error with status 404 ends that helper quietly; any other API error is
    re-raised to the retry/exception-parsing decorators. After the helper
    returns, the method sleeps two seconds and deletes the matching
    ``ServiceEndpoint``.
    """
    kube = KubernetesApiClient(use_proxy=True)
    delete_options = swagger_client.V1DeleteOptions()
    delete_options.grace_period_seconds = 1
    app_selector = "app={}".format(self.name)

    @parse_kubernetes_exception
    @retry(wait_exponential_multiplier=100, stop_max_attempt_number=10)
    def delete_in_provider():
        try:
            kube.extensionsvbeta.delete_namespaced_deployment(
                delete_options, self.namespace, self.name)
            kube.extensionsvbeta.deletecollection_namespaced_replica_set(
                self.namespace, label_selector=app_selector)
            kube.api.deletecollection_namespaced_pod(
                self.namespace, label_selector=app_selector)
        except swagger_client.rest.ApiException as e:
            if e.status == 404:
                # Already gone: nothing left to delete.
                return
            raise e

    delete_in_provider()
    time.sleep(2)
    ServiceEndpoint(self.name, self.namespace).delete()
def _get_s3_proxy_port(self, kubeconfig):
    """
    Look up the node port of the "s3proxy" service.

    :param kubeconfig: kube config file used to build the client.
    :return: ``node_port`` of the first port of the first service named
        "s3proxy" in ``S3PROXY_NAMESPACE``, or None when there is none.
    """
    services = KubernetesApiClient(config_file=kubeconfig).api.list_namespaced_service(S3PROXY_NAMESPACE)
    return next(
        (svc.spec.ports[0].node_port
         for svc in services.items
         if svc.metadata.name == "s3proxy"),
        None)
def __init__(self, pod_name, service_id, root_id, leaf_full_path, namespace="axuser", app_mode=False):
    """
    Record which pod's logs to collect and which job/application owns them.

    :param pod_name: We collect log for this pod
    :param service_id: ServiceID (job) / DeploymentID (application)
    :param root_id: WorkflowID (job) / ApplicationID (application)
    :param leaf_full_path: WorkflowPath (job) / DeploymentName (application)
    :param namespace: namespace stored together with the pod name
    :param app_mode: upload xxx-json.log upon termination
    """
    # Pod identity and ownership
    self._pod_name = pod_name
    self._namespace = namespace
    self._kubectl = KubernetesApiClient()
    self._service_id = service_id
    self._root_id = root_id
    self._leaf_full_path = leaf_full_path
    self._log_root = os.getenv("LOGMOUNT_PATH")

    # key:val = cid:cname
    self._container_info = {}
    self._local_log_dirs = {}

    # S3 destinations start out unset; presumably filled in by _set_s3()
    # below -- verify against that method.
    self._bucket = self._log_s3_prefix = None
    self._bucket_ax = self._log_s3_prefix_ax = None

    self._collectors = {}
    self._app_mode = app_mode
    self._set_s3()
def _install_s3_proxy_service(self, kube_config):
    """
    Create the s3proxy service unless one named "s3proxy" already exists.

    :param kube_config: kube config file used both for the lookup and for
        the ``kubectl create`` invocation.
    :return: None in every case.
    :raises subprocess.CalledProcessError: when ``kubectl create`` fails.
    """
    services = KubernetesApiClient(config_file=kube_config).api.list_namespaced_service(S3PROXY_NAMESPACE)
    if any(svc.metadata.name == "s3proxy" for svc in services.items):
        return None

    create_cmd = ["kubectl", "--kubeconfig", kube_config,
                  "create", "--namespace", S3PROXY_NAMESPACE,
                  "-f", "/ax/config/service/argo-wfe/s3proxy-svc.yml"]
    subprocess.check_call(create_cmd)
def get_pools_from_kubernetes():
    """
    Collect the metadata of every volume pool found among the PVCs.

    Lists the persistent volume claims in ``namespace`` (a name taken from
    the enclosing scope) and groups them by their "ax-pool-name" label.
    For each pool the "ax_metadata" annotation of the first claim that
    carries one is kept.

    Claims without labels, without the pool label, without annotations or
    without the metadata annotation are skipped with a log message.

    :return: dict mapping pool name to the raw "ax_metadata" annotation value.
    """
    client = KubernetesApiClient(use_proxy=True)
    vols = client.api.list_namespaced_persistent_volume_claim(namespace)
    assert isinstance(vols, swagger_client.V1PersistentVolumeClaimList)

    ret = {}
    for vol in vols.items or []:
        vol_name = vol.metadata.name
        logger.debug("Checking volume {}".format(vol_name))

        # labels is None when the claim has no labels at all
        pool_name = (vol.metadata.labels or {}).get("ax-pool-name", None)
        if pool_name is None:
            logger.debug(
                "Ignoring volume {} as it does not have labels that match a supported volumepool"
                .format(vol_name))
            continue

        if pool_name in ret:
            # Metadata for this pool was already taken from an earlier claim.
            continue

        # annotations is likewise None when the claim has none; treat that
        # the same as a missing "ax_metadata" entry instead of crashing.
        meta = (vol.metadata.annotations or {}).get("ax_metadata", None)
        if meta is None:
            logger.warning(
                "Ignoring volume {} as it does not have metadata".format(vol_name))
            continue
        ret[pool_name] = meta

    return ret
def __init__(self, client=None):
    """
    Store the Kubernetes client and the set of namespaces to ignore.

    :param client: client to use; when None a proxy-backed
        ``KubernetesApiClient`` is created.
    """
    self.client = KubernetesApiClient(use_proxy=True) if client is None else client
    self.ignored_namespaces = frozenset(
        ("kube-system", "default", "axsys", "axuser", "kube-public"))
def __init__(self, config_file, kubepoll=None, replacing=None, kube_config=None, kube_namespace=None):
    """
    Initialize from a kubernetes object config file (json or yaml).

    :param config_file: pathname to yaml or json config file; when falsy no
        ``KubeObjectConfigFile`` is built.
    :param kubepoll: kubepoll object; a ``KubeObjPoll`` bound to this
        object's client is created when it is falsy.
    :param replacing: dict for macro replacement (defaults to an empty dict).
    :param kube_config: optional saved kube_config for cluster config.
    :param kube_namespace: namespace stored on the object.
    """
    self._config_file = config_file
    self._replacing = replacing or {}
    self._kube_config = kube_config
    self._attribute_map = {}
    self._namespace = kube_namespace

    self._kubectl = KubernetesApiClient(config_file=self._kube_config)
    self._kube_poll = kubepoll or KubeObjPoll(kubectl=self._kubectl)
    if self._config_file:
        self._kube_conf_file = KubeObjectConfigFile(self._config_file, self._replacing)
    else:
        self._kube_conf_file = None

    # This is a hack for Daemon Set or multiple replicas: the monitor is used to make sure at
    # least one pod comes up, because it reports useful details on errors such as container
    # command exe error / image pull error. Once one pod has started the others are very likely
    # to start as well. For Daemon Set in particular the replica count is hard to know, so this
    # flag requests an extra poll: the caller shall poll the KubeObject.healthy flag until the
    # object is healthy.
    self._extra_poll = False
def __init__(self, name, namespace, client=None):
    """
    Store the object's identity and the Kubernetes client to use.

    :param name: object name.
    :param namespace: namespace the object lives in.
    :param client: client to use; when None a proxy-backed
        ``KubernetesApiClient`` is created.
    """
    self.name = name
    self.namespace = namespace
    self.client = KubernetesApiClient(use_proxy=True) if client is None else client
    self.ax_meta = {}
def _install_s3_proxy_pod(self, kube_config):
    """
    Create the s3proxy deployment unless one of its pods already exists.

    A pod counts as existing when its name starts with "s3proxy-deployment".

    :param kube_config: kube config file used both for the lookup and for
        the ``kubectl create`` invocation.
    :return: None in every case.
    :raises subprocess.CalledProcessError: when ``kubectl create`` fails.
    """
    pods = KubernetesApiClient(config_file=kube_config).api.list_namespaced_pod(S3PROXY_NAMESPACE)
    if any(str(pod.metadata.name).startswith("s3proxy-deployment") for pod in pods.items):
        return None

    create_cmd = ["kubectl", "--kubeconfig", kube_config,
                  "create", "--namespace", S3PROXY_NAMESPACE,
                  "-f", "/ax/config/service/argo-wfe/s3proxy.yml"]
    subprocess.check_call(create_cmd)
def _wait_for_minions(self): """ This step waits for all minions to come up and registered in Kubernetes master :return: """ # Get kubernetes access token self._cluster_info.download_kube_config() kube_config = self._cluster_info.get_kube_config_file_path() # Wait for nodes to be ready. # Because we made sure during pause that kubernetes master already knows that all minions are gone, # we don't need to worry about cached minions here logger.info("Wait 120 seconds before Kubernetes master comes up ...") time.sleep(120) kubectl = KubernetesApiClient(config_file=kube_config) logger.info("Waiting for all Kubelets to be ready ...") trail = 0 while True: try: all_kubelets_ready = True nodes = kubectl.api.list_node() logger.info("%s / %s nodes registered", len(nodes.items), self._total_nodes) if len(nodes.items) < self._total_nodes: all_kubelets_ready = False else: for n in nodes.items: kubelet_check = { "KubeletHasSufficientDisk", "KubeletHasSufficientMemory", "KubeletHasNoDiskPressure", "KubeletReady", "RouteCreated" } for cond in n.status.conditions: if cond.reason in kubelet_check: kubelet_check.remove(cond.reason) if kubelet_check: logger.info("Node %s not ready yet. Remaining Kubelet checkmarks: %s", n.metadata.name, kubelet_check) all_kubelets_ready = False break else: logger.info("Node %s is ready.", n.metadata.name) if all_kubelets_ready: logger.info("All Kubelets are ready") break except Exception as e: if "Max retries exceeded" in str(e): # If master API server is still not ready at this moment, we don't count as a trail trail -= 1 logger.info("Kubernetes API server not ready yet") else: logger.exception("Caught exception when listing nodes: %s", e) trail += 1 if trail > WAIT_FOR_MINION_REG_RETRY: raise RuntimeError("Timeout waiting for minions to come up. Please manually check cluster status") time.sleep(10)
def __init__(self, name, namespace="axuser"):
    """
    Store the object's identity, a proxy-backed client and the attribute map.

    :param name: object name.
    :param namespace: namespace the object lives in (default "axuser").
    """
    self.name = name
    self.namespace = namespace
    self.client = KubernetesApiClient(use_proxy=True)
    # Short attribute name -> dotted path inside the Kubernetes object
    self._attribute_map = dict(
        nodename="spec.node_name",
        nodeip="status.host_ip",
        containers="spec.containers",
    )
def __init__(self, name, client=None):
    """
    Load the Kubernetes specs needed for the application named ``name``.

    Three specs are collected: the registry secret (only when the registry
    is private), and the application-manager service and deployment read
    from a templated yaml file.

    :param name: application name; substituted as APPLICATION_NAME and used
        in the deployment's cost-id metadata.
    :param client: Kubernetes client to use; when None a proxy-backed
        ``KubernetesApiClient`` is created.
    :raises AssertionError: when a required spec or the AX_CLUSTER_NAME_ID
        environment variable is missing.
    :raises AXPlatformException: when the axops load balancer entry is falsy.
    """
    self.name = name
    if client is None:
        self._client = KubernetesApiClient(use_proxy=True)
    else:
        self._client = client

    self._registry_spec = None
    self._software_info = SoftwareInfo()
    if self._software_info.registry_is_private():
        secret = KubeObjectConfigFile(DEFAULT_SECRET_YAML_PATH, {"REGISTRY_SECRETS": self._software_info.registry_secrets})
        # Keep the V1Secret object from the file (the last one if several).
        for obj in secret.get_swagger_objects():
            if isinstance(obj, swagger_client.V1Secret):
                self._registry_spec = obj
        assert self._registry_spec, "Argo registry specification is missing"

    self._am_service_spec = None
    self._am_deployment_spec = None

    # AA-2471: Hack to add AXOPS_EXT_DNS to Application Manager
    elb = InternalRoute("axops", "axsys", client=self._client)
    # NOTE(review): indexing [0] happens before the emptiness check below, so
    # an empty "loadbalancer" list would presumably raise IndexError rather
    # than AXPlatformException -- confirm what elb.status() can return.
    elb_status = elb.status(with_loadbalancer_info=True)["loadbalancer"][0]
    if not elb_status:
        raise AXPlatformException("Could not get axops elb address {}".format(elb_status))

    # Values substituted into the application-manager yaml template
    replacements = {"NAMESPACE": self._software_info.image_namespace,
                    "VERSION": self._software_info.image_version,
                    "REGISTRY": self._software_info.registry,
                    "APPLICATION_NAME": self.name,
                    "AXOPS_EXT_DNS": elb_status}
    cluster_name_id = os.getenv("AX_CLUSTER_NAME_ID", None)
    assert cluster_name_id, "Cluster name id is None!"
    cluster_config = AXClusterConfig(cluster_name_id=cluster_name_id)
    # User clusters use a different template, which also needs the data bucket name.
    if not cluster_config.get_cluster_provider().is_user_cluster():
        axam_path = DEFAULT_AM_YAML_PATH
    else:
        axam_path = "/ax/config/service/argo-all/axam-svc.yml.in"
        replacements["ARGO_DATA_BUCKET_NAME"] = os.getenv("ARGO_DATA_BUCKET_NAME")
    logger.info("Using replacements: %s", replacements)

    k = KubeObjectConfigFile(axam_path, replacements)
    for obj in k.get_swagger_objects():
        if isinstance(obj, swagger_client.V1Service):
            self._am_service_spec = obj
        elif isinstance(obj, swagger_client.V1beta1Deployment):
            self._am_deployment_spec = obj
            # Tag the deployment with its own name (as a label) and a cost id.
            self._add_pod_metadata("deployment", self._am_deployment_spec.metadata.name, is_label=True)
            self._add_pod_metadata("ax_costid", json.dumps({
                "app": self.name,
                "service": "axam-deployment",
                "user": "******"
            }))
        else:
            logger.debug("Ignoring specification of type {}".format(type(obj)))
    assert self._am_service_spec and self._am_deployment_spec, "Application monitor specification is missing"
def __init__(self, name, namespace="axuser", client=None):
    """
    Args:
        name: String. Per the original documentation it needs to be 24
            characters and valid dns chars only (not enforced here).
        namespace: namespace of the endpoint (default "axuser").
        client: Kubernetes client handed to the parent initializer; a
            proxy-backed ``KubernetesApiClient`` is created when None.
    """
    self.name = name
    self.namespace = namespace
    if client is None:
        client = KubernetesApiClient(use_proxy=True)
    super(ServiceEndpoint, self).__init__(client)
def query_from_kubernetes(task_name):
    """
    Read the status of job ``task_name`` in the "axuser" namespace.

    :param task_name: name of the job to read.
    :return: the ``V1Job`` object, or None when the API call raises an
        ``ApiException``.
    """
    kube = KubernetesApiClient(use_proxy=True)
    try:
        job = kube.batchv.read_namespaced_job_status("axuser", task_name)
    except swagger_client.rest.ApiException as err:
        # NOTE(review): a 404 and every other API error both end in None, so
        # callers cannot tell "no such job" from a failed request -- confirm
        # that swallowing non-404 errors is intended.
        if err.status == 404:
            return None
        return None

    assert isinstance(
        job, swagger_client.V1Job), "Expect to see an object of type V1Job"
    return job
def get_host_ip(kube_config=None):
    """
    Return the InternalIP address of the single node in the cluster.

    :param kube_config: kube config file used to build the client.
    :return: the 'address' of the first node address whose 'type' is
        'InternalIP', or None when the node has no such address.
    :raises AssertionError: when the cluster does not have exactly one node.
    """
    nodes = KubernetesApiClient(config_file=kube_config).api.list_node().items
    assert len(nodes) == 1, "Need 1 node in the cluster"

    # Lazily convert each address so conversion stops at the first match.
    address_dicts = (addr.to_dict()
                     for node in nodes
                     for addr in node.status.addresses)
    return next(
        (entry['address'] for entry in address_dicts if entry['type'] == 'InternalIP'),
        None)
def init(self):
    """
    Prepare everything the monkey needs before it runs.

    In order: build the Kubernetes client, load and validate the yaml
    configuration, set up the e-mail notifier from the SMTP settings served
    by AxOps, look up the cluster DNS name, and create the boto3 EC2
    resource. Failures while talking to AxOps are logged and replaced by
    fallbacks (a disabled notifier, an empty DNS name).
    """
    # Kubernetes client: per-cluster config file when a cluster name is set
    if self.cluster_name:
        kube_conf = "/tmp/ax_kube/cluster_{}.conf".format(self.cluster_name)
        self.kube_client = KubernetesApiClient(config_file=kube_conf)
    else:
        self.kube_client = KubernetesApiClient()

    # Configuration file: explicit path, else a default depending on whether
    # the program runs as a frozen executable
    if self.config_file:
        config_location = self.config_file
    elif getattr(sys, 'frozen', False):
        config_location = '/ax/etc/config.yaml'
    else:
        config_location = 'config.yaml'
    with open(config_location) as config_fd:
        # NOTE(review): yaml.load without an explicit Loader can construct
        # arbitrary Python objects; consider yaml.safe_load if the config
        # never relies on Python-specific tags.
        raw_config = yaml.load(config_fd)
    self.monkey_config = config_schema(raw_config)

    # E-mail notifier
    try:
        tools_url = self.service_url.format(self.kube_client.url, AX_NAMESPACE,
                                            'axops-internal:8085/v1/tools?category=notification')
        smtp_config = self.kube_client.session.get(tools_url).json()['data']
        if smtp_config:
            self.email_client = ChaosMonkeyNotify(smtp_config[0],
                                                  self.monkey_config['notification_email'],
                                                  self.monkey_config['enable_notification'])
        else:
            self.email_client = ChaosMonkeyNotify(None, self.monkey_config['notification_email'], False)
    except Exception as exc:
        logger.exception("Failed to retrieve smtp configuration from AxOps. Will not notify. %s", str(exc))
        self.email_client = ChaosMonkeyNotify(None, self.monkey_config['notification_email'], False)

    # Cluster DNS name
    try:
        dns_url = self.service_url.format(self.kube_client.url, AX_NAMESPACE,
                                          'axops-internal:8085/v1/system/settings/dnsname')
        dns_reply = self.kube_client.session.get(dns_url)
        self.dns_name = "({})".format(dns_reply.json()['dnsname'])
    except Exception as exc:
        logger.exception("Failed to retrieve dnsname from AxOps. %s", str(exc))
        self.dns_name = ""

    # init ec2
    aws_profile = self.monkey_config['instance']['aws_profile']
    boto3.setup_default_session(profile_name=aws_profile)
    self.ec2 = boto3.resource('ec2')
def __init__(self):
    """
    Set up version, shutdown state, the Kubernetes client and singletons.

    ``SoftwareInfo`` is always instantiated; ``VolumeManager`` only when
    ``Cloud().target_cloud_aws()`` is truthy.
    """
    super(AXMon, self).__init__()
    self.version = __version__
    self._cluster_cond = threading.Condition()
    self._shutdown = False
    self._kubectl = KubernetesApiClient(use_proxy=True)

    # Initialize SoftwareInfo singleton
    self._software_info = SoftwareInfo()

    if not Cloud().target_cloud_aws():
        return
    # init the volume manager singleton
    VolumeManager()
def __init__(self, name, namespace="axuser"):
    """
    Store the object's identity and resolve the S3 locations it uses.

    :param name: object name.
    :param namespace: namespace the object lives in (default "axuser").
    """
    self.name = name
    self.namespace = namespace
    self.client = KubernetesApiClient(use_proxy=True)

    # this is the argo.services.service.Service object
    self.service = None
    self._host_vols = []

    cluster_name_id = AXClusterId().get_cluster_name_id()
    self._name_id = cluster_name_id

    # Log path: external flag, bucket and artifact prefix
    self._s3_bucket_ax_is_external = AXLogPath(cluster_name_id).is_external()
    self._s3_bucket_ax = AXLogPath(cluster_name_id).bucket()
    self._s3_key_prefix_ax = AXLogPath(cluster_name_id).artifact()

    # Cluster data path: bucket and artifact prefix
    self._s3_bucket = AXClusterDataPath(cluster_name_id).bucket()
    self._s3_key_prefix = AXClusterDataPath(cluster_name_id).artifact()

    self.software_info = SoftwareInfo()
    self._resources = AXResources()
def rest_api(self):
    """
    Thread that responds to the Flask api endpoints.

    Registers GET and PUT handlers for '/spot_instance_config' on a Flask
    app and then serves it on 0.0.0.0:6000; ``app.run`` is called directly,
    so this method does not return while the server is running.
    """
    k8s_client = KubernetesApiClient()
    app = Flask("MinionManagerRestAPI")

    def _update_config_map(enabled_str, asgs):
        """ Write the new settings into the minion-manager config map. """
        cmap = k8s_client.api.read_namespaced_config_map(
            namespace=MM_CONFIG_MAP_NAMESPACE, name=MM_CONFIG_MAP_NAME)
        cmap.data["MM_SPOT_INSTANCE_ENABLED"] = enabled_str
        # The scaling groups entry is only overwritten when a value was given.
        if asgs:
            cmap.data["MM_SCALING_GROUPS"] = asgs
        k8s_client.api.replace_namespaced_config_map(
            cmap, MM_CONFIG_MAP_NAMESPACE, MM_CONFIG_MAP_NAME)

    @app.route('/spot_instance_config', methods=['PUT'])
    def _update_spot_instances():
        """ Update the spot instances config from the 'enabled' and 'asgs' query args. """
        # NOTE(review): a request without 'enabled' makes .get() return None
        # and .title() raise AttributeError -- confirm callers always send it.
        enabled_str = request.args.get('enabled').title()
        assert enabled_str.lower() in ("true", "false")

        # Update the config-map first
        asgs = request.args.get('asgs', None)
        _update_config_map(enabled_str, asgs)

        if asgs:
            os.environ["MM_SCALING_GROUPS"] = asgs
            logger.info("Set MM_SCALING_GROUPS to %s", asgs)
            with self._asg_lock:
                # Clear the list in place, then store the new group names.
                del self._asg_metas[:]
                self._scaling_groups = asgs.split()
                # NOTE(review): start() is taken to run while the lock is
                # held -- confirm against the original layout.
                self.start()

        os.environ["MM_SPOT_INSTANCE_ENABLED"] = enabled_str
        logger.info("Set MM_SPOT_INSTANCE_ENABLED to %s", enabled_str)
        return jsonify({"status": "ok"})

    @app.route('/spot_instance_config', methods=['GET'])
    def _get_spot_instances():
        """ Get spot-instances config as stored in the config map. """
        cmap = k8s_client.api.read_namespaced_config_map(
            namespace=MM_CONFIG_MAP_NAMESPACE, name=MM_CONFIG_MAP_NAME)
        return jsonify({
            "status": cmap.data["MM_SPOT_INSTANCE_ENABLED"],
            "asgs": cmap.data["MM_SCALING_GROUPS"]
        })

    app.run(host='0.0.0.0', port=6000)
def exists(self):
    """
    Report whether the deployment, its config map and its service endpoint exist.

    The deployment and config map are read through helpers decorated with
    ``retry_unless(swallow_code=[404])`` (presumably yielding a falsy value
    on 404 -- verify against that decorator). The checks short-circuit in
    the order deployment, config map, service endpoint.

    :return: True when all three checks are truthy, otherwise False.
    """
    endpoint = ServiceEndpoint(self.name, self.namespace)
    kube = KubernetesApiClient(use_proxy=True)

    @retry_unless(swallow_code=[404])
    def _get_from_provider():
        return kube.extensionsvbeta.read_namespaced_deployment(self.namespace, self.name)

    @retry_unless(swallow_code=[404])
    def _get_configmap_from_provider():
        return kube.api.read_namespaced_config_map(self.namespace, self.cmap_name)

    return bool(_get_from_provider()
                and _get_configmap_from_provider()
                and endpoint.exists())
def __init__(self):
    """
    Set up the batch API handle, defaults and the S3 locations in use.

    The job name and service start out as None and the namespace is fixed
    to "axuser".
    """
    self.client = KubernetesApiClient(use_proxy=True)
    self.batchapi = self.client.batchv
    self.kube_namespace = "axuser"
    self.jobname = None

    # this is the argo.services.service.Service object
    self.service = None
    self._host_vols = []

    cluster_name_id = AXClusterId().get_cluster_name_id()
    self._name_id = cluster_name_id

    # Log path: external flag, bucket and artifact prefix
    self._s3_bucket_ax_is_external = AXLogPath(cluster_name_id).is_external()
    self._s3_bucket_ax = AXLogPath(cluster_name_id).bucket()
    self._s3_key_prefix_ax = AXLogPath(cluster_name_id).artifact()

    # Cluster data path: bucket and artifact prefix
    self._s3_bucket = AXClusterDataPath(cluster_name_id).bucket()
    self._s3_key_prefix = AXClusterDataPath(cluster_name_id).artifact()

    # Short attribute name -> dotted path inside the Kubernetes object
    self._attribute_map = dict(uuid="metadata.uid")
    self.software_info = SoftwareInfo()
    self._ax_resources = {}
def get_vols_from_kubernetes():
    """
    Describe every volume of this pool as one line of text.

    Lists the claims in ``self.namespace`` labelled with this pool's name
    (``self`` comes from the enclosing scope) and formats name, exclusivity,
    references, pool metadata and deletion mark for each.

    :return: the concatenated lines, or "" when the pool has no volumes.
    """
    kube = KubernetesApiClient(use_proxy=True)
    vols = kube.api.list_namespaced_persistent_volume_claim(
        self.namespace, label_selector="ax-pool-name={}".format(self.name))
    assert isinstance(vols, swagger_client.V1PersistentVolumeClaimList)

    summary_lines = []
    for vol in vols.items or []:
        name = vol.metadata.name
        exclusive = vol.metadata.annotations["ax_exclusive"] == 'True'
        # "ax_refs" holds a Python literal; parse it without eval
        refs = ast.literal_eval(vol.metadata.annotations["ax_refs"])
        pool_meta = json.loads(vol.metadata.annotations["ax_metadata"])
        deletion = vol.metadata.annotations.get("ax_deletion", "False")
        summary_lines.append(
            "Vol {} Excl {} Refs {} Attributes {} Marked for deletion {}\n".format(
                name, exclusive, refs, pool_meta, deletion))
    return "".join(summary_lines)
def __init__(self):
    """
    Build the applet's collaborators and resume nannying existing pods.

    The collaborators are created in a fixed order, ending with a call to
    ``_nanny_existing_pods()``.
    """
    # Kubernetes API client and kubelet client singleton
    self._kubectl = KubernetesApiClient()
    self._kubelet = KubeletClient()

    # Application record DB (table created on demand) and pod-log-manager pool
    self._app_record = ApplicationRecord(table_create=True)
    self._plm_pool = PodLogManagerPool()

    # Handshake server
    self._handshake_server = AXHandshakeServer(
        sock_addr=APPLET_SOCK,
        proto=DeploymentNannyProtocol,
    )

    # Application monitor client
    self._am = ApplicationManagerClient()

    # In case applet restarts, it should continue to nanny existing pods
    self._nanny_existing_pods()
def __init__(self, name, application):
    """
    Bind a deployment to the application it runs under.

    Per the original documentation an application maps to a kubernetes
    namespace and the deployment is created in that namespace.

    Args:
        name: deployment name
        application: the application that this deployment runs under
    """
    self.name = name
    self.application = application
    self.client = KubernetesApiClient(use_proxy=True)

    self._nameid = AXClusterId().get_cluster_name_id()
    self._software_info = SoftwareInfo()
    self._app_obj = Application(application)

    # No spec is set at construction time.
    self.spec = None
def get_vols_from_kubernetes():
    """
    Walk the claims of one volume pool and build a map of usable volumes.

    ``namespace``, ``pool_name``, ``size`` and ``attributes`` are not
    defined here; they come from the enclosing scope.

    Volumes without references are deleted, volumes whose pool metadata
    does not match ``size``/``attributes`` are marked for deletion, and the
    remaining ones are recorded in the result.

    :return: dict mapping volume name to a dict with the keys "taken",
        "taken-by" and "timer".
    """
    client = KubernetesApiClient(use_proxy=True)
    vols = client.api.list_namespaced_persistent_volume_claim(
        namespace, label_selector="ax-pool-name={}".format(pool_name))
    assert isinstance(vols, swagger_client.V1PersistentVolumeClaimList)
    ret = {}
    for vol in vols.items or []:
        logger.debug("Processing volume {}".format(
            json.dumps(vol.to_dict())))
        name = vol.metadata.name
        exclusive = vol.metadata.annotations["ax_exclusive"] == 'True'
        refs_str = vol.metadata.annotations["ax_refs"]
        # "ax_refs" holds a Python literal; parsed without eval
        refs = ast.literal_eval(refs_str)
        pool_meta = json.loads(vol.metadata.annotations["ax_metadata"])
        vol_obj = Volume(name, namespace)
        # NOTE(review): every unreferenced volume is deleted here, so from
        # this point on refs is never empty and the else-branch of the
        # mismatch check below looks unreachable -- confirm this is intended.
        if len(refs) == 0:
            vol_obj.delete()
            continue
        logger.warn(
            "Volume {}/{} not deleted as it has references".format(
                namespace, name))
        if pool_meta["size"] != size or pool_meta[
                "attributes"] != attributes:
            # On a size or attributes mismatch do not use this volume: if it
            # has no ref delete it, if it has a ref mark it for deletion
            # upon return to the pool
            if len(refs) > 0:
                vol_obj.mark_for_deletion()
            else:
                vol_obj.delete()
            continue
        assert len(refs) <= 1
        ret[name] = {
            "taken": exclusive,
            # The single reference when held exclusively, else an empty list
            "taken-by": refs[0] if exclusive else [],
            "timer": None
        }
    return ret