def check_election_status(self, config):
    """
    Retrieves the leader-election annotation from a given object, and
    submits metrics and a service check.

    An integration warning is sent if the object is not retrievable, or no
    record is found. Monitors on the service check should have no-data
    alerts enabled to account for this.

    The config object requires the following fields:
        namespace (prefix for the metrics and check)
        record_kind (endpoints or configmap)
        record_name
        record_namespace
        tags (optional)

    It reads the following agent configuration:
        kubernetes_kubeconfig_path: default is to use in-cluster config
    """
    try:
        record = self._get_record(
            config.get("record_kind", ""),
            config.get("record_name", ""),
            config.get("record_namespace", "")
        )
        self._report_status(config, record)
    except Exception as e:
        self.warning(
            "Cannot retrieve leader election record {}: {}".format(
                config.get("record_name", ""), e))
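# A minimal usage sketch for check_election_status(); it assumes a call from
# inside the same AgentCheck subclass, and every field value is illustrative.
example_election_config = {
    "namespace": "kube_controller_manager",   # prefix for metrics and the check
    "record_kind": "endpoints",               # "endpoints" or "configmap"
    "record_name": "kube-controller-manager",
    "record_namespace": "kube-system",
    "tags": ["env:example"],                  # optional extra tags
}
# self.check_election_status(example_election_config)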
def _report_status(self, config, record):
    # Compute prefix for gauges and service check
    prefix = config.get("namespace") + ".leader_election"

    # Compute tags for gauges and service check
    tags = []
    for n in ["record_kind", "record_name", "record_namespace"]:
        if n in config:
            tags.append("{}:{}".format(n, config[n]))
    tags += config.get("tags", [])

    # Sanity check on the record
    valid, reason = record.validate()
    if not valid:
        self.service_check(prefix + ".status", AgentCheck.CRITICAL,
                           tags=tags, message=reason)
        return  # Stop here

    # Report metrics
    self.monotonic_count(prefix + ".transitions", record.transitions, tags)
    self.gauge(prefix + ".lease_duration", record.lease_duration, tags)

    leader_status = AgentCheck.OK
    if record.seconds_until_renew + record.lease_duration < 0:
        leader_status = AgentCheck.CRITICAL
    self.service_check(prefix + ".status", leader_status,
                       tags=tags, message=record.summary)
def main(cfg):
    # Parse config
    if os.path.isfile(cfg):
        config = ConfigParser.ConfigParser()
        config.read(cfg)
        test_name = config.get('kraken', 'test_type')
        namespace = config.get('kraken', 'name')
        label = config.get('kraken', 'label')
        master_label = config.get('kraken', 'master_label')
        if label is None:
            print(Fore.YELLOW +
                  'label is not provided, assuming you are okay with deleting '
                  'any of the available nodes except the master\n')
            label = "undefined"
        if test_name == "kill_node":
            node_test(label, master_label)
        elif test_name == "crash_node":
            node_crash(label, master_label)
        elif test_name == "kill_master":
            master_test(label, master_label)
        elif test_name == "kill_etcd":
            etcd_test(label, master_label)
        else:
            print(Fore.RED +
                  '%s is not a valid scenario, please choose from '
                  'kill_node, crash_node, kill_etcd, kill_master' % test_name)
            sys.exit(1)
    else:
        help()
        sys.exit(1)
def main(cfg):
    # Parse config
    if os.path.isfile(cfg):
        config = ConfigParser.ConfigParser()
        config.read(cfg)
        test_name = config.get('kraken', 'test_type')
        namespace = config.get('kraken', 'name')
        label = config.get('kraken', 'label')
        # wabouham ADDED: master_label is normally "node_type=master"
        master_label = config.get('kraken', 'master_label')
        print(Fore.YELLOW + 'label is: %s\n' % label)
        print(Fore.YELLOW + 'master_label is: %s\n' % master_label)
        if label is None:
            print(Fore.YELLOW +
                  'label is not provided, assuming you are okay with deleting '
                  'any of the available nodes except the master\n')
            label = "undefined"
        if test_name == "kill_node":
            node_test(label, master_label)
        elif test_name == "crash_node":
            node_crash(label, master_label)
        elif test_name == "kill_master":
            master_test(label, master_label)
        elif test_name == "kill_etcd":
            etcd_test(label, master_label)
        else:
            print(Fore.RED +
                  '%s is not a valid scenario, please choose from '
                  'kill_node, crash_node, kill_etcd, kill_master' % test_name)
            sys.exit(1)
    else:
        help()
        sys.exit(1)
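# A hypothetical config file for the kraken main() variants above; the section
# and option names come straight from the config.get() calls, while the values
# are illustrative. This sketch uses Python 3's configparser, whereas the code
# above imports the Python 2 ConfigParser module.
import configparser

SAMPLE_KRAKEN_CFG = """
[kraken]
test_type = kill_node
name = kraken
label = node-role.kubernetes.io/worker=
master_label = node-role.kubernetes.io/master=
"""

parser = configparser.ConfigParser()
parser.read_string(SAMPLE_KRAKEN_CFG)
assert parser.get('kraken', 'test_type') == 'kill_node'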
def __init__(self, config, trawler):
    # Takes in the config object and the trawler instance it's behind
    # In k8s or outside
    self.use_kubeconfig = trawler.use_kubeconfig
    # Namespace to find management pods
    self.namespace = config.get('namespace', 'default')
    # Maximum frequency to pull data from APIC
    self.max_frequency = int(config.get('frequency', 600))
    if self.use_kubeconfig:
        logger.error("Analytics metrics currently only available in cluster")
    else:
        self.find_hostname_and_certs()
def __init__(self, config, trawler):
    # Takes in the config object and the trawler instance it's behind
    # Use kubeconfig or in-cluster config for k8s comms
    self.use_kubeconfig = trawler.use_kubeconfig
    # Namespace to find management pods
    self.namespace = config.get('namespace', 'default')
    # Maximum frequency to pull data from APIC
    self.max_frequency = int(config.get('frequency', 600))
    # Cloud manager username to use for REST calls
    self.username = config.get('username', 'admin')
    # Load password from secret `cloudmanager_password`
    self.password = trawler.read_secret('cloudmanager_password')
    if self.password is None:
        # Use out of box default password
        self.password = '******'
    self.hostname = self.find_hostname()
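# A sketch of the per-plugin config dict these __init__ methods consume, based
# on the keys read above; the values and the `SomeNet` class name are
# illustrative assumptions, not defaults from the real project.
example_plugin_config = {
    'namespace': 'apic',   # namespace containing the management pods
    'frequency': 600,      # maximum polling frequency, in seconds
    'username': 'admin',   # cloud manager user for REST calls
}
# plugin = SomeNet(example_plugin_config, trawler)  # hypothetical names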
def get_current_context():
    '''
    Read ~/.kube/config to get the current context from the merged kubeconfig
    --> 'kubectl config current-context'

    :return: current_context
    '''
    config = _config_loader()
    if config:
        return config.get('current-context')
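# Example use of get_current_context(); the output depends on the local merged
# kubeconfig, equivalent to `kubectl config current-context`.
current = get_current_context()
if current:
    print('Current context: {}'.format(current))
else:
    print('No kubeconfig found or no current-context set')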
def _report_status(self, config, record):
    # Compute prefix for gauges and service check
    prefix = config.get("namespace") + ".leader_election"

    # Compute tags for gauges and service check
    tags = []
    for k, v in {
        "record_kind": record.kind,
        "record_name": config.get("record_name"),
        "record_namespace": config.get("record_namespace"),
    }.items():
        if v is not None:
            tags.append("{}:{}".format(k, v))
    tags += config.get("tags", [])

    # Sanity check on the record
    valid, reason = record.validate()
    if not valid:
        self.service_check(prefix + ".status", AgentCheck.CRITICAL,
                           tags=tags, message=reason)
        return  # Stop here

    # Report metrics
    self.monotonic_count(prefix + ".transitions", record.transitions, tags)
    self.gauge(prefix + ".lease_duration", record.lease_duration, tags)

    leader_status = AgentCheck.OK
    message = record.summary
    if record.seconds_until_renew + record.lease_duration < 0:
        leader_status = AgentCheck.CRITICAL
    if leader_status is AgentCheck.OK:
        message = None
    self.service_check(prefix + ".status", leader_status,
                       tags=tags, message=message)
def _isExist(cluster_name):
    '''
    Read ~/.kube/config to check if the provided cluster name is already
    configured under ~/.kube/config

    :param cluster_name: Name of cluster to check against.
    :return: True or False
    '''
    config = _config_loader()
    if config:
        for cluster in config.get('clusters', []):
            if cluster['name'] == cluster_name:
                return True
    return False
def get_kubeasyList(output=False):
    kubeasyList = {}
    config = _config_loader()
    if config:
        for cluster in config.get('clusters', []):
            if cluster['name'] == get_current_context():
                kubeasyList['** ' + cluster['name']] = cluster['cluster']['server']
            else:
                kubeasyList['   ' + cluster['name']] = cluster['cluster']['server']
    if output and kubeasyList:
        header = ['K8s Cluster', 'Master']
        print('\n List of clusters which are currently ready to use for kubeasy:')
        print(colorama.Fore.GREEN +
              '\n - \'kubeasy -d\' to access Kubernetes dashboard for the current context.')
        print(colorama.Fore.GREEN +
              ' - \'kubeasy -c <cluster_name>\' to switch to another listed context.\n')
        _print_table(kubeasyList, header)
        print(colorama.Fore.GREEN + '\nNote: ** indicates current context.\n')
    elif output and not kubeasyList:
        print(colorama.Fore.YELLOW +
              'Currently there are no clusters configured for kubeasy, '
              'Please check "kubeasy -h" for how to add new AKS\\GKE clusters.')
    else:
        return kubeasyList
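# Example uses of the kubeasy helpers above; the cluster name is illustrative.
if not _isExist('my-aks-cluster'):
    print('my-aks-cluster is not yet configured in ~/.kube/config')

get_kubeasyList(output=True)    # print the formatted cluster table
clusters = get_kubeasyList()    # or fetch a {name: master-url} dict for scripting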
def main(cfg):
    # Parse and read the config
    if os.path.isfile(cfg):
        with open(cfg, 'r') as f:
            config = yaml.full_load(f)
        config = config["shutdown"][0]
        cloud_type = config['cloud_type']
        kubeconfig_path = config.get("kubeconfig_path", "~/.kube/config")
        shutdown_master_num = config.get("shutdown_master_num", "all")
        shutdown_worker_num = config.get("shutdown_worker_num", "all")
        shutdown_infra_num = config.get("shutdown_infra_num", "all")
        ssh_file = config.get("ssh_file", "")
        initialize_clients(kubeconfig_path)
        downtime = calc_time(config.get("downtime", "300 s"))

        masters = list_nodes("node-role.kubernetes.io/master")
        backup_etcd(masters[1])
        if shutdown_master_num != "all":
            masters = masters[:int(shutdown_master_num)]

        workers = list_nodes(
            "node-role.kubernetes.io/worker=,node-role.kubernetes.io/infra!=")
        if shutdown_worker_num != "all":
            workers = workers[:int(shutdown_worker_num)]

        infras = list_nodes("node-role.kubernetes.io/infra")
        if shutdown_infra_num != "all":
            infras = infras[:int(shutdown_infra_num)]

        node_list = workers + infras + masters
        logging.info('node list ' + str(node_list))

        # Stop nodes based on cloud provider
        if cloud_type == "aws":
            aws = aws_node_scenarios()
            for node in node_list:
                logging.info('stop node ' + str(node))
                aws.node_stop_scenario(node)
        elif cloud_type == "azure" or cloud_type == "az":
            logging.info("azure")
            az_account = run_cmd("az account list -o yaml")
            az = azure_node_scenarios(az_account)
            for node in node_list:
                logging.info('stop node ' + str(node))
                az.node_stop_scenario(node)
        elif cloud_type == "gcp":
            logging.info('gcp')
            project = run_cmd('gcloud config get-value project').split('\n')[0].strip()
            gcp = gcp_node_scenarios(project)
            for node in node_list:
                logging.info('stop node ' + str(node))
                gcp.node_stop_scenario(node)
        else:
            logging.info("Shutting down using ssh")
            shutdown_via_ssh(node_list, ssh_file)

        # Wait period
        time.sleep(downtime)

        # Restart cluster: start nodes based on cloud provider
        if cloud_type == "aws":
            for node in node_list:
                logging.info('start node ' + str(node))
                aws.node_start_scenario(node)
        elif cloud_type == "azure" or cloud_type == "az":
            for node in node_list:
                logging.info('start node ' + str(node))
                az.node_start_scenario(node)
        elif cloud_type == "gcp":
            for node in node_list:
                logging.info('start node ' + str(node))
                gcp.node_start_scenario(node)
        else:
            logging.info("Cloud type " + str(cloud_type) + " is not supported ")
            sys.exit(1)

        wait_for_all_nodes_ready(masters)
        wait_for_all_nodes_ready(workers)
        wait_for_all_nodes_ready(infras)
        cluster_operators = run_cmd("oc get co")
        run_cmd("oc get nodes")
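# A hypothetical scenario file for the shutdown main() above; the keys mirror
# the config.get() calls and the values are illustrative.
import yaml

SAMPLE_SHUTDOWN_CFG = """
shutdown:
- cloud_type: aws                 # aws | azure/az | gcp | anything else -> ssh
  kubeconfig_path: ~/.kube/config
  shutdown_master_num: all        # "all" or an integer count
  shutdown_worker_num: 2
  shutdown_infra_num: 0
  ssh_file: ~/.ssh/id_rsa         # only used for the ssh fallback
  downtime: 300 s
"""

parsed = yaml.full_load(SAMPLE_SHUTDOWN_CFG)["shutdown"][0]
assert parsed["cloud_type"] == "aws"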
def login_with_kubeconfig(**_: Any) -> Optional[credentials.ConnectionInfo]:
    """
    A minimalistic login handler that can get raw data from a kubeconfig file.

    Authentication capabilities can be limited to keep the code short & simple.
    No parsing or sophisticated multi-step token retrieval is performed.

    This login function is intended to make Kopf runnable in trivial cases
    when neither pykube-ng nor the official client library are installed.
    """

    # As per https://kubernetes.io/docs/concepts/configuration/organize-cluster-access-kubeconfig/
    kubeconfig = os.environ.get('KUBECONFIG')
    if not kubeconfig and os.path.exists(os.path.expanduser('~/.kube/config')):
        kubeconfig = '~/.kube/config'
    if not kubeconfig:
        return None

    paths = [path.strip() for path in kubeconfig.split(os.pathsep)]
    paths = [os.path.expanduser(path) for path in paths if path]

    # As prescribed: if the file is absent or non-deserialisable, then fail.
    # The first value wins.
    current_context: Optional[str] = None
    contexts: Dict[Any, Any] = {}
    clusters: Dict[Any, Any] = {}
    users: Dict[Any, Any] = {}
    for path in paths:
        with open(path, 'rt', encoding='utf-8') as f:
            config = yaml.safe_load(f.read()) or {}

        if current_context is None:
            current_context = config.get('current-context')

        for item in config.get('contexts', []):
            if item['name'] not in contexts:
                contexts[item['name']] = item.get('context') or {}

        for item in config.get('clusters', []):
            if item['name'] not in clusters:
                clusters[item['name']] = item.get('cluster') or {}

        for item in config.get('users', []):
            if item['name'] not in users:
                users[item['name']] = item.get('user') or {}

    # Once fully parsed, use the current context only.
    if current_context is None:
        raise credentials.LoginError('Current context is not set in kubeconfigs.')
    context = contexts[current_context]
    cluster = clusters[context['cluster']]
    user = users[context['user']]

    # Unlike pykube's login, we do not make a fake API request to refresh the token.
    provider_token = user.get('auth-provider', {}).get('config', {}).get('access-token')

    # Map the retrieved fields into the credentials object.
    return credentials.ConnectionInfo(
        server=cluster.get('server'),
        ca_path=cluster.get('certificate-authority'),
        ca_data=cluster.get('certificate-authority-data'),
        insecure=cluster.get('insecure-skip-tls-verify'),
        certificate_path=user.get('client-certificate'),
        certificate_data=user.get('client-certificate-data'),
        private_key_path=user.get('client-key'),
        private_key_data=user.get('client-key-data'),
        username=user.get('username'),
        password=user.get('password'),
        token=user.get('token') or provider_token,
        default_namespace=context.get('namespace'),
        priority=PRIORITY_OF_KUBECONFIG,
    )
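# A minimal kubeconfig exercising login_with_kubeconfig() above; the server
# address, names, and token are placeholders, not real credentials.
import os
import tempfile

MINIMAL_KUBECONFIG = """
current-context: example
contexts:
- name: example
  context: {cluster: example-cluster, user: example-user, namespace: default}
clusters:
- name: example-cluster
  cluster: {server: 'https://127.0.0.1:6443'}
users:
- name: example-user
  user: {token: example-token}
"""

with tempfile.NamedTemporaryFile('wt', suffix='.yaml', delete=False) as f:
    f.write(MINIMAL_KUBECONFIG)
os.environ['KUBECONFIG'] = f.name
info = login_with_kubeconfig()
assert info is not None and info.server == 'https://127.0.0.1:6443'
assert info.default_namespace == 'default'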
def create_deps(self, configList, is_wait_ip=True):
    result = {'datas': {'node': '', 'deps': {}}, 'error': '', 'status': True}
    # Attributes filtered out of the response
    result_deps_pool = [
        'name', 'deploy_name', 'service_name', 'host_ip', 'rf_port',
        'mysql_port', 'ssh_port', 'pod_ip', 'web_ssh_port', 'port_map',
        'error', 'res'
    ]

    reqData = []
    for config in configList:
        # Convert port values to strings
        for i in range(len(config.get('ports'))):
            config.get('ports')[i] = str(config.get('ports')[i])
        is_resource_occupied = config.get('is_resource_occupied') or 0
        life_days = config.get('life_days') or 0
        is_count = config.get('is_count') or 0
        max_count = config.get('max_count') or 0
        app_info = config.get('app_info') or None
        if app_info:
            # Fill missing app_info fields with "unknown"
            if not app_info['project']:
                app_info['project'] = "unknown"
            if not app_info['department']:
                app_info['department'] = "unknown"
            if not app_info['panel']:
                app_info['panel'] = "unknown"
            if not app_info['packet']:
                app_info['packet'] = "unknown"
            if not app_info['component']:
                app_info['component'] = "unknown"
            if not app_info['usage']:
                app_info['usage'] = "unknown"

        reqData.append({
            "id": str(uuid.uuid4()),
            "uid": self.uid,
            "image": config.get('image'),
            "label": '',
            "name": 'sdkuser',
            "command": config.get('command'),
            "cpu": config.get('cpu'),
            "memory": config.get('memory'),
            "ephemeral_storage": config.get('ephemeral_storage'),
            "ports": ','.join(config.get('ports')),
            "is_build": 0,
            "is_persistent": 0,
            "is_resource_occupied": is_resource_occupied,
            "p_name": '',
            "p_path": '',
            "p_storage": 0,
            "sub_net_name": [],
            "node_name": '',
            "pic": '',
            "coordinate": [],
            "is_set": -1,
            "node_labels": config.get('node_labels'),
            "life_days": life_days,
            "is_count": is_count,
            "max_count": max_count,
            "app_info": app_info
        })

    headers = {
        "Accept": "*/*",
        "Accept-Encoding": "gzip, deflate",
        "User-Agent": "python-requests/2.9.1",
    }
    url = REQUEST_URL + "/api/multiDeploy"
    response = requests.post(url=url, data=json.dumps(reqData),
                             headers=headers, verify=False)
    response = json.loads(response.content.decode('UTF-8'))
    # pprint(response)
    datas = response.get('data')
    status = response.get('status')
    # print(status)
    if status:
        # Did the container count exceed the limit?
        if status.get('overNumLimit') == True:
            result['error'] = 'Container count exceeds the creation limit'
            result['status'] = False
            return result
        # Are resources insufficient?
        if status.get('outOfResource') == True:
            result['error'] = ('No allocatable resources; please release some '
                               'of your resources or notify the resource owner')
            result['status'] = False
            return result
        del status["outOfResource"]
        for k, v in status.items():
            if v == False:
                result['error'] = 'Error during container creation'
                result['status'] = False
                break
    # print('===========================')

    # All containers were created successfully
    if result['status'] == True:
        # Adjust the result structure
        id_to_name_map = {}  # mapping from id to name
        podNames = []  # list of container names, used when fetching pod IPs
        result.get("datas")["node"] = datas.get('node')
        # Build the per-container attribute bodies and fill the maps
        for id, name in datas.get('name').items():
            result.get("datas").get("deps")[name] = {}
            id_to_name_map[id] = name
            podNames.append(name)
        # Populate container attributes
        for k, v in datas.items():
            if k in result_deps_pool:
                for dep_id, dep_val in v.items():
                    dep_name = id_to_name_map[dep_id]
                    if dep_name:
                        result.get("datas").get("deps")[dep_name][k] = dep_val
        # pprint(result)

        # is_wait_ip is True -> poll for pod_ip
        if is_wait_ip:
            url = REQUEST_URL + "/api/getPodIps"
            for i in range(POD_IP_TIMES):
                time.sleep(POD_IP_DELAY)
                res = requests.post(url=url, data=json.dumps(podNames),
                                    headers=headers, verify=False)
                res = json.loads(res.content.decode('UTF-8'))
                # print('get_pod_res====================', res)
                if res.get('status') == True:
                    pod_ips = res.get('datas')
                    for name, props in result.get("datas").get("deps").items():
                        result.get("datas").get("deps")[name]['pod_ip'] = pod_ips[name]
                    break
            # Timed out: delete containers that never got a pod_ip and flag an error
            delete_dep_list = []
            for d_name, d_item in result.get("datas").get("deps").items():
                # If any container is missing its pod_ip, delete them all
                if not d_item.get('pod_ip'):
                    if len(podNames) > 0:
                        for p_name in podNames:
                            delete_dep_list.append({"name": p_name})
                    print('Some containers did not get a pod_ip; deleting all containers:')
                    print(podNames)
                    delete_res = self.delete_deps(delete_dep_list)
                    print('Deletion result:')
                    print(delete_res)
                    # TODO: reset the result data
                    result = {'datas': {'node': '', 'deps': {}}, 'error': '', 'status': True}
                    result['error'] = ('Creation failed: some pod_ips were not obtained '
                                       'because containers failed to start; please check '
                                       'the configuration')
                    result['status'] = False
                    # raise Exception(result['error'])
        # is_wait_ip is False -> pod_ip = '0.0.0.0'
        else:
            for name, props in result.get("datas").get("deps").items():
                result.get("datas").get("deps")[name]['pod_ip'] = '0.0.0.0'

        # After the pod IPs are handled, check the container status
        url_status = REQUEST_URL + "/api/isPodRunning?uid=" + self.uid
        res_status = requests.post(url=url_status, data=json.dumps(podNames),
                                   headers=headers, verify=False)
        res_status = json.loads(res_status.content.decode('UTF-8'))
        if res_status.get('status') == False:
            result['error'] = res_status.get('error')
            result['status'] = False

    if result.get('datas').get('deps') == {} and result.get('status') == True:
        result['error'] = 'No container creation data was returned'
        result['status'] = False
    # pprint(result)
    return result
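# A sketch of one entry in the configList argument to create_deps(), derived
# from the config.get() calls above; every value here is illustrative.
example_dep_config = {
    'image': 'registry.example.com/redis:5.0',
    'command': '',                   # container start command, if any
    'cpu': '1',
    'memory': '2Gi',
    'ephemeral_storage': '10Gi',
    'ports': [6379, 22],             # converted to strings before the request
    'node_labels': '',
    'is_resource_occupied': 0,
    'life_days': 7,
    'is_count': 0,
    'max_count': 0,
    'app_info': {
        'project': 'demo', 'department': 'qa', 'panel': 'test',
        'packet': 'base', 'component': 'redis', 'usage': 'ci',
    },
}
# result = client.create_deps([example_dep_config], is_wait_ip=True)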
initimage = None
proxyimage = None
imagepullpolicy = IMAGEPULLPOLICY
debug = DEBUG
verbosity = VERBOSITY
namespaces = NAMESPACES
version = ISTIO_VERSION

configmap = None
if configmaps.items:
    found = [c for c in configmaps.items if c.metadata.name == CONFIGMAP]
    if found:
        configmap = found[0]

if configmap is not None:
    print("Applying settings from configmap")
    config = yaml.safe_load(configmap.data['config'])
    policy = config.get('policy', 'enabled')
    initializername = config.get('initializerName', INITIALIZER)
    namespaces = config.get('namespaces', NAMESPACES)
    params = config.get('params')
    if params is not None:
        initimage = params.get('initImage', initimage)
        proxyimage = params.get('proxyImage', proxyimage)
        imagepullpolicy = params.get('imagePullPolicy', imagepullpolicy)
        debug = params.get('debugMode', debug)
        verbosity = params.get('verbosity', verbosity)
        version = params.get('version', version)

if proxyimage is None:
    proxyimage = 'docker.io/istio/proxy_debug:%s' % version
if initimage is None:
    initimage = 'docker.io/istio/proxy_init:%s' % version

if not debug:
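# A sketch of the ConfigMap `config` payload the snippet above parses; the key
# names follow the config.get()/params.get() calls, and all values (including
# the initializer name and image tags) are illustrative assumptions.
SAMPLE_INITIALIZER_CONFIG = """
policy: enabled
initializerName: sidecar.initializer.istio.io
namespaces: []            # an empty list is assumed to mean all namespaces
params:
  initImage: docker.io/istio/proxy_init:0.2.12
  proxyImage: docker.io/istio/proxy_debug:0.2.12
  imagePullPolicy: IfNotPresent
  debugMode: false
  verbosity: 2
  version: 0.2.12
"""

parsed = yaml.safe_load(SAMPLE_INITIALIZER_CONFIG)
assert parsed['policy'] == 'enabled'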
def main(cfg):
    # Parse and read the config
    if os.path.isfile(cfg):
        config = configparser.ConfigParser()
        config.read(cfg)
        watch_nodes = config.get('cerberus', 'watch_nodes')
        cerberus_publish_status = config.get('cerberus',
                                             'cerberus_publish_status')
        watch_etcd = config.get('cerberus', 'watch_etcd')
        etcd_namespace = config.get('cerberus', 'etcd_namespace')
        watch_openshift_apiserver = config.get('cerberus',
                                               'watch_openshift_apiserver')
        openshift_apiserver_namespace = \
            config.get('cerberus', 'openshift_apiserver_namespace')
        watch_kube_apiserver = config.get('cerberus', 'watch_kube_apiserver')
        kube_apiserver_namespace = config.get('cerberus',
                                              'kube_apiserver_namespace')
        watch_monitoring_stack = config.get('cerberus',
                                            'watch_monitoring_stack')
        monitoring_stack_namespace = config.get('cerberus',
                                                'monitoring_stack_namespace')
        watch_kube_controller = config.get('cerberus', 'watch_kube_controller')
        kube_controller_namespace = config.get('cerberus',
                                               'kube_controller_namespace')
        iterations = config.get('tunings', 'iterations')
        sleep_time = config.get('tunings', 'sleep_time')
        daemon_mode = config.get('tunings', 'daemon_mode')

        # Start cerberus
        logging.info("Starting cerberus")

        # Run http server using a separate thread
        # if cerberus is asked to publish the status.
        # It is served by the http server.
        if cerberus_publish_status == "True":
            logging.info("Publishing cerberus status at http://localhost:8086")
            _thread.start_new_thread(start_server, ())

        # Initialize the start iteration to 0
        iteration = 0

        # Set the number of iterations to loop to infinity
        # if daemon mode is enabled
        # or else set it to the provided iterations count in the config
        if daemon_mode == "True":
            logging.info("Daemon mode enabled, cerberus will monitor forever")
            logging.info("Ignoring the iterations set")
            iterations = float('inf')
        else:
            iterations = int(iterations)

        # Loop to run the components status checks starts here
        while iteration < iterations:
            iteration += 1

            # Monitor nodes status
            if watch_nodes == "True":
                watch_nodes_status = monitor_nodes()
                logging.info("Iteration %s: Node status: %s"
                             % (iteration, watch_nodes_status))
            else:
                logging.info("Cerberus is not monitoring nodes, "
                             "so setting the status to True and "
                             "assuming that the nodes are ready")
                watch_nodes_status = True

            # Monitor etcd status
            if watch_etcd == "True":
                watch_etcd_status = monitor_namespace(etcd_namespace)
                logging.info("Iteration %s: Etcd member pods status: %s"
                             % (iteration, watch_etcd_status))
            else:
                logging.info("Cerberus is not monitoring ETCD, "
                             "so setting the status to True and "
                             "assuming that the ETCD member pods are ready")
                watch_etcd_status = True

            # Monitor openshift-apiserver status
            if watch_openshift_apiserver == "True":
                watch_openshift_apiserver_status = \
                    monitor_namespace(openshift_apiserver_namespace)
                logging.info("Iteration %s: OpenShift apiserver status: %s"
                             % (iteration, watch_openshift_apiserver_status))
            else:
                logging.info("Cerberus is not monitoring openshift-apiserver, "
                             "so setting the status to True "
                             "and assuming that the "
                             "openshift-apiserver is up and running")
                watch_openshift_apiserver_status = True

            # Monitor kube apiserver status
            if watch_kube_apiserver == "True":
                watch_kube_apiserver_status = \
                    monitor_namespace(kube_apiserver_namespace)
                logging.info("Iteration %s: Kube ApiServer status: %s"
                             % (iteration, watch_kube_apiserver_status))
            else:
                logging.info("Cerberus is not monitoring Kube ApiServer, so "
                             "setting the status to True and assuming that "
                             "the Kube ApiServer is up and running")
                watch_kube_apiserver_status = True

            # Monitor prometheus/monitoring stack
            if watch_monitoring_stack == "True":
                watch_monitoring_stack_status = \
                    monitor_namespace(monitoring_stack_namespace)
                logging.info("Iteration %s: Monitoring stack status: %s"
                             % (iteration, watch_monitoring_stack_status))
            else:
                logging.info("Cerberus is not monitoring prometheus/monitoring, "
                             "so setting the status to True "
                             "and assuming that the monitoring stack is "
                             "up and running")
                watch_monitoring_stack_status = True

            # Monitor kube controller
            if watch_kube_controller == "True":
                watch_kube_controller_status = \
                    monitor_namespace(kube_controller_namespace)
                logging.info("Iteration %s: Kube controller status: %s"
                             % (iteration, watch_kube_controller_status))
            else:
                logging.info("Cerberus is not monitoring kube controller, so "
                             "setting the status to True and assuming that "
                             "the kube controller is up and running")
                watch_kube_controller_status = True

            # Sleep for the specified duration
            logging.info("Sleeping for the specified duration: %s"
                         % (sleep_time))
            time.sleep(float(sleep_time))

            # Set the cerberus status by checking the status of the
            # watched components/resources for the http server to publish it
            if (watch_nodes_status and watch_etcd_status
                    and watch_openshift_apiserver_status
                    and watch_kube_apiserver_status
                    and watch_monitoring_stack_status
                    and watch_kube_controller_status):
                cerberus_status = True
            else:
                cerberus_status = False

            if cerberus_publish_status == "True":
                publish_cerberus_status(cerberus_status)
        else:
            logging.info("Completed watching for the specified number of "
                         "iterations: %s" % (iterations))
    else:
        logging.error("Could not find a config at %s, please check" % (cfg))
        sys.exit(1)
def deploy(deployment, chart, environment, namespace=None,
           helm_config_overrides_implicit=None,
           helm_config_overrides_string=None,
           version=None, timeout=None, force=False, atomic=False,
           cleanup_on_fail=False):
    """
    Deploy a JupyterHub.

    Expects the following files to exist in current directory

    {chart}/ (Helm deployment chart)
    deployments/
    - {deployment}
        - image/ (optional)
        - secrets/
            - {environment}.yaml
        - config/
            - common.yaml
            - {environment}.yaml

    A docker image from deployments/{deployment}/image is expected to be
    already built and available with imagebuilder.
    `jupyterhub.singleuser.image.tag` will be automatically set to this image
    tag.
    """
    if helm_config_overrides_implicit is None:
        helm_config_overrides_implicit = []
    if helm_config_overrides_string is None:
        helm_config_overrides_string = []

    config = get_config(deployment)

    name = f'{deployment}-{environment}'

    if namespace is None:
        namespace = name
    helm_config_files = [f for f in [
        os.path.join('deployments', deployment, 'config', 'common.yaml'),
        os.path.join('deployments', deployment, 'config', f'{environment}.yaml'),
    ] if os.path.exists(f)]

    helm_secret_files = [f for f in [
        # Support for secrets in same repo
        os.path.join('deployments', deployment, 'secrets', f'{environment}.yaml'),
        # Support for secrets in a submodule repo
        os.path.join('secrets', 'deployments', deployment, 'secrets', f'{environment}.yaml'),
    ] if os.path.exists(f)]

    if config.get('images'):
        for image in config['images']['images']:
            # We can support other charts that wrap z2jh by allowing various
            # config paths where we set image tags and names.
            # We default to one sublevel, but we can do multiple levels.
            # With the PANGEO chart, this could be set to
            # `pangeo.jupyterhub.singleuser.image`.
            helm_config_overrides_string.append(
                f'{image.helm_substitution_path}.tag={image.tag}')
            helm_config_overrides_string.append(
                f'{image.helm_substitution_path}.name={image.name}')

    with ExitStack() as stack:
        decrypted_secret_files = [
            stack.enter_context(decrypt_file(f)) for f in helm_secret_files
        ]

        # Just in time for k8s access, activate the cluster credentials
        stack.enter_context(cluster_auth(deployment))
        helm_upgrade(
            name,
            namespace,
            chart,
            helm_config_files + decrypted_secret_files,
            helm_config_overrides_implicit,
            helm_config_overrides_string,
            version,
            timeout,
            force,
            atomic,
            cleanup_on_fail,
        )
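# Example invocation of deploy(), assuming the layout from the docstring is in
# place; all names here (mychart, datahub, prod) are placeholders.
#
#   mychart/                               (Helm chart)
#   deployments/datahub/config/common.yaml
#   deployments/datahub/config/prod.yaml
#   deployments/datahub/secrets/prod.yaml
#
deploy(
    deployment='datahub',
    chart='mychart',
    environment='prod',
)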
def main(cfg):
    # Parse and read the config
    if os.path.isfile(cfg):
        config = configparser.ConfigParser()
        config.read(cfg)
        watch_nodes = config.get('cerberus', 'watch_nodes')
        cerberus_publish_status = config.get('cerberus',
                                             'cerberus_publish_status')
        watch_etcd = config.get('cerberus', 'watch_etcd')
        etcd_namespace = config.get('cerberus', 'etcd_namespace')
        watch_openshift_apiserver = config.get('cerberus',
                                               'watch_openshift_apiserver')
        openshift_apiserver_namespace = \
            config.get('cerberus', 'openshift_apiserver_namespace')
        watch_kube_apiserver = config.get('cerberus', 'watch_kube_apiserver')
        kube_apiserver_namespace = config.get('cerberus',
                                              'kube_apiserver_namespace')
        watch_monitoring_stack = config.get('cerberus',
                                            'watch_monitoring_stack')
        monitoring_stack_namespace = config.get('cerberus',
                                                'monitoring_stack_namespace')
        watch_kube_controller = config.get('cerberus', 'watch_kube_controller')
        kube_controller_namespace = config.get('cerberus',
                                               'kube_controller_namespace')
        watch_machine_api = config.get('cerberus',
                                       'watch_machine_api_components')
        machine_api_namespace = config.get('cerberus', 'machine_api_namespace')
        watch_kube_scheduler = config.get('cerberus', 'watch_kube_scheduler')
        kube_scheduler_namespace = config.get('cerberus',
                                              'kube_scheduler_namespace')
        kubeconfig_path = config.get('cerberus', 'kubeconfig_path')
        iterations = config.get('tunings', 'iterations')
        sleep_time = config.get('tunings', 'sleep_time')
        daemon_mode = config.get('tunings', 'daemon_mode')

        # Initialize clients
        if not os.path.isfile(kubeconfig_path):
            kubeconfig_path = None
        initialize_clients(kubeconfig_path)

        # Start cerberus
        logging.info("Starting cerberus")

        # Run http server using a separate thread
        # if cerberus is asked to publish the status.
        # It is served by the http server.
        if cerberus_publish_status == "True":
            logging.info("Publishing cerberus status at http://localhost:8086")
            _thread.start_new_thread(start_server, ())

        # Initialize the start iteration to 0
        iteration = 0

        # Set the number of iterations to loop to infinity
        # if daemon mode is enabled
        # or else set it to the provided iterations count in the config
        if daemon_mode == "True":
            logging.info("Daemon mode enabled, cerberus will monitor forever")
            logging.info("Ignoring the iterations set")
            iterations = float('inf')
        else:
            iterations = int(iterations)

        # Loop to run the components status checks starts here
        while iteration < iterations:
            iteration += 1
            print("\n")

            # Monitor nodes status
            if watch_nodes == "True":
                watch_nodes_status, failed_nodes = monitor_nodes()
                logging.info("Iteration %s: Node status: %s"
                             % (iteration, watch_nodes_status))
            else:
                logging.info("Cerberus is not monitoring nodes, "
                             "so setting the status to True and "
                             "assuming that the nodes are ready")
                watch_nodes_status, failed_nodes = True, []

            # Monitor etcd status
            if watch_etcd == "True":
                watch_etcd_status, failed_etcd_pods = \
                    monitor_namespace(etcd_namespace)
                logging.info("Iteration %s: Etcd member pods status: %s"
                             % (iteration, watch_etcd_status))
            else:
                logging.info("Cerberus is not monitoring ETCD, "
                             "so setting the status to True and "
                             "assuming that the ETCD member pods are ready")
                watch_etcd_status, failed_etcd_pods = True, []

            # Monitor openshift-apiserver status
            if watch_openshift_apiserver == "True":
                watch_openshift_apiserver_status, failed_ocp_apiserver_pods = \
                    monitor_namespace(openshift_apiserver_namespace)
                logging.info("Iteration %s: OpenShift apiserver status: %s"
                             % (iteration, watch_openshift_apiserver_status))
            else:
                logging.info("Cerberus is not monitoring openshift-apiserver, "
                             "so setting the status to True "
                             "and assuming that the "
                             "openshift-apiserver is up and running")
                watch_openshift_apiserver_status, failed_ocp_apiserver_pods = \
                    True, []

            # Monitor kube apiserver status
            if watch_kube_apiserver == "True":
                watch_kube_apiserver_status, failed_kube_apiserver_pods = \
                    monitor_namespace(kube_apiserver_namespace)
                logging.info("Iteration %s: Kube ApiServer status: %s"
                             % (iteration, watch_kube_apiserver_status))
            else:
                logging.info("Cerberus is not monitoring Kube ApiServer, so "
                             "setting the status to True and assuming that "
                             "the Kube ApiServer is up and running")
                watch_kube_apiserver_status, failed_kube_apiserver_pods = \
                    True, []

            # Monitor prometheus/monitoring stack
            if watch_monitoring_stack == "True":
                watch_monitoring_stack_status, failed_monitoring_stack = \
                    monitor_namespace(monitoring_stack_namespace)
                logging.info("Iteration %s: Monitoring stack status: %s"
                             % (iteration, watch_monitoring_stack_status))
            else:
                logging.info("Cerberus is not monitoring prometheus stack, "
                             "so setting the status to True "
                             "and assuming that the monitoring stack is "
                             "up and running")
                watch_monitoring_stack_status, failed_monitoring_stack = \
                    True, []

            # Monitor kube controller
            if watch_kube_controller == "True":
                watch_kube_controller_status, failed_kube_controller_pods = \
                    monitor_namespace(kube_controller_namespace)
                logging.info("Iteration %s: Kube controller status: %s"
                             % (iteration, watch_kube_controller_status))
            else:
                logging.info("Cerberus is not monitoring kube controller, so "
                             "setting the status to True and assuming that "
                             "the kube controller is up and running")
                watch_kube_controller_status, failed_kube_controller_pods = \
                    True, []

            # Monitor machine api components
            # Components include the operator, controller and auto scaler
            if watch_machine_api == "True":
                watch_machine_api_status, failed_machine_api_components = \
                    monitor_namespace(machine_api_namespace)
                logging.info("Iteration %s: Machine API components status: %s"
                             % (iteration, watch_machine_api_status))
            else:
                logging.info("Cerberus is not monitoring machine api "
                             "components, so setting the status to True and "
                             "assuming that it is up and running")
                watch_machine_api_status, failed_machine_api_components = \
                    True, []

            # Monitor kube scheduler
            if watch_kube_scheduler == "True":
                watch_kube_scheduler_status, failed_kube_scheduler_pods = \
                    monitor_namespace(kube_scheduler_namespace)
                logging.info("Iteration %s: Kube scheduler status: %s"
                             % (iteration, watch_kube_scheduler_status))
            else:
                logging.info("Cerberus is not monitoring kube scheduler, so "
                             "setting the status to True and assuming that "
                             "the kube scheduler is up and running")
                watch_kube_scheduler_status, failed_kube_scheduler_pods = \
                    True, []

            # Sleep for the specified duration
            logging.info("Sleeping for the specified duration: %s"
                         % (sleep_time))
            time.sleep(float(sleep_time))

            # Set the cerberus status by checking the status of the
            # watched components/resources for the http server to publish it
            if watch_nodes_status and watch_etcd_status \
                    and watch_openshift_apiserver_status \
                    and watch_kube_apiserver_status \
                    and watch_monitoring_stack_status \
                    and watch_kube_controller_status \
                    and watch_machine_api_status \
                    and watch_kube_scheduler_status:
                cerberus_status = True
            else:
                cerberus_status = False

            logging.info(
                "Failed nodes: %s\n"
                "Failed etcd pods: %s\n"
                "Failed openshift apiserver pods: %s\n"
                "Failed kube apiserver pods: %s\n"
                "Failed monitoring stack components: %s\n"
                "Failed kube controller pods: %s\n"
                "Failed machine api components: %s\n"
                "Failed kube scheduler pods: %s"
                % (failed_nodes, failed_etcd_pods, failed_ocp_apiserver_pods,
                   failed_kube_apiserver_pods, failed_monitoring_stack,
                   failed_kube_controller_pods, failed_machine_api_components,
                   failed_kube_scheduler_pods))

            if cerberus_publish_status == "True":
                publish_cerberus_status(cerberus_status)
        else:
            logging.info("Completed watching for the specified number of "
                         "iterations: %s" % (iterations))
    else:
        logging.error("Could not find a config at %s, please check" % (cfg))
        sys.exit(1)
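# A hypothetical config for the second cerberus main() above; the sections and
# options mirror the config.get() calls, and the values are illustrative. Note
# that the code compares the raw strings "True"/"False" rather than booleans.
import configparser

SAMPLE_CERBERUS_CFG = """
[cerberus]
watch_nodes = True
cerberus_publish_status = True
watch_etcd = True
etcd_namespace = openshift-etcd
watch_openshift_apiserver = True
openshift_apiserver_namespace = openshift-apiserver
watch_kube_apiserver = True
kube_apiserver_namespace = openshift-kube-apiserver
watch_monitoring_stack = True
monitoring_stack_namespace = openshift-monitoring
watch_kube_controller = True
kube_controller_namespace = openshift-kube-controller-manager
watch_machine_api_components = True
machine_api_namespace = openshift-machine-api
watch_kube_scheduler = True
kube_scheduler_namespace = openshift-kube-scheduler
kubeconfig_path = ~/.kube/config

[tunings]
iterations = 5
sleep_time = 60
daemon_mode = False
"""

parser = configparser.ConfigParser()
parser.read_string(SAMPLE_CERBERUS_CFG)
assert parser.get('tunings', 'daemon_mode') == 'False'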