def upgrade_tiller(namespace):
    """
    Updates the version of Tiller in a namespace to match the currently configured Helm client.
    An exception will be thrown if Tiller is not present.

    Args:
        namespace: The namespace of the Tiller deployment
    """
    # Check if Tiller is already at the correct version
    (rc, output) = baseutils.exe_cmd('{helm} version --tiller-namespace {namespace} --short'.format(
        helm=helm_binary, namespace=baseutils.shell_escape(namespace)))
    output = output.strip().splitlines()
    client_version = output[0].strip().split()[1]
    tiller_version = output[1].strip().split()[1]
    if client_version != tiller_version:
        deployment = k8s.get('deployment', namespace=namespace, name='tiller-deploy')
        pod_spec = deployment['spec']['template']['spec']
        service_account_name = pod_spec['serviceAccountName']
        container_spec = pod_spec['containers'][0]
        override = None
        if 'command' in container_spec:
            override = '"spec.template.spec.containers[0].command"="{{{{{command}}}}}"'.format(
                command=','.join(container_spec['command']))
        baseutils.exe_cmd('{helm} init --history-max 20 --tiller-namespace {namespace} --service-account {service_account_name} {override} --upgrade'.format(
            helm=helm_binary,
            namespace=baseutils.shell_escape(namespace),
            service_account_name=baseutils.shell_escape(service_account_name),
            override='--override {override}'.format(override=baseutils.shell_escape(override)) if override else ''))
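# Illustrative usage sketch (not part of the original module); assumes Tiller is already deployed
# in the given namespace and the Helm client binary has been installed via install_helm():
#
#     upgrade_tiller('kube-system')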
def import_certificates(release_name):
    """
    Triggers the import of certificates from Certificate Manager into the Kubernetes cluster if the correct annotations are found on ingress resources.
    Specifically, ingress resources may have an annotation with key "p2paas-certificate" and the value equal to the certificate's CRN in Certificate Manager.
    The secret created in Kubernetes will be named "{namespace-of-ingress}.{name-of-ingress}".

    Args:
        release_name: The name of the release to check for certificate annotations
    """
    cluster_info = None
    manifest = helm.get_manifest(release_name)
    for resource in manifest:
        kind = resource['kind'].lower()
        if kind == 'ingress':
            annotations = resource['metadata'].get('annotations', {})
            certificate_crn = annotations.get('p2paas-certificate')
            if certificate_crn:
                if not cluster_info:
                    cluster_info = json.loads(k8s.get('configmap', 'kube-system', 'cluster-info')['data']['cluster-config.json'])
                certificate_secret_name = '{namespace}.{name}'.format(
                    namespace=resource['metadata'].get('namespace', 'default'),
                    name=resource['metadata']['name'])
                logger.info('Importing certificate "{crn}" as secret "{secret}"'.format(
                    crn=certificate_crn, secret=certificate_secret_name))
                ibmcloud.ks_alb_cert_deploy(cluster_info['cluster_id'], certificate_secret_name, certificate_crn)
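# Illustrative usage sketch (not part of the original module); 'my-release' is a placeholder for a
# deployed release whose ingresses may carry the "p2paas-certificate" annotation:
#
#     import_certificates('my-release')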
def _get_current_iks_ovpn_config_name(cluster_name):
    """
    Retrieves the name of the ovpn config currently associated with an IKS cluster.

    Args:
        cluster_name: The name of the cluster to retrieve the ovpn config name for
    Returns:
        The name of the config if a reserved config exists, otherwise None
    """
    iks_ovpn_config_name = None
    if k8s.exists('secret', 'ibm-services-system', 'sos-vpn-secret'):
        vpn_secret = k8s.get('secret', 'ibm-services-system', 'sos-vpn-secret')
        for key in vpn_secret['data']:
            if key.endswith('.ovpn'):
                iks_ovpn_config_name = key
                with baseutils.local_lock(lock_name=iks_ovpn_reservation_lock_name):
                    # Ensure the reservation system is in-sync with the current state of the IKS cluster
                    vault_reservation_path = '{parent}/reservations/{config_name}'.format(
                        parent=vault_iks_ovpn_path, config_name=iks_ovpn_config_name)
                    current_reservation_owner = vault.read(vault_reservation_path, property='cluster')
                    if current_reservation_owner:
                        if current_reservation_owner != cluster_name:
                            raise Exception('Cluster is using an ovpn config reserved by a different cluster')
                    else:
                        vault.write(vault_reservation_path, {'cluster': cluster_name})
                break
    return iks_ovpn_config_name
def set_cluster_autoscaler(enabled, worker_pool_names=None, new_worker_pool_names=None):
    """
    Enables or disables the cluster autoscaler in a cluster.
    This will neither install nor uninstall the autoscaler, merely update the configuration of the autoscaler if present.
    If the autoscaler is installed but a given worker pool is not already present in the autoscaler config, it will not be added.

    Args:
        enabled: Whether to enable or disable the cluster autoscaler. True = enable, False = disable
        worker_pool_names: If present, only the passed list of pools will be enabled/disabled (Optional, default: all worker pools currently configured)
        new_worker_pool_names: If worker_pool_names is also specified, element n in worker_pool_names will be renamed to element n in new_worker_pool_names.
            Each element in worker_pool_names must have a corresponding entry in new_worker_pool_names at the same index (Optional)
    Returns:
        A list of the worker pools that had their configuration changed
    """
    modified_pools = []
    if k8s.exists('configmap', 'kube-system', 'iks-ca-configmap'):
        config_map = k8s.get('configmap', 'kube-system', 'iks-ca-configmap')
        worker_pools_config = json.loads(config_map['data']['workerPoolsConfig.json'])
        rename_worker_pools = new_worker_pool_names and worker_pool_names and len(new_worker_pool_names) == len(worker_pool_names)
        for pool_config in worker_pools_config:
            if not worker_pool_names or pool_config['name'] in worker_pool_names:
                if rename_worker_pools:
                    pool_config['name'] = new_worker_pool_names[worker_pool_names.index(pool_config['name'])]
                    pool_config['enabled'] = enabled
                    modified_pools.append(pool_config['name'])
                elif pool_config['enabled'] != enabled:
                    pool_config['enabled'] = enabled
                    modified_pools.append(pool_config['name'])
        if modified_pools:
            config_map['data']['workerPoolsConfig.json'] = json.dumps(worker_pools_config, ensure_ascii=False)  # TODO: Remove ensure_ascii when migration to py3 is complete
            k8s.apply(config_map)
    else:
        logger.info('Cluster autoscaler is not present')
    return modified_pools
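# Illustrative usage sketch (not part of the original module); the pool names below are placeholders.
# Disable autoscaling for specific pools before maintenance and re-enable the same pools afterwards:
#
#     paused_pools = set_cluster_autoscaler(False, worker_pool_names=['default', 'edge'])
#     ...  # perform maintenance
#     set_cluster_autoscaler(True, worker_pool_names=paused_pools)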
def wait_for_node_namespace_pods(node, namespace):
    """
    Waits until the pods in a particular namespace on a specified node are Running.
    This can be useful when performing node maintenance when some namespaces do not utilise PodDisruptionBudgets.
    This function continually polls for the status of pods in the namespace assigned to a given node and returns when they are all Running or Completed.
    If the node is scaled out during the process, the function will return without error.

    Args:
        node: The name of the node to wait on. This is generally the IP of the node
        namespace: The namespace to poll
    """
    pods_ready = False
    while not pods_ready:
        pods_ready = True
        time.sleep(15)
        pods = k8s.get('pod', namespace=namespace)
        for pod in pods:
            pod_status = pod['status']
            if pod_status.get('hostIP') == node:  # We are not checking pods with an empty hostIP as we can't tell if they are actually waiting for the current host
                # Pod is located on the host we are monitoring
                pod_phase = pod_status['phase']
                if pod_phase != 'Succeeded':  # A succeeded pod is successfully complete
                    if pod_phase == 'Running':
                        for container_status in pod_status['containerStatuses']:
                            if not container_status['ready']:
                                pods_ready = False
                                break
                    else:  # A non-running, non-succeeded pod is not ready
                        pods_ready = False
                        break
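# Illustrative usage sketch (not part of the original module); the node IP and namespace are placeholders.
# Typically called after reloading a node to block until that node's pods have settled:
#
#     wait_for_node_namespace_pods('10.123.45.67', 'kube-system')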
def get_tiller_version():
    """
    Retrieves the current version of Tiller in the environment.
    If Tiller is not present, an exception will be triggered by the underlying Kubernetes apis.

    Returns:
        The installed version of Tiller
    """
    tiller_deployment = k8s.get('deployment', namespace='kube-system', name='tiller-deploy')
    return tiller_deployment['spec']['template']['spec']['containers'][0]['image'].split(':')[-1]
def _check_for_resource_pod_errors(kind, namespace, name):
    """
    Checks if pods from a resource enter an error state. If an error state is detected, an exception is raised.

    Args:
        kind: The kind of the resource owning the pods
        namespace: The namespace of the resource owning the pods
        name: The name of the resource owning the pods
    """
    live_resource = k8s.get(kind, namespace=namespace, name=name)
    # Evaluate pods specific to the new converging state only. Do not include pods that may have been in a bad state pre-upgrade
    if kind == 'deployment':
        replica_sets = k8s.get('replicaset', namespace=namespace, labels=live_resource['spec']['selector']['matchLabels'])
        for replica_set in replica_sets:
            if live_resource['metadata']['annotations']['deployment.kubernetes.io/revision'] == replica_set['metadata']['annotations']['deployment.kubernetes.io/revision']:
                match_labels = replica_set['spec']['selector']['matchLabels']
                break
    elif kind == 'daemonset':
        match_labels = live_resource['spec']['selector']['matchLabels']
        match_labels['pod-template-generation'] = live_resource['metadata']['generation']
    elif kind == 'statefulset':
        match_labels = live_resource['spec']['selector']['matchLabels']
        match_labels['controller-revision-hash'] = live_resource['status']['updateRevision']
    pods = k8s.get('pod', namespace=namespace, labels=match_labels)
    for pod in pods:
        pod_name = pod['metadata']['name']
        for container_status in pod['status'].get('containerStatuses', []):
            wait_reason = container_status['state'].get('waiting', {}).get('reason')
            if wait_reason and (wait_reason in ('CrashLoopBackOff', 'ErrImagePull', 'InvalidImageName', 'RunContainerError', 'ImagePullBackOff')
                                or wait_reason.endswith(' not found') or wait_reason.startswith('Couldn\'t find ')):
                # A failed container has been identified. Log useful information before raising an exception
                k8s.describe('pod', namespace=namespace, name=pod_name)
                k8s.logs(pod_name, namespace=namespace, container=container_status['name'])
                raise Exception('The pod {pod} has entered the failed waiting state "{state}" during chart upgrade. {message}'.format(
                    pod=pod_name, state=wait_reason, message=container_status['state']['waiting'].get('message', '')))
            if container_status.get('restartCount', 0) > 0:
                k8s.describe('pod', namespace=namespace, name=pod_name)
                k8s.logs(pod_name, namespace=namespace, container=container_status['name'])
                raise Exception('The pod {pod} has restarted (failed) during chart upgrade'.format(pod=pod_name))
def wait_for_release_resources(release):
    """
    Waits until a release's resources are all initialised.
    For resources with replicas, this means all pods must be started and passing their probes.
    For LoadBalancer services, this means waiting until the ingresses are initialised.
    If this is an upgrade of a release with replicas, the pre-upgrade state of the pods is required to know when they have been replaced.
    Pods will only be waited upon if their Deployment or StatefulSet is configured for RollingUpdate and either their image tag or env vars have been updated.

    Args:
        release: The name of the release to wait on
    """
    manifest = get_manifest(release)
    time.sleep(2)  # Wait to ensure new replica values have been rolled out from the manifest to the deployed resources
    max_replicas = get_max_replicas_count_in_manifest(manifest)
    timeout_value = max(900, (max_replicas * 300) + 60)  # Minimum timeout value is 900 seconds. Otherwise, base it on the max number of replicas of any resource
    logger.info('Timeout period set to {timeout} seconds for release "{release}"'.format(timeout=timeout_value, release=release))
    with baseutils.timeout(seconds=timeout_value):
        logger.info('Waiting for resources in release "{release}" to enter ready state'.format(release=release))
        for resource in manifest:
            kind = resource['kind'].lower()
            name = resource['metadata']['name']
            namespace = resource['metadata'].get('namespace')
            if kind in ['deployment', 'daemonset', 'statefulset']:
                logger.info('Tracking rollout status of "{kind}" "{name}"'.format(kind=kind, name=name))
                rollout_status = ''
                try:
                    while ('rolling update complete' not in rollout_status and 'successfully rolled out' not in rollout_status
                           and 'roll out complete' not in rollout_status):
                        time.sleep(5)
                        rollout_status = k8s.rollout_status(kind, name, namespace=namespace)
                        _check_for_resource_pod_errors(kind, namespace, name)
                    logger.info('Pods for "{kind}" "{name}" have been rolled out'.format(kind=kind, name=name))
                except Exception as e:
                    if 'Status is available only for RollingUpdate strategy type' in str(e):
                        logger.info('"{kind}" "{name}" is not configured for rolling updates'.format(kind=kind, name=name))
                    else:
                        raise
            elif kind == 'service' and resource['spec'].get('type') == 'LoadBalancer':
                resource_ready = False
                while not resource_ready:
                    time.sleep(5)
                    live_resource = k8s.get(kind, namespace=namespace, name=name)
                    ingress = live_resource['status']['loadBalancer'].get('ingress')
                    if ingress and 'ip' in ingress[0] and 'clusterIP' in live_resource['spec']:
                        resource_ready = True
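# Illustrative usage sketch (not part of the original module); assumes a release named 'my-release'
# has just been installed or upgraded and should be blocked on until its resources are ready:
#
#     wait_for_release_resources('my-release')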
def test_get(self, mock_exe_cmd):
    mock_exe_cmd.return_value = (0, '{"items": [], "kind": "List"}')
    self.assertEqual([], k8s.get('clusterrolebinding'))
    self.assertEqual([], k8s.get('deployment', namespace='kube-config'))
    self.assertEqual([], k8s.get('pod', namespace='all'))
    mock_exe_cmd.return_value = (0, '{}')
    self.assertEqual({}, k8s.get('serviceaccount', name='name'))
    self.assertEqual({}, k8s.get('pod', name='name', namespace='kube-config'))
    self.assertEqual({}, k8s.get('node', name='name', labels='dedicated=edge'))
    self.assertEqual({}, k8s.get('node', name='name', labels={'dedicated': 'edge', 'key2': 'label2'}))
    mock_exe_cmd.return_value = (0, pod_output)
    self.assertEqual(json.loads(pod_output), k8s.get('Pod', name='tiller-deploy-5c477df6bf-rsjhp'))
def set_recovery_tool(enabled):
    """
    Sets the ibm-worker-recovery tool's "Enabled" attribute, which governs whether it monitors nodes for failures.
    This can be used to disable the tool when performing maintenance on IKS nodes.
    Otherwise an intermediate state of a node could trigger the tool to queue up a reload.
    If the recovery tool is not present in an environment, nothing will be done.
    Currently only KUBEAPI checks are enabled/disabled.

    Args:
        enabled: Boolean value to define if the ibm-worker-recovery tool should be enabled
    """
    if k8s.exists('configmap', 'kube-system', 'ibm-worker-recovery-checks'):
        config_map = k8s.get('configmap', 'kube-system', 'ibm-worker-recovery-checks')
        for check in config_map['data']:
            check_config = json.loads(config_map['data'][check])
            if check_config['Check'] == 'KUBEAPI':
                check_config['Enabled'] = enabled
                config_map['data'][check] = json.dumps(check_config, ensure_ascii=False)  # TODO: Remove ensure_ascii when migration to py3 is complete
        k8s.apply(config_map)
    else:
        logger.info('IBM Auto-Recovery tool is not present')
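# Illustrative usage sketch (not part of the original module); wraps node maintenance so that the
# Auto-Recovery tool does not react to an intentionally disrupted node (see also wait_for_node_namespace_pods):
#
#     set_recovery_tool(False)
#     ...  # reload or maintain the node
#     set_recovery_tool(True)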
def run_release_tests(release_name):
    """
    Runs post-deployment Helm tests if the chart contains Helm tests. Nothing will be done if the release does not contain tests.
    Pre-existing test pods will be cleaned up prior to executing the tests.
    An exception is raised if the tests fail. Logs of the test containers are captured.

    Args:
        release_name: The name of the release to test
    """
    release_tests = helm.get_hooks(release_name, resource_types=['Pod'], hook_types=['test-success', 'test-failure'])
    if release_tests:
        for release_test in release_tests:
            test_kind = release_test['kind']
            test_namespace = release_test['metadata'].get('namespace', 'default')
            test_name = release_test['metadata']['name']
            if k8s.exists(test_kind, test_namespace, test_name):
                k8s.delete(test_kind, test_namespace, test_name)
        try:
            helm.test(release_name)
        finally:
            # Grab the logs of any (failed) tests
            for release_test in release_tests:
                test_kind = release_test['kind']
                test_namespace = release_test['metadata'].get('namespace', 'default')
                test_name = release_test['metadata']['name']
                if k8s.exists(test_kind, test_namespace, test_name):
                    test_pod = k8s.get(test_kind, namespace=test_namespace, name=test_name)
                    for test_container in test_pod['spec']['containers']:
                        test_container_name = test_container['name']
                        logger.info('Logs for test pod "{pod}" container "{container}":'.format(
                            pod=test_name, container=test_container_name))
                        k8s.logs(test_name, namespace=test_namespace, container=test_container_name)
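# Illustrative usage sketch (not part of the original module); 'my-release' is a placeholder.
# Run the chart's Helm tests once the release's resources have rolled out:
#
#     run_release_tests('my-release')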
def install_helm(helm_version):
    """
    Install Helm and Tiller into the Kubernetes infrastructure. This assumes Tiller is to be installed in the kube-system namespace.
    It will upgrade Tiller if it is already present.
    It is safe to call this function multiple times. There are checks for understanding the current state of the Helm/Tiller deployment and only necessary updates are made.

    Args:
        helm_version: The version of helm that should be installed, eg: v2.11.1
    """
    # First check and ensure that the correct client version is present
    (rc, output) = baseutils.exe_cmd('{helm} version --client'.format(helm=helm_binary), raise_exception=False, log_level=logging.NOTSET)
    if rc or helm_version not in output:
        tmp_dir = tempfile.mkdtemp()
        try:
            helm_tar = baseutils.shell_escape(os.path.join(tmp_dir, 'helm.tar.gz'))
            baseutils.exe_cmd('/usr/bin/curl -L {url} -o {helm_tar}'.format(
                url=baseutils.shell_escape('https://storage.googleapis.com/kubernetes-helm/helm-{version}-linux-amd64.tar.gz'.format(version=helm_version)),
                helm_tar=helm_tar))
            baseutils.exe_cmd('/bin/tar -xzvf {helm_tar} -C {tmp_dir} && rm -f {helm_tar}'.format(
                helm_tar=helm_tar, tmp_dir=baseutils.shell_escape(tmp_dir)))
            os.rename(os.path.join(tmp_dir, 'linux-amd64', 'helm'), helm_binary.strip('\''))
            os.chmod(helm_binary.strip('\''), 0o755)
        finally:
            shutil.rmtree(tmp_dir)
    # Secondly, check that the correct version of Tiller is installed into the Kubernetes cluster
    (rc, output) = baseutils.exe_cmd('{helm} version'.format(helm=helm_binary), raise_exception=False, log_level=logging.NOTSET)
    if rc:
        # Tiller is not installed. We must check if the service account exists yet
        service_accounts = k8s.get('serviceaccount', namespace='kube-system')
        if 'tiller' not in [service_account['metadata']['name'] for service_account in service_accounts]:
            k8s.apply({
                'apiVersion': 'v1',
                'kind': 'ServiceAccount',
                'metadata': {
                    'name': 'tiller',
                    'namespace': 'kube-system'
                }
            })
        cluster_role_bindings = k8s.get('clusterrolebinding')
        if 'tiller' not in [cluster_role_binding['metadata']['name'] for cluster_role_binding in cluster_role_bindings]:
            k8s.apply({
                'apiVersion': 'rbac.authorization.k8s.io/v1',
                'kind': 'ClusterRoleBinding',
                'metadata': {
                    'name': 'tiller'
                },
                'roleRef': {
                    'apiGroup': 'rbac.authorization.k8s.io',
                    'kind': 'ClusterRole',
                    'name': 'cluster-admin'
                },
                'subjects': [{
                    'kind': 'ServiceAccount',
                    'name': 'tiller',
                    'namespace': 'kube-system'
                }]
            })
        baseutils.exe_cmd('{helm} init --history-max 20 --service-account tiller --override "spec.template.spec.containers[0].command"="{{/tiller,--storage=secret}}"'.format(helm=helm_binary))
    elif output.count(helm_version) != 2:
        # Tiller is installed but it is an old version. Upgrade it
        baseutils.exe_cmd('{helm} init --history-max 20 --service-account tiller --override "spec.template.spec.containers[0].command"="{{/tiller,--storage=secret}}" --upgrade'.format(helm=helm_binary))
    else:
        # Tiller is correctly configured. We still need to init the client to facilitate the usage of helm repositories
        baseutils.exe_cmd('{helm} init --client-only'.format(helm=helm_binary))
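# Illustrative usage sketch (not part of the original module); the version string is a placeholder
# Helm v2 release, matching the example given in the docstring:
#
#     install_helm('v2.11.1')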