def update_retention_check_interval(k8s_client, kafka_name, namespace="default", interval=1000):
    custom_object_client = kubernetes.client.CustomObjectsApi(k8s_client)
    body = {"spec": {"kafka": {"config": {"log.retention.check.interval.ms": interval}}}}
    custom_object_client.patch_namespaced_custom_object(
        namespace=namespace, group='kafka.strimzi.io', version='v1beta1',
        plural='kafkas', name=kafka_name, body=body)
    waiter.wait_for_predicate(
        lambda: kubectl.is_stateful_set_ready(k8s_client, f"{kafka_name}-kafka", namespace=namespace),
        timeout=60)
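# Illustrative usage sketch (added, not part of the original module): "my-kafka" and the
# "kafka" namespace are placeholder values; new_client_from_config() is the standard
# kubernetes-python client factory.
def _example_update_retention_check_interval():
    k8s_client = kubernetes.config.new_client_from_config()
    update_retention_check_interval(k8s_client, "my-kafka", namespace="kafka", interval=5000)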
def _expose(self):
    # Skip if kafka is already exposed
    if self._is_exposed:
        return
    logging.debug("Exposing kafka cluster")
    custom_object_client = kubernetes.client.CustomObjectsApi(self._cluster.Kubectl.client())
    kafka_spec = custom_object_client.get_namespaced_custom_object(
        namespace=self._namespace, group='kafka.strimzi.io', version='v1beta1',
        plural='kafkas', name=self._name)['spec']
    # Advertise every broker on the master's IP through an external nodeport listener
    advertised_brokers = {'brokers': []}
    for i in range(kafka_spec['kafka']['replicas']):
        advertised_brokers['brokers'].append({'broker': i, 'advertisedHost': self._master.ip})
    kafka_spec['kafka']['listeners']['external'] = {
        'type': 'nodeport', 'tls': False, 'overrides': advertised_brokers}
    pods_timestamps = self._brokers_state()
    custom_object_client.patch_namespaced_custom_object(
        namespace=self._namespace, group='kafka.strimzi.io', version='v1beta1',
        plural='kafkas', name=self._name, body={'spec': kafka_spec})
    logging.debug("Waiting for kafka brokers to restart")
    waiter.wait_for_predicate(lambda: self._kafka_brokers_restarted(pods_timestamps) is True, timeout=30)
    waiter.wait_for_predicate(lambda: self._is_running is True, timeout=30)
def delete_stateful_set_data(client, name, namespace='default', clear_data=False, timeout=60):
    v1_app = kubernetes.client.AppsV1Api(client)
    sts_spec = v1_app.read_namespaced_stateful_set(name=name, namespace=namespace).spec
    replicas = sts_spec.replicas
    scale_stateful_set(client, 0, name, namespace)
    # PVC names follow the <template>-<statefulset>-<ordinal> convention
    claim_templates = [volume.metadata.name for volume in sts_spec.volume_claim_templates]
    pvcs_to_delete = []
    for template in claim_templates:
        for i in range(replicas):
            pvcs_to_delete.append(f"{template}-{name}-{i}")
    for pvc in pvcs_to_delete:
        delete_pvc(client, pvc, namespace, clear_data)
    scale_stateful_set(client, replicas, name, namespace, timeout=timeout)
    waiter.wait_for_predicate(lambda: is_stateful_set_ready(client, name, namespace=namespace),
                              timeout=timeout)
def run(self):
    self.kill()
    self.check_for_legacy_containers()
    ssh_direct = self._ssh_direct
    self.docker.login()
    logging.debug("running docker")
    run_cmd = f'{self._docker_bin_path} run -d --rm ' \
              '--volume=/tmp/automation_infra/:/tmp/automation_infra ' \
              '--volume=/etc/hosts:/etc/hosts ' \
              '--volume=/var/log/journal:/var/log/journal ' \
              '--volume=/storage/logs:/storage/logs ' \
              '--privileged ' \
              '--network=host ' \
              f'--name=automation_proxy gcr.io/anyvision-training/automation-proxy:{self._automation_proxy_version()}'
    try:
        ssh_direct.execute(run_cmd)
        waiter.wait_for_predicate(lambda: self.running)
    except SSHCalledProcessError as e:
        if "endpoint with name automation_proxy already exists in network host" in e.stderr:
            # A stale endpoint is left over from a previous run; detach it and retry once
            ssh_direct.execute(f"{self._docker_bin_path} network disconnect --force host automation_proxy")
            ssh_direct.execute(run_cmd)
        elif f"manifest for gcr.io/anyvision-training/automation-proxy:{self._automation_proxy_version()} not found" in e.stderr:
            logging.error(f"tag {self._automation_proxy_version()} was not pushed to gcr, "
                          f"please run make push-automation-proxy from devops-infra repo")
            raise
        else:
            raise
    logging.debug("docker is running")
def kill(self):
    if not self.running:
        logging.debug("nothing to remove")
        return
    logging.debug("trying to remove docker container")
    self._ssh_direct.execute(f"{self._docker_bin_path} kill automation_proxy")
    waiter.wait_for_predicate(lambda: not self.running)
    logging.debug("removed successfully!")
def scale_deployment(client, replicas, name, namespace='default'):
    v1 = kubernetes.client.AppsV1Api(client)
    v1.patch_namespaced_deployment_scale(name=name, namespace=namespace,
                                         body={'spec': {'replicas': replicas}})
    waiter.wait_for_predicate(
        lambda: v1.read_namespaced_deployment_scale(name=name, namespace=namespace).status.replicas == replicas,
        timeout=30)
def clear_topic(admin, consumer, name):
    # Temporarily shrink retention so the broker purges the topic's existing messages
    retention = get_topic_config_value(admin, name, 'retention.ms')
    update_topic_config(admin, name, {"retention.ms": 1000})
    consumer.subscribe(name)
    # Lazily initialize the topic assignment
    consumer.topics()
    waiter.wait_for_predicate(
        lambda: consumer.beginning_offsets(consumer.assignment()) ==
                consumer.end_offsets(consumer.assignment()))
    # Restore the original retention once the topic has been drained
    update_topic_config(admin, name, {"retention.ms": retention})
    consumer.close()
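# Illustrative usage sketch (added, not part of the original module): assumes kafka-python
# style clients and a broker at localhost:9092; the consumer calls used by clear_topic
# (topics/assignment/beginning_offsets/end_offsets) match kafka.KafkaConsumer.
def _example_clear_topic():
    import kafka
    admin = kafka.KafkaAdminClient(bootstrap_servers="localhost:9092")
    consumer = kafka.KafkaConsumer(bootstrap_servers="localhost:9092", group_id="topic-cleaner")
    clear_topic(admin, consumer, "my-topic")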
def kill(self):
    if not self.running:
        logging.debug("nothing to remove")
        return
    logging.debug("trying to remove automation-proxy daemonset")
    try:
        self._k8s_v1_client.delete_namespaced_daemon_set(name=self.daemon_set_name, namespace='default')
    except ApiException as e:
        logging.exception("Exception when calling AppsV1Api->delete_namespaced_daemon_set: %s\n" % e)
    waiter.wait_for_predicate(lambda: not self.running)
    for host in self._cluster.hosts.values():
        host.TunnelManager.clear()
    logging.debug("removed successfully!")
def setup_cluster(cluster, request):
    for host_name, config in request.function.__hardware_reqs.items():
        host = dict(cluster.hosts.items())[host_name]
        host.k3s_config = config['k3s_config']
        host.internal_ip = host.SshDirect.execute("hostname -I | awk '{print $1}'").strip()
    logging.info("Setting up k3s cluster")
    hosts = list(cluster.hosts.values())
    masters = [host for host in hosts if host.k3s_config["role"] == "master"]
    if not masters:
        raise Exception("Couldn't find any master node")
    main_master = next(iter(masters))
    main_master.k8s_name = "k3s-master"
    # Restore the main master from the pre-built snapshot, then reinstall k3s
    main_master.SshDirect.execute(
        "curl -sfL https://get.k3s.io | sh -s - --cluster-init --cluster-reset "
        "--cluster-reset-restore-path=/root/k3s-infra-1174-snapshot")
    waiter.wait_nothrow(lambda: main_master.SshDirect.execute(
        "journalctl --since='1 min ago' | grep 'restart without'"))
    main_master.SshDirect.execute(
        "curl -sfL https://get.k3s.io | sh -s - --node-name=k3s-master "
        "--disable='servicelb,traefik,local-storage,metrics-server'")
    main_master.SshDirect.execute("sudo chmod o+r /etc/rancher/k3s/k3s.yaml")
    cluster_token = main_master.SshDirect.execute("sudo cat /var/lib/rancher/k3s/server/token").strip()
    cluster_ip = main_master.SshDirect.execute("hostname -I").strip()
    waiter.wait_nothrow(lambda: main_master.SshDirect.execute("kubectl get nodes"))
    # Join the remaining masters and agents concurrently
    nodes = [host for host in hosts if host.k3s_config['role'] == "node"]
    masters.remove(main_master)
    jobs = {}
    nodes_jobs = {f"{host.alias}": partial(_join_agent, host, cluster_ip, cluster_token) for host in nodes}
    masters_jobs = {f"{master.alias}": partial(_join_master, master, cluster_ip, cluster_token) for master in masters}
    jobs.update(nodes_jobs)
    jobs.update(masters_jobs)
    if jobs:
        concurrently.run(jobs)
    logging.info("Waiting for cluster to be Ready...")
    k8s_client = cluster.Kubectl.client()
    v1 = kubernetes.client.CoreV1Api(k8s_client)
    waiter.wait_for_predicate(lambda: len(v1.list_node().items) == len(hosts), timeout=30)
    logging.info(f"Number of nodes in cluster: {len(v1.list_node().items)}")
    waiter.wait_for_predicate(lambda: kubectl.is_cluster_ready(k8s_client), timeout=60)
    logging.info("Adding node labels and taints")
    _label_and_taint_nodes(k8s_client, hosts)
def _create_service_account(self):
    if self._api_token:
        return
    ssh = self._master.SshDirect
    try:
        ssh.execute("sudo kubectl create sa automation-admin")
        ssh.execute(
            "sudo kubectl create clusterrolebinding automation-admin "
            "--serviceaccount=default:automation-admin --clusterrole=cluster-admin")
    except SSHCalledProcessError:
        # The service account and binding may already exist
        pass
    get_sa_token = lambda: ssh.execute(
        '''sudo kubectl get secrets -n default -o jsonpath="{.items[?(@.metadata.annotations['kubernetes\.io/service-account\.name']=='automation-admin')].data.token}"|base64 --decode'''
    ).strip()
    waiter.wait_for_predicate(get_sa_token, timeout=30)
    self._api_token = get_sa_token()
def recycle_pvc(client, pvc_name, namespace='default', timeout=60):
    k8s_client = kubernetes.client
    v1 = k8s_client.CoreV1Api(client)
    try:
        v1.read_namespaced_persistent_volume_claim(name=pvc_name, namespace=namespace)
    except ApiException as e:
        if e.status == 404:
            raise ApiException(f"Couldn't find pvc {pvc_name} in namespace {namespace}")
    # Run a short-lived busybox pod that mounts the PVC and wipes its contents
    container = k8s_client.V1Container(
        name="pv-cleaner",
        command=["/bin/sh", "-c", "rm -rf /scrub/*"],
        image="k8s.gcr.io/busybox",
        volume_mounts=[k8s_client.V1VolumeMount(name="pvc-volume", mount_path="/scrub")])
    volume = k8s_client.V1Volume(
        name="pvc-volume",
        persistent_volume_claim=k8s_client.V1PersistentVolumeClaimVolumeSource(claim_name=pvc_name))
    pod_spec = k8s_client.V1PodSpec(volumes=[volume], containers=[container], restart_policy="Never")
    pod_name = f"pv-cleaner-{str(uuid.uuid4())[:6]}"
    pod = k8s_client.V1Pod(metadata=k8s_client.V1ObjectMeta(name=pod_name), spec=pod_spec)
    v1.create_namespaced_pod(namespace=namespace, body=pod)
    try:
        waiter.wait_for_predicate(
            lambda: v1.read_namespaced_pod(name=pod_name, namespace=namespace).status.phase == "Succeeded",
            timeout=timeout)
    except TimeoutError as e:
        logging.debug(v1.read_namespaced_pod(name=pod_name, namespace=namespace).status)
        raise e
    v1.delete_namespaced_pod(name=pod_name, namespace=namespace)
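# Illustrative usage sketch (added, not part of the original module): "data-kafka-0" and
# the "kafka" namespace are placeholder values; new_client_from_config() is the standard
# kubernetes-python client factory.
def _example_recycle_pvc():
    client = kubernetes.config.new_client_from_config()
    recycle_pvc(client, "data-kafka-0", namespace="kafka", timeout=120)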
def run(self):
    self.kill()
    logging.debug("Deploying automation-proxy DaemonSet")
    kubectl.create_image_pull_secret(self._cluster.Kubectl.client())
    with open(os.path.join(os.path.dirname(os.path.abspath(__file__)),
                           "../../proxy_container/daemonset.yaml")) as f:
        ds_yaml = yaml.safe_load(f)
    ds_yaml['spec']['template']['spec']['containers'][0]['image'] = \
        f'gcr.io/anyvision-training/automation-proxy:{self._automation_proxy_version()}'
    try:
        res = self._k8s_v1_client.create_namespaced_daemon_set(namespace="default", body=ds_yaml)
    except ApiException as e:
        logging.exception("Exception when calling AppsV1Api->create_namespaced_daemon_set: %s\n" % e)
        raise
    waiter.wait_for_predicate(
        lambda: self._num_ready_pods() == len(self._cluster.hosts), timeout=120)
    logging.debug(f"DaemonSet created. name={res.metadata.name}")
def delete_app_data(self, name, label_value=None, label_name="app", resource_type="statefulset"):
    label_value = label_value or name
    logging.debug(f"get {name} {resource_type} pods")
    pod_list = self.get_pods_using_selector_labels(label_name=label_name, label_value=label_value)['items']
    num_of_pods = len(pod_list)
    if num_of_pods == 0:
        raise Exception(f"unable to find {name} {resource_type} pods")
    pvc_list = []
    pv_list = []
    for pod in pod_list:
        pod_name = pod["metadata"]["name"]
        logging.debug(f"get pvc name from {name} pod")
        pvc_name = self.get_pvc_by_pod_name(pod_name)
        pvc_list.append(pvc_name)
        logging.debug(f"get pv name from {pvc_name} pvc")
        pv_name = self.get_pv_by_pvc_name(pvc_name)
        pv_list.append(pv_name)
    for pv in pv_list:
        logging.debug(f"set reclaim policy \"Delete\" to {pv} pv")
        self.set_pv_reclaim_policy(pv, "Delete")
    logging.debug(f"scale down {resource_type}: {name}")
    self.scale(name, resource_type, replicas=0)
    self.delete_pod_by_label(label_value, label_name, "true", 0)
    wait_for_predicate(lambda: self.num_of_pod_replicas(name, resource_type) == 0, 120)
    for pvc in pvc_list:
        logging.debug(f"delete {pvc} pvc")
        self.delete_pvc(pvc)
    logging.debug(f"scale up {resource_type} {name}")
    self.scale(name, resource_type, replicas=num_of_pods)
    wait_for_predicate_nothrow(
        lambda: self.num_of_ready_pod_replicas(name, resource_type) == num_of_pods, 180)
def test_cluster_network_master_restart(
        base_config, clean_up_all_deployments_and_svcs, amount_of_replicas=100,
        docker_image_name='gcr.io/hello-minikube-zero-install/hello-node',
        deployment_name="test"):
    # The clean_up_all_deployments_and_svcs fixture cleans up before the test starts
    base_config.hosts.host1.SshDirect.connect(timeout=60)
    create_deployment_with_replicas(base_config.hosts.host1, deployment_name,
                                    docker_image_name, amount_of_replicas)
    base_config.hosts.host1.Power.reboot()
    # Check the host has started again
    wait_for_predicate_nothrow(lambda: host_is_active(base_config.hosts.host1.ip), timeout=60)
    base_config.hosts.host1.SshDirect.connect(timeout=60)
    wait_for_predicate(lambda: base_config.hosts.host1.Gravity.is_cluster_healthy(),
                       timeout=120, interval=5)
    wait_for_predicate_nothrow(
        lambda: all_deployments_pods_alive(base_config.hosts.host1, deployment_name),
        timeout=300, interval=10)
def wait_container_health_status(self, name_regex, status, timeout=100):
    waiter.wait_for_predicate(
        lambda: self.get_container_health_status(name_regex) == status, timeout=timeout)
def wait_for_job_to_succeed(client, job_name, namespace='default', timeout=60):
    waiter.wait_for_predicate(
        lambda: get_job_status(client, namespace=namespace, job_name=job_name).succeeded == 1,
        timeout=timeout)
def create_deployment_with_replicas(host, name, docker_image, amount_of_replicas):
    wait_for_predicate(lambda: host.K8s.create_deployment(name, docker_image), timeout=120)
    host.K8s.scale_deployment(name, int(amount_of_replicas))
    host.K8s.expose_deployment(name)
def reboot(self, options=""):
    # Reboot the host, then verify via ping that it has actually gone down
    host = self._host
    host.SshDirect.execute(f"sudo /sbin/reboot {options} > /dev/null 2>&1 &", timeout=0.1)
    wait_for_predicate(lambda: not host_is_active(host.ip), timeout=20)
def wait_for_redis_to_be_up(self):
    waiter.wait_for_predicate(lambda: self.ping(), timeout=30)
def restart_pod_by_service_name(self, service_name):
    self.delete_pod_by_service_name(service_name)
    wait_for_predicate(
        lambda: self.number_ready_pods_in_deployment(service_name) == 1)