def wait_till_specific_host_is_in_stage(
    client,
    cluster_id: str,
    host_name: str,
    stages: List[str],
    nodes_count: int = 1,
    timeout: int = consts.CLUSTER_INSTALLATION_TIMEOUT / 2,
    interval: int = 5,
):
    """Block until the named host reaches one of the given installation stages.

    Polls the host's progress every ``interval`` seconds; on timeout or any
    other failure, logs the host's current stage and re-raises.
    """
    log.info(f"Wait till {host_name} host is in stage {stages}")

    def _host_reached_stage() -> bool:
        # Re-fetch the host each poll so we see fresh progress data.
        host = client.get_host_by_name(cluster_id, host_name)
        return utils.are_host_progress_in_stage([host], stages, nodes_count)

    try:
        waiting.wait(
            _host_reached_stage,
            timeout_seconds=timeout,
            sleep_seconds=interval,
            waiting_for=f"Node to be in of the stage {stages}",
        )
    except BaseException:
        hosts = [client.get_host_by_name(cluster_id, host_name)]
        log.error(
            f"All nodes stages: "
            f"{[host['progress']['current_stage'] for host in hosts]} "
            f"when waited for {stages}"
        )
        raise
def wait_till_cluster_is_in_status(
    client,
    cluster_id,
    statuses: List[str],
    timeout=consts.NODES_REGISTERED_TIMEOUT,
    interval=30,
    break_statuses: List[str] = None,
):
    """Block until the cluster reaches one of ``statuses``.

    ``break_statuses`` are also accepted by the wait, but landing in one of
    them is treated as a failure and raised.  On any failure the cluster's
    current status is logged before re-raising.
    """
    log.info("Wait till cluster %s is in status %s", cluster_id, statuses)
    try:
        if break_statuses:
            # Build a new list: the original `statuses += break_statuses`
            # mutated the caller's list in place.
            statuses = statuses + break_statuses
        waiting.wait(
            lambda: is_cluster_in_status(client=client, cluster_id=cluster_id, statuses=statuses),
            timeout_seconds=timeout,
            sleep_seconds=interval,
            waiting_for=f"Cluster to be in status {statuses}",
        )
        if break_statuses and is_cluster_in_status(client, cluster_id, break_statuses):
            raise BaseException(
                f"Stop installation process, "
                f"cluster is in status {client.cluster_get(cluster_id).status}"
            )
    except BaseException:
        log.error("Cluster status is: %s", client.cluster_get(cluster_id).status)
        raise
def wait_till_at_least_one_host_is_in_stage(
    client,
    cluster_id,
    stages,
    nodes_count=1,
    timeout=consts.CLUSTER_INSTALLATION_TIMEOUT / 2,
    interval=consts.DEFAULT_CHECK_STATUSES_INTERVAL,
):
    """Block until at least ``nodes_count`` cluster hosts reach one of ``stages``.

    On timeout or failure, logs every host's current stage and re-raises.
    """
    log.info(f"Wait till {nodes_count} node is in stage {stages}")

    def _enough_hosts_in_stage() -> bool:
        return utils.are_host_progress_in_stage(
            client.get_cluster_hosts(cluster_id),
            stages,
            nodes_count,
        )

    try:
        waiting.wait(
            _enough_hosts_in_stage,
            timeout_seconds=timeout,
            sleep_seconds=interval,
            waiting_for=f"Node to be in of the stage {stages}",
        )
    except BaseException:
        hosts = client.get_cluster_hosts(cluster_id)
        log.error(
            f"All nodes stages: "
            f"{[host['progress']['current_stage'] for host in hosts]} "
            f"when waited for {stages}"
        )
        raise
def _are_hosts_in_status(hosts, nodes_count, statuses, status_info="", fall_on_error_status=True): hosts_in_status = [ host for host in hosts if (host["status"] in statuses and host["status_info"].startswith(status_info)) ] if len(hosts_in_status) >= nodes_count: return True elif fall_on_error_status and len( [host for host in hosts if host["status"] == consts.NodesStatus.ERROR]) > 0: hosts_in_error = [(i, host["id"], host["requested_hostname"], host["role"], host["status"], host["status_info"]) for i, host in enumerate(hosts, start=1) if host["status"] == consts.NodesStatus.ERROR] log.error( "Some of the hosts are in insufficient or error status. Hosts in error %s", hosts_in_error) raise InstallationFailedError() log.info( "Asked hosts to be in one of the statuses from %s and currently hosts statuses are %s", statuses, host_statuses(hosts), ) return False
def validate_dns(client, cluster_id):
    """Verify the cluster's api/ingress DNS records resolve to its VIPs.

    Skipped when no managed DNS domains were requested (in that case 'set_dns'
    using dnsmasq is invoked after nodes_flow).  Raises when either record is
    missing or resolves to an address other than the cluster's VIP.
    """
    if not args.managed_dns_domains:
        # 'set_dns' (using dnsmasq) is invoked after nodes_flow
        return

    cluster = client.cluster_get(cluster_id)
    api_address = "api.{}.{}".format(cluster.name, cluster.base_dns_domain)
    ingress_address = "ingress.apps.{}.{}".format(cluster.name, cluster.base_dns_domain)
    log.info(
        "Validating resolvability of the following domains: %s -> %s, %s -> %s",
        api_address,
        cluster.api_vip,
        ingress_address,
        cluster.ingress_vip,
    )
    try:
        api_answers = dns.resolver.resolve(api_address, "A")
        ingress_answers = dns.resolver.resolve(ingress_address, "A")
        api_vip = str(api_answers[0])
        ingress_vip = str(ingress_answers[0])

        if api_vip != cluster.api_vip or ingress_vip != cluster.ingress_vip:
            raise Exception("DNS domains are not resolvable")

        log.info("DNS domains are resolvable")
    except Exception:
        log.error("Failed to resolve DNS domains")
        # Bare `raise` re-raises with the full original traceback; the old
        # `raise e` truncated the traceback to this frame.
        raise
def are_libvirt_nodes_in_cluster_hosts(client, cluster_id, num_nodes):
    """Best-effort check that at least ``num_nodes`` libvirt nodes registered
    with the cluster (i.e. reported a non-empty MAC address).

    Returns False (rather than raising) when the MACs cannot be fetched, so
    the check can be polled.
    """
    try:
        hosts_macs = client.get_hosts_id_with_macs(cluster_id)
    except BaseException:
        # Dropped the unused `as e` binding; matches the day-2 sibling method.
        log.error("Failed to get nodes macs for cluster: %s", cluster_id)
        return False
    num_macs = len([mac for mac in hosts_macs if mac != ""])
    return num_macs >= num_nodes
def apply(self, refresh: bool = True) -> None:
    """Run `terraform apply` (plan skipped) and raise on a non-zero exit."""
    rc, output, err = self.tf.apply(no_color=IsFlagged, refresh=refresh, input=False, skip_plan=True)
    if rc == 0:
        return
    message = f"Terraform apply failed with return value {rc}, output {output} , error {err}"
    log.error(message)
    raise Exception(message)
def are_libvirt_nodes_in_cluster_hosts(self) -> bool:
    """Best-effort check that every day-2 worker registered a non-empty MAC.

    Returns False (instead of raising) when the MACs cannot be fetched,
    so the check can be polled.
    """
    try:
        hosts_macs = self.api_client.get_hosts_id_with_macs(
            self.config.cluster_id)
    except BaseException:
        log.error("Failed to get nodes macs for cluster: %s", self.config.cluster_id)
        return False
    registered = sum(1 for mac in hosts_macs if mac != "")
    return registered >= self.config.day2_workers_count
def get_assisted_controller_status(kubeconfig):
    """Return the raw `oc get pods` output (bytes) for the assisted-installer
    controller job, or b"" if the command fails.
    """
    log.info("Getting controller status")
    # NOTE(review): shell=True with an interpolated kubeconfig path — fine for
    # trusted paths, but would need quoting if the path could contain spaces.
    command = (
        f"oc --insecure-skip-tls-verify --kubeconfig={kubeconfig} --no-headers=true -n assisted-installer "
        f"get pods -l job-name=assisted-installer-controller"
    )
    result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
    if result.returncode != 0:
        log.error(f"failed to get controller status: {result.stderr}")
        return b""

    log.info(f"{result.stdout}")
    return result.stdout
def render_worker_live_iso_ignition(install_device: str):
    """
    Render the ignition embedded in the worker live ISO.

    That ignition is responsible for:
    - Copying the worker.ign file into the live filesystem
    - Creating a one-shot systemd unit service which runs coreos-installer with the worker.ign
    - Rebooting the node once the operating system has been written to disk

    The worker then starts with the installed RHCOS+worker.ign and attempts to join the cluster

    We cannot simply boot the live ISO with worker.ign as its own ignition: worker.ign is
    meant to be written to disk along with RHCOS, not to configure the live ISO. The live
    ISO phase is only a temporary operating system in which a user would normally shell in
    and run coreos-installer by hand; this template automates that manual process and
    reboots the node for us.

    @param install_device The path of the disk to install RHCOS on (e.g. /dev/vda)
    """
    template_path = os.path.join(RESOURCES_DIR, WORKER_LIVE_IGNITION_TEMPLATE)
    with open(template_path, "r") as template_file:
        template_text = template_file.read()

    with open(os.path.join(RESOURCES_DIR, WORKER_INSTALL_SCRIPT), "rb") as script_file:
        install_script = script_file.read()

    try:
        with open(os.path.join(IBIP_DIR, "worker.ign"), "rb") as ignition_file:
            worker_ignition = ignition_file.read()
    except FileNotFoundError:
        log.error(
            "The worker.ign file is only generated in OCP 4.11 and above, "
            "this test is not meant to run on earlier versions")
        raise

    # Register a filter that base64-encodes bytes and yields a utf-8 string,
    # so the binary file contents can be embedded in the rendered ignition.
    jinja2.filters.FILTERS["b64encode_utf8"] = lambda data: base64.b64encode(
        data).decode("utf-8")

    return jinja2.Template(template_text).render(
        ssh_public_key=os.environ["SSH_PUB_KEY"],
        worker_ign_contents=worker_ignition,
        install_sh_contents=install_script,
        install_device=install_device,
    )
def wait_till_nodes_are_ready(nodes_count, network_name):
    """Block until ``nodes_count`` nodes hold DHCP leases on the network,
    i.e. have booted and been assigned IPs.  Logs the current leases and
    re-raises on timeout or failure.
    """
    log.info("Wait till %s nodes will be ready and have ips", nodes_count)
    try:
        waiting.wait(
            lambda: len(get_network_leases(network_name)) >= nodes_count,
            timeout_seconds=consts.NODES_REGISTERED_TIMEOUT * nodes_count,
            sleep_seconds=10,
            waiting_for="Nodes to have ips",
        )
    except BaseException:
        log.error(
            "Not all nodes are ready. Current dhcp leases are %s",
            get_network_leases(network_name),
        )
        raise
    else:
        log.info("All nodes have booted and got ips")
def get_libvirt_nodes_mac_role_ip_and_name(network_name):
    """Map each libvirt DHCP lease's MAC to its ip, hostname, and role.

    Role is WORKER when the worker role name appears in the hostname,
    MASTER otherwise.  Logs the leases and re-raises on failure.
    """
    try:
        result = {}
        for lease in get_network_leases(network_name):
            hostname = lease["hostname"]
            if consts.NodeRoles.WORKER in hostname:
                role = consts.NodeRoles.WORKER
            else:
                role = consts.NodeRoles.MASTER
            result[lease["mac"]] = {
                "ip": lease["ipaddr"],
                "name": hostname,
                "role": role,
            }
        return result
    except BaseException:
        log.error(
            "Failed to get nodes macs from libvirt. Output is %s",
            get_network_leases(network_name),
        )
        raise
def wait_for_logs_complete(client, cluster_id, timeout, interval=60, check_host_logs_only=False):
    """Block until cluster log collection finishes ('completed' or 'timeout').

    Logs and re-raises when the wait itself expires.
    """
    log.info("wait till logs of cluster %s are collected (or timed-out)", cluster_id)
    statuses = ["completed", "timeout"]

    def _logs_done() -> bool:
        return _are_logs_in_status(
            client=client,
            cluster_id=cluster_id,
            statuses=statuses,
            check_host_logs_only=check_host_logs_only,
        )

    try:
        waiting.wait(
            _logs_done,
            timeout_seconds=timeout,
            sleep_seconds=interval,
            waiting_for=f"Logs to be in status {statuses}",
        )
    except BaseException:
        log.error("waiting for logs expired after %d", timeout)
        raise
    else:
        log.info("logs are in expected state")
def are_hosts_in_status(hosts, nodes_count, statuses, status_info="", fall_on_error_status=True):
    """Return True when at least ``nodes_count`` hosts are in one of ``statuses``
    (with ``status_info`` as a prefix of their status_info).

    When ``fall_on_error_status`` is set and any host is in ERROR status,
    raises instead of returning False.
    """
    matched = [
        host
        for host in hosts
        if host["status"] in statuses and host["status_info"].startswith(status_info)
    ]
    if len(matched) >= nodes_count:
        return True

    if fall_on_error_status:
        errored = [
            (idx, host["id"], host["requested_hostname"], host["role"],
             host["status"], host["status_info"])
            for idx, host in enumerate(hosts, start=1)
            if host["status"] == consts.NodesStatus.ERROR
        ]
        if errored:
            log.error("Some of the hosts are in insufficient or error status. Hosts in error %s", errored)
            raise Exception("All the nodes must be in valid status, but got some in error")

    log.info(
        "Asked hosts to be in one of the statuses from %s and currently hosts statuses are %s",
        statuses,
        [
            (idx, host["id"], host.get("requested_hostname"), host.get("role"),
             host["status"], host["status_info"])
            for idx, host in enumerate(hosts, start=1)
        ],
    )
    return False
def add_interface(self, node_name, network_name, target_interface):
    """
    Create an interface using given network name, return created interface's mac address.

    The new MAC is identified by diffing the network's leases before and
    after attaching the interface.
    Note: Do not use the same network for different tests
    """
    log.info(
        f"Creating new interface attached to network: {network_name}, for node: {node_name}"
    )
    known_macs = [lease["mac"] for lease in self.list_leases(network_name)]

    command = f"virsh attach-interface {node_name} network {network_name} --target {target_interface} --persistent"
    utils.run_command(command)

    try:
        waiting.wait(
            lambda: len(self.list_leases(network_name)) > len(known_macs),
            timeout_seconds=30,
            sleep_seconds=2,
            waiting_for="Wait for network lease",
        )
    except waiting.exceptions.TimeoutExpired:
        log.error("Network lease wasnt found for added interface")
        raise

    # The first lease whose MAC was not present before the attach belongs to
    # the new interface.
    mac_address = ""
    for lease in self.list_leases(network_name):
        if lease["mac"] not in known_macs:
            mac_address = lease["mac"]
            break

    log.info(
        f"Successfully attached interface, network: {network_name}, mac: {mac_address}, for node:"
        f" {node_name}")
    return mac_address