def approve_csrs(kubeconfig_path: str, done: threading.Event):
    log.info("Started background worker to approve CSRs when they appear...")
    while not done.is_set():
        unapproved_csrs = []
        try:
            unapproved_csrs = get_unapproved_csr_names(kubeconfig_path)
        except subprocess.SubprocessError:
            log.debug("Failed to list csrs. This is usually due to API downtime. Retrying")
        except Exception:
            # We're in a thread so it's a bit awkward to stop everything else...
            # Just continue after logging the unexpected exception
            log.exception("Unknown exception while listing csrs")

        for csr_name in unapproved_csrs:
            log.info(f"Found unapproved CSR {csr_name}, approving...")
            try:
                approve_csr(kubeconfig_path, csr_name)
            except subprocess.SubprocessError:
                log.warning("Failed attempt to approve CSR, this may be due to API downtime. Will retry later")
            except Exception:
                # We're in a thread so it's a bit awkward to stop everything else...
                # Just continue after logging the unexpected exception
                log.exception(f"Unknown exception while approving the {csr_name} CSR")

        time.sleep(10)

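# A minimal usage sketch, not from the source: approve_csrs is designed to run in a
# background thread alongside the installation, with the threading.Event used to stop it
# once installation is done. The function name and kubeconfig path below are hypothetical.
def _example_run_csr_approver(kubeconfig_path: str = "/tmp/kubeconfig"):
    done = threading.Event()
    approver = threading.Thread(target=approve_csrs, args=(kubeconfig_path, done), daemon=True)
    approver.start()
    try:
        pass  # ... drive the installation to completion here ...
    finally:
        done.set()       # ask the worker loop to exit
        approver.join()  # it wakes from its sleep within ~10 seconds and returns
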
def waiting_for_installation_completion(
        self, controller: NodeController, cluster_configuration: ClusterConfig, skip_logs=False):
    master_ip = controller.master_ips[0][0]
    try:
        log.info("Configuring /etc/hosts...")
        utils.config_etc_hosts(
            cluster_name=cluster_configuration.cluster_name.get(),
            base_dns_domain=cluster_configuration.base_dns_domain,
            api_vip=master_ip,
        )
        log.info("Waiting for installation to complete...")
        waiting.wait(
            self.all_operators_available,
            sleep_seconds=20,
            timeout_seconds=60 * 60,
            waiting_for="all operators to get up",
        )
        log.info("Installation completed successfully!")
    except Exception:
        log.exception("An unexpected error has occurred while waiting for installation to complete")
        # In case of error, always collect logs
        self.log_collection(master_ip)
        raise
    else:
        # If successful, collect logs unless the caller asked to skip them
        if not skip_logs:
            self.log_collection(master_ip)

def _get_domain_ips_and_macs(domain: libvirt.virDomain) -> Tuple[List[str], List[str]]:
    interfaces_sources = [
        # getting all DHCP leases IPs
        libvirt.VIR_DOMAIN_INTERFACE_ADDRESSES_SRC_LEASE,
        # getting static IPs via ARP
        libvirt.VIR_DOMAIN_INTERFACE_ADDRESSES_SRC_ARP,
    ]

    interfaces = {}
    for addresses_source in interfaces_sources:
        try:
            interfaces.update(**domain.interfaceAddresses(addresses_source))
        except libvirt.libvirtError:
            log.exception("Got an error while updating domain's network addresses")

    ips = []
    macs = []
    log.debug(f"Host {domain.name()} interfaces are {interfaces}")
    if interfaces:
        for (_, val) in interfaces.items():
            if val["addrs"]:
                for addr in val["addrs"]:
                    ips.append(addr["addr"])
                    macs.append(val["hwaddr"])
    if ips:
        log.info("Host %s ips are %s", domain.name(), ips)
    if macs:
        log.info("Host %s macs are %s", domain.name(), macs)
    return ips, macs

def wrapped(*args, **kwargs):
    try:
        return fn(*args, **kwargs)
    except errors as e:
        if message:
            log.exception(message)
        if callback:
            callback(e)
        if silent:
            return
        raise

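# For context, a hedged sketch of the decorator factory that a wrapped() closure like the
# one above would typically sit inside; the factory name handle_errors, its defaults, and
# the usage below are assumptions for illustration, not taken from the source.
import functools

def handle_errors(errors=Exception, message=None, callback=None, silent=False):
    def decorator(fn):
        @functools.wraps(fn)
        def wrapped(*args, **kwargs):
            try:
                return fn(*args, **kwargs)
            except errors as e:
                if message:
                    log.exception(message)
                if callback:
                    callback(e)
                if silent:
                    return
                raise
        return wrapped
    return decorator

# Example: log and swallow SSH errors from a best-effort cleanup step.
@handle_errors(errors=(TimeoutError, RuntimeError), message="cleanup failed", silent=True)
def cleanup_temporary_files(node):
    node.run_command("sudo rm -rf /tmp/sosreport*")
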
def is_cluster_in_status(client, cluster_id, statuses):
    log.info("Is cluster %s in status %s", cluster_id, statuses)
    try:
        cluster_status = client.cluster_get(cluster_id).status
        if cluster_status in statuses:
            return True
        else:
            log.info(f"Cluster not yet in its required status. "
                     f"Current status: {cluster_status}")
            return False
    except BaseException:
        log.exception("Failed to get cluster %s info", cluster_id)

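# A hedged usage sketch: is_cluster_in_status is a polling predicate, typically driven by
# the waiting library already used in this module. The wrapper name and the timeout values
# below are illustrative assumptions, not taken from the source.
def wait_for_cluster_to_be_in_status(client, cluster_id, statuses, timeout_seconds=60 * 60):
    waiting.wait(
        lambda: is_cluster_in_status(client, cluster_id, statuses),
        timeout_seconds=timeout_seconds,
        sleep_seconds=30,
        waiting_for=f"cluster {cluster_id} to be in status {statuses}",
    )
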
def gather_sosreport_from_node(node: Node, destination_dir: str):
    try:
        node.upload_file(SOSREPORT_SCRIPT, "/tmp/man_sosreport.sh")
        node.run_command("chmod a+x /tmp/man_sosreport.sh")
        node.run_command("sudo /tmp/man_sosreport.sh")
        node.download_file(
            "/tmp/sosreport.tar.bz2",
            os.path.join(destination_dir, f"sosreport-{node.name}.tar.bz2"))
    except (TimeoutError, RuntimeError, SSHException, SCPException):
        log.exception("Failed accessing node %s for sosreport data gathering", node)

def wait_and_verify_oc_logs_uploaded(cluster, cluster_tar_path):
    try:
        cluster.wait_for_logs_complete(timeout=OC_DOWNLOAD_LOGS_TIMEOUT,
                                       interval=OC_DOWNLOAD_LOGS_INTERVAL,
                                       check_host_logs_only=False)
        cluster.download_installation_logs(cluster_tar_path)
        assert os.path.exists(cluster_tar_path), f"{cluster_tar_path} doesn't exist"
        _verify_oc_logs_uploaded(cluster_tar_path)
    except BaseException:
        log.exception("oc logs were not uploaded")
        raise

def get_nodes(self, ready: bool = False) -> V1NodeList:
    if self.hypershift_cluster_client is None:
        hypershift_cluster_kubeapi_client = create_kube_api_client(self.kubeconfig_path)
        self.hypershift_cluster_client = CoreV1Api(hypershift_cluster_kubeapi_client)
    try:
        nodes = self.hypershift_cluster_client.list_node()
    except Exception:
        log.exception("Failed listing nodes")
        return V1NodeList()
    if ready:
        return filter_node_by_ready_status(nodes)
    return nodes

def wait_for_controller_logs(client, cluster_id, timeout, interval=60):
    try:
        # if logs_info has any content, the controller is alive and healthy
        waiting.wait(
            lambda: client.cluster_get(cluster_id).logs_info,
            timeout_seconds=timeout,
            sleep_seconds=interval,
            waiting_for="controller logs_info to be filled",
        )
        return True
    except BaseException:
        log.exception("Failed to wait on start of controller logs on cluster %s", cluster_id)
        return False

def collect_debug_info_from_cluster(cluster_deployment, agent_cluster_install, output_folder=None):
    cluster_name = cluster_deployment.ref.name
    if not output_folder:
        output_folder = f"build/{cluster_name}"
    recreate_folder(output_folder)

    aci = agent_cluster_install.get()
    debug_info = aci["status"]["debugInfo"]
    try:
        log.info("Collecting debugInfo events from cluster to %s, debug info %s",
                 output_folder, debug_info)
        fetch_url_and_write_to_file("eventsURL", "events.json", debug_info, output_folder)
        log.info("Collecting debugInfo logs from cluster")
        fetch_url_and_write_to_file("logsURL", "logs.tar", debug_info, output_folder)
    except Exception as err:
        log.exception(f"Failed to collect debug info for cluster {cluster_name} ({err})")

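# For completeness, a hedged sketch of what the fetch_url_and_write_to_file helper used
# above is assumed to do: read a URL out of the debugInfo status block and save its
# contents under output_folder. Not taken from the source; the real helper may differ.
import os
import requests

def _example_fetch_url_and_write_to_file(url_key, file_name, debug_info, output_folder):
    if url_key not in debug_info:
        log.warning("URL %s not found in debug info", url_key)
        return
    response = requests.get(debug_info[url_key], verify=False, timeout=60)
    response.raise_for_status()
    with open(os.path.join(output_folder, file_name), "wb") as f:
        f.write(response.content)
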
def _are_logs_in_status(client, cluster_id, statuses, check_host_logs_only=False):
    try:
        cluster = client.cluster_get(cluster_id)
        hosts = client.get_cluster_hosts(cluster_id)
        cluster_logs_status = cluster.logs_info
        host_logs_statuses = [host.get("logs_info", "") for host in hosts]
        if all(s in statuses for s in host_logs_statuses) and (
                check_host_logs_only or (cluster_logs_status in statuses)):
            log.info("found expected state. cluster logs: %s, host logs: %s",
                     cluster_logs_status, host_logs_statuses)
            return True

        log.info("Cluster logs not yet in their required state. %s, host logs: %s",
                 cluster_logs_status, host_logs_statuses)
        return False
    except BaseException:
        log.exception("Failed to get cluster %s log info", cluster_id)
        return False

def kube_api_test(
    kube_api_context,
    nodes: Nodes,
    cluster_config: ClusterConfig,
    proxy_server=None,
    *,
    is_ipv4=True,
    is_disconnected=False,
):
    cluster_name = cluster_config.cluster_name.get()

    # TODO resolve it from the service if the node controller doesn't have this information
    # (please see cluster.get_primary_machine_cidr())
    machine_cidr = nodes.controller.get_primary_machine_cidr()

    agent_cluster_install = AgentClusterInstall(
        kube_api_client=kube_api_context.api_client,
        name=f"{cluster_name}-agent-cluster-install",
        namespace=global_variables.spoke_namespace,
    )

    secret = Secret(
        kube_api_client=kube_api_context.api_client,
        name=f"{cluster_name}-secret",
        namespace=global_variables.spoke_namespace,
    )
    secret.create(pull_secret=cluster_config.pull_secret)

    cluster_deployment = ClusterDeployment(
        kube_api_client=kube_api_context.api_client,
        name=cluster_name,
        namespace=global_variables.spoke_namespace,
    )
    cluster_deployment.create(
        agent_cluster_install_ref=agent_cluster_install.ref,
        secret=secret,
    )

    agent_cluster_install.create(
        cluster_deployment_ref=cluster_deployment.ref,
        image_set_ref=deploy_image_set(cluster_name, kube_api_context),
        cluster_cidr=cluster_config.cluster_networks[0].cidr,
        host_prefix=cluster_config.cluster_networks[0].host_prefix,
        service_network=cluster_config.service_networks[0].cidr,
        ssh_pub_key=cluster_config.ssh_public_key,
        hyperthreading=cluster_config.hyperthreading,
        control_plane_agents=nodes.controller.params.master_count,
        worker_agents=nodes.controller.params.worker_count,
        machine_cidr=machine_cidr,
    )
    agent_cluster_install.wait_to_be_ready(False)

    if is_disconnected:
        log.info("getting ignition and install config override for disconnected install")
        ca_bundle = get_ca_bundle_from_hub()
        patch_install_config_with_ca_bundle(cluster_deployment, ca_bundle)
        ignition_config_override = get_ignition_config_override(ca_bundle)
    else:
        ignition_config_override = None

    proxy = setup_proxy(cluster_config, machine_cidr, cluster_name, proxy_server)

    infra_env = InfraEnv(
        kube_api_client=kube_api_context.api_client,
        name=f"{cluster_name}-infra-env",
        namespace=global_variables.spoke_namespace,
    )
    infra_env.create(
        cluster_deployment=cluster_deployment,
        ignition_config_override=ignition_config_override,
        secret=secret,
        proxy=proxy,
        ssh_pub_key=cluster_config.ssh_public_key,
    )
    infra_env.status()
    download_iso_from_infra_env(infra_env, cluster_config.iso_download_path)

    log.info("iso downloaded, starting nodes")
    nodes.start_all()

    log.info("waiting for host agent")
    agents = cluster_deployment.wait_for_agents(len(nodes))
    for agent in agents:
        agent.approve()
        set_agent_hostname(nodes[0], agent, is_ipv4)  # Currently only supports single node

    if len(nodes) == 1:
        set_single_node_ip(cluster_deployment, nodes, is_ipv4)

    log.info("Waiting for agent status verification")
    Agent.wait_for_agents_to_install(agents)

    agent_cluster_install.wait_to_be_ready(True)

    log.info("waiting for agent-cluster-install to be in installing state")
    agent_cluster_install.wait_to_be_installing()

    try:
        log.info("installation started, waiting for completion")
        agent_cluster_install.wait_to_be_installed()
        log.info("installation completed successfully")
    except Exception:
        log.exception("Failure during kube-api installation flow:")
        collect_debug_info_from_cluster(cluster_deployment, agent_cluster_install)
        raise  # re-raise so the failure is not swallowed after collecting debug info