def installer_gather(self, ip: str, ssh_key: Path, out_dir: str):
    stdout, stderr, _ret = utils.run_command(
        f"{INSTALLER_BINARY} gather bootstrap --log-level debug --bootstrap {ip} --master {ip} --key {str(ssh_key)}"
    )

    with open(INSTALLER_GATHER_DEBUG_STDOUT, "w") as f:
        f.write(stdout)

    with open(INSTALLER_GATHER_DEBUG_STDERR, "w") as f:
        f.write(stderr)

    matches = re.compile(r'.*logs captured here "(.*)".*').findall(stderr)

    if len(matches) == 0:
        log.warning(f"It seems like installer-gather didn't generate any bundles, stderr: {stderr}")
        return

    bundle_file_path, *_ = matches
    log.info(f"Found installer-gather bundle at path {bundle_file_path}")

    utils.run_command_with_output(f"tar -xzf {bundle_file_path} -C {out_dir}")

    if os.path.exists(bundle_file_path):
        os.remove(bundle_file_path)

def approve_csrs(kubeconfig_path: str, done: threading.Event):
    log.info("Started background worker to approve CSRs when they appear...")
    while not done.is_set():
        unapproved_csrs = []
        try:
            unapproved_csrs = get_unapproved_csr_names(kubeconfig_path)
        except subprocess.SubprocessError:
            log.debug("Failed to list CSRs. This is usually due to API downtime. Retrying")
        except Exception:
            # We're in a thread, so it's a bit awkward to stop everything else...
            # Just continue after logging the unexpected exception
            log.exception("Unknown exception while listing CSRs")

        for csr_name in unapproved_csrs:
            log.info(f"Found unapproved CSR {csr_name}, approving...")
            try:
                approve_csr(kubeconfig_path, csr_name)
            except subprocess.SubprocessError:
                log.warning("Failed attempt to approve CSR, this may be due to API downtime. Will retry later")
            except Exception:
                # We're in a thread, so it's a bit awkward to stop everything else...
                # Just continue after logging the unexpected exception
                log.exception(f"Unknown exception while approving the {csr_name} CSR")

        time.sleep(10)

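# The get_unapproved_csr_names/approve_csr helpers used above are not shown in
# this section. A minimal sketch of what they might look like, assuming they
# shell out to `oc` (the helper implementations are assumptions; only the names
# and signatures are taken from the caller above):

import json
import subprocess
from typing import List


def get_unapproved_csr_names(kubeconfig_path: str) -> List[str]:
    # Pending CSRs have no Approved/Denied condition in their status yet
    output = subprocess.check_output(
        ["oc", "--kubeconfig", kubeconfig_path, "get", "csr", "-o", "json"],
        text=True,
    )
    csrs = json.loads(output)["items"]
    return [
        csr["metadata"]["name"]
        for csr in csrs
        if not csr.get("status", {}).get("conditions")
    ]


def approve_csr(kubeconfig_path: str, csr_name: str) -> None:
    # `oc adm certificate approve` marks the CSR approved server-side
    subprocess.check_call(
        ["oc", "--kubeconfig", kubeconfig_path, "adm", "certificate", "approve", csr_name]
    )
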
def fetch_url_and_write_to_file(url_key, file_name, debug_info, output_folder):
    if url_key in debug_info:
        logs_url = debug_info[url_key]
        content = fetch_url(logs_url)
        output_file = os.path.join(output_folder, file_name)
        with open(output_file, "wb") as _file:
            _file.write(content)
    else:
        log.warning(f"{url_key} is not available")

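# `fetch_url` is referenced above but not shown here. A minimal sketch, assuming
# it simply downloads the URL and returns the raw bytes (retry behavior, timeout
# value, and TLS handling are assumptions):

import requests


def fetch_url(url: str) -> bytes:
    # Raise on HTTP errors so callers can catch and log the failure
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    return response.content
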
def download_must_gather(kubeconfig: str, dest_dir: str):
    log.info(f"Downloading must-gather to {dest_dir}")
    command = (
        f"oc --insecure-skip-tls-verify --kubeconfig={kubeconfig} adm must-gather"
        f" --dest-dir {dest_dir} > {dest_dir}/must-gather.log"
    )
    try:
        run_command(command, shell=True, raise_errors=True)
    except RuntimeError as ex:
        log.warning(f"Failed to run must-gather: {ex}")

def try_get_cluster():
    if args.cluster_id:
        try:
            client = ClientFactory.create_client(
                url=utils.get_assisted_service_url_by_args(args=args),
                offline_token=utils.get_env("OFFLINE_TOKEN"),
            )
            return client, client.cluster_get(cluster_id=args.cluster_id)
        except ApiException as e:
            log.warning(f"Can't retrieve cluster_id={args.cluster_id}, {e}")

    return None, None

def handle_trigger(self, conditions_string: List[str], values: Dict[str, Any]) -> None:
    for k, v in values.items():
        if not hasattr(self, k):
            continue

        if not self.is_user_set(k):
            log.debug(
                f"{self.__class__.__name__} - Trigger set `{k}` to `{v}`, Condition: {conditions_string}"
            )
            self._set(k, v)
        else:
            log.warning(f"Skipping setting {k} to value {v} because it was already set by the user")

def worker_ready() -> bool:
    try:
        node_readiness_map = get_nodes_readiness(KUBE_CONFIG)
    except subprocess.SubprocessError:
        log.debug("Failed to list nodes. This is usually due to API downtime. Retrying")
        return False

    if f"{CLUSTER_PREFIX}-master-0" not in node_readiness_map:
        log.warning("Couldn't find master in node status list, this should not happen")
        return False

    if f"{CLUSTER_PREFIX}-worker-0" not in node_readiness_map:
        return False

    return all(node_status for node_status in node_readiness_map.values())

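# `get_nodes_readiness` is assumed to map node names to a boolean "Ready" state.
# A minimal sketch, assuming it shells out to `oc get nodes -o json` and inspects
# each node's Ready condition (the name and signature come from the caller above;
# the implementation itself is an assumption):

import json
import subprocess
from typing import Dict


def get_nodes_readiness(kubeconfig_path: str) -> Dict[str, bool]:
    output = subprocess.check_output(
        ["oc", "--kubeconfig", kubeconfig_path, "get", "nodes", "-o", "json"],
        text=True,
    )
    nodes = json.loads(output)["items"]
    return {
        node["metadata"]["name"]: any(
            condition["type"] == "Ready" and condition["status"] == "True"
            for condition in node["status"]["conditions"]
        )
        for node in nodes
    }
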
def collect_debug_info_from_cluster(cluster_deployment, agent_cluster_install):
    cluster_name = cluster_deployment.ref.name
    output_folder = f"build/{cluster_name}"
    recreate_folder(output_folder)

    aci = agent_cluster_install.get()
    debug_info = aci["status"]["debugInfo"]
    try:
        log.info("Collecting debugInfo (events/logs) from cluster")
        fetch_url_and_write_to_file("eventsURL", "events.json", debug_info, output_folder)
        fetch_url_and_write_to_file("logsURL", "logs.tar", debug_info, output_folder)
    except Exception as err:
        log.warning(f"Failed to collect debug info for cluster {cluster_name} ({err})")

def start_node(self, node_name, check_ips=True):
    log.info("Going to power-on %s, check ips flag %s", node_name, check_ips)
    node = self.libvirt_connection.lookupByName(node_name)

    if not node.isActive():
        try:
            node.create()
            if check_ips:
                self._wait_till_domain_has_ips(node)
        except waiting.exceptions.TimeoutExpired:
            log.warning("Node %s failed to receive an IP, retrying", node_name)
            self.shutdown_node(node_name)
            node.create()
            if check_ips:
                self._wait_till_domain_has_ips(node)

def _connect_to_load_balancer(self, load_balancer_ip: str) -> bool:
    family = socket.AF_INET6 if ":" in load_balancer_ip else socket.AF_INET

    try:
        with socket.socket(family, socket.SOCK_STREAM) as s:
            s.connect((load_balancer_ip, consts.DEFAULT_LOAD_BALANCER_PORT))
            log.info(
                f"Successfully connected to load balancer "
                f"{load_balancer_ip}:{consts.DEFAULT_LOAD_BALANCER_PORT}"
            )
        return True
    except Exception as e:
        log.warning(
            "Could not connect to load balancer endpoint %s: %s",
            self._render_socket_endpoint(load_balancer_ip, consts.DEFAULT_LOAD_BALANCER_PORT),
            e,
        )
        return False

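# `_render_socket_endpoint` is assumed to format host:port pairs, bracketing
# IPv6 addresses. A minimal sketch, inferred from the IPv4/IPv6 handling above
# (the implementation is an assumption; shown as a plain function for brevity):

def _render_socket_endpoint(ip: str, port: int) -> str:
    # IPv6 literals need brackets so the port separator is unambiguous
    return f"[{ip}]:{port}" if ":" in ip else f"{ip}:{port}"
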
def set_hosts_roles(client, cluster, nodes_details, machine_net, tf, master_count, static_network_mode):
    networks_names = (
        nodes_details["libvirt_network_name"],
        nodes_details["libvirt_secondary_network_name"],
    )

    # don't set roles in bootstrap-in-place (BIP) mode
    if not machine_net.has_ip_v6:
        libvirt_nodes = get_libvirt_nodes_mac_role_ip_and_name(networks_names[0])
        libvirt_nodes.update(get_libvirt_nodes_mac_role_ip_and_name(networks_names[1]))

        if static_network_mode:
            log.info("Setting hostnames when running in static network config mode")
            update_hostnames = True
        else:
            update_hostnames = False
    else:
        log.warning("Working around Terraform not setting hostnames of IPv6 hosts via libvirt")
        libvirt_nodes = utils.get_libvirt_nodes_from_tf_state(networks_names, tf.get_state())
        update_hostnames = True

    utils.update_hosts(client, cluster.id, libvirt_nodes,
                       update_hostnames=update_hostnames,
                       update_roles=master_count > 1)

def ssh_connection(self):
    if not self.ips:
        raise RuntimeError(f"No available IPs for node {self.name}")

    log.info("Trying to access through IP addresses: %s", ", ".join(self.ips))
    exception = None
    for ip in self.ips:
        try:
            connection = ssh.SshConnection(
                ip, private_ssh_key_path=self.private_ssh_key_path, username=self.username
            )
            connection.connect()
            return connection
        except (TimeoutError, SCPException) as e:
            log.warning("Could not SSH through IP %s: %s", ip, str(e))
            exception = e

    if exception is not None:
        raise exception

def worker_installation(self, controller: TerraformController, cluster_configuration: ClusterConfig):
    controller.start_node(node_name=f"{CLUSTER_PREFIX}-worker-0")

    # Start a background worker to approve CSRs
    approve_csr_worker_done = threading.Event()
    approve_csr_worker = threading.Thread(
        target=self.approve_csrs,
        args=(KUBE_CONFIG, approve_csr_worker_done),
        # Don't hang if this thread is still running for some reason
        daemon=True,
    )
    approve_csr_worker.start()

    try:
        self.waiting_for_added_worker(controller)
    finally:
        approve_csr_worker_done.set()

    approve_csr_worker.join(timeout=10)
    if approve_csr_worker.is_alive():
        log.warning("CSR thread is still running for some reason")

def __init__(self, inventory_url: str, type: str, offline_token: str):
    self.client = ClientFactory.create_client(url=inventory_url, offline_token=offline_token)

    with open("src/manage/manageable_options.yaml", "r") as f:
        options = yaml.load(f, Loader=yaml.FullLoader)

    manage_config = options.get(type, None)
    if not manage_config:
        raise ValueError(f"{type} is not a valid manageable_options option")

    days_back = manage_config["days_back"]
    measure_field = manage_config["measure_field"]

    clusters = self.get_clusters()
    clusters_to_process = [
        cluster["id"] for cluster in clusters
        if is_older_then(cluster[measure_field], days_back)
    ]

    log.info(f"Running {type} of {len(clusters_to_process)} clusters")
    if not query_yes_no():
        return

    method = getattr(self.client, manage_config["method"])
    for cluster_id in clusters_to_process:
        try:
            method(cluster_id=cluster_id)
        except ApiException as e:
            log.warning(f"Can't process cluster_id={cluster_id}, {e}")

def download_must_gather(kubeconfig: str, dest_dir: str):
    must_gather_dir = f"{dest_dir}/must-gather-dir"
    os.mkdir(must_gather_dir)

    log.info(f"Downloading must-gather to {must_gather_dir}, kubeconfig {kubeconfig}")
    command = (
        f"oc --insecure-skip-tls-verify --kubeconfig={kubeconfig} adm must-gather"
        f" --dest-dir {must_gather_dir} > {must_gather_dir}/must-gather.log"
    )
    try:
        run_command(command, shell=True, raise_errors=True)
    except RuntimeError as ex:
        log.warning(f"Failed to run must-gather: {ex}")

    log.debug("Archiving %s...", must_gather_dir)
    with tarfile.open(f"{dest_dir}/must-gather.tar", "w:gz") as tar:
        tar.add(must_gather_dir, arcname=os.path.sep)

    log.debug("Removing must-gather directory %s after we archived it", must_gather_dir)
    shutil.rmtree(must_gather_dir)

def attach_disk_flags(persistent):
    modified_nodes = set()

    def attach(node, disk_size, bootable=False, with_wwn=False):
        nonlocal modified_nodes
        node.attach_test_disk(disk_size, bootable=bootable, persistent=persistent, with_wwn=with_wwn)
        modified_nodes.add(node)

    yield attach

    if global_variables.test_teardown:
        for modified_node in modified_nodes:
            try:
                modified_node.detach_all_test_disks()
                log.info(f"Successfully detached test disks from node {modified_node.name}")
            except (libvirt.libvirtError, FileNotFoundError):
                log.warning(f"Failed to detach test disks from node {modified_node.name}")

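# attach_disk_flags is a generator in fixture style: it yields a callable and
# runs teardown afterwards. A minimal usage sketch, assuming it is wired up as a
# pytest fixture (the fixture registration, the `node` fixture, and the disk
# size are assumptions):

import pytest


@pytest.fixture
def attach_disk_persistent():
    # Delegate to the generator so its teardown runs after the test
    yield from attach_disk_flags(persistent=True)


def test_attach_bootable_disk(attach_disk_persistent, node):
    # Attach a 10 GiB bootable test disk; the fixture detaches it on teardown
    attach_disk_persistent(node, disk_size=10 * 1024**3, bootable=True)
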
def _collect_virsh_logs(cls, nodes: Nodes, log_dir_name):
    log.info("Collecting virsh logs\n")
    os.makedirs(log_dir_name, exist_ok=True)
    virsh_log_path = os.path.join(log_dir_name, "libvirt_logs")
    os.makedirs(virsh_log_path, exist_ok=False)

    libvirt_list_path = os.path.join(virsh_log_path, "virsh_list")
    utils.run_command(f"virsh list --all >> {libvirt_list_path}", shell=True)

    libvirt_net_list_path = os.path.join(virsh_log_path, "virsh_net_list")
    utils.run_command(f"virsh net-list --all >> {libvirt_net_list_path}", shell=True)

    network_name = nodes.get_cluster_network()
    virsh_leases_path = os.path.join(virsh_log_path, "net_dhcp_leases")
    utils.run_command(f"virsh net-dhcp-leases {network_name} >> {virsh_leases_path}", shell=True)

    messages_log_path = os.path.join(virsh_log_path, "messages.log")
    try:
        shutil.copy("/var/log/messages", messages_log_path)
    except FileNotFoundError:
        log.warning("Failed to copy /var/log/messages, file does not exist")

    qemu_libvirt_path = os.path.join(virsh_log_path, "qemu_libvirt_logs")
    os.makedirs(qemu_libvirt_path, exist_ok=False)
    for node in nodes:
        try:
            shutil.copy(f"/var/log/libvirt/qemu/{node.name}.log",
                        f"{qemu_libvirt_path}/{node.name}-qemu.log")
        except FileNotFoundError:
            log.warning(f"Failed to copy {node.name} qemu log, file does not exist")

    console_log_path = os.path.join(virsh_log_path, "console_logs")
    os.makedirs(console_log_path, exist_ok=False)
    for node in nodes:
        try:
            shutil.copy(f"/var/log/libvirt/qemu/{node.name}-console.log",
                        f"{console_log_path}/{node.name}-console.log")
        except FileNotFoundError:
            log.warning(f"Failed to copy {node.name} console log, file does not exist")

    libvirtd_log_path = os.path.join(virsh_log_path, "libvirtd_journal")
    utils.run_command(
        f'journalctl --since "{nodes.setup_time}" '
        f"-u libvirtd -D /run/log/journal >> {libvirtd_log_path}",
        shell=True,
    )

def folder_exists(file_path):
    folder = Path(file_path).parent
    if not folder.exists():
        log.warning("Directory %s doesn't exist. Please create it", folder)
        return False
    return True

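# Note that folder_exists checks the *parent* directory of the given path, so it
# validates an output file's location before writing. A short usage sketch (the
# path is a hypothetical example):

if folder_exists("/tmp/artifacts/report.json"):
    with open("/tmp/artifacts/report.json", "w") as f:
        f.write("{}")
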
    const=True,
    default=False,
)
parser.add_argument(
    "--hyperthreading",
    help="nodes cpu hyperthreading mode",
    type=str,
    nargs='?',
    const='all',
    default=None,
)
parser.add_argument(
    "--kube-api",
    help="Should kube-api interface be used for cluster deployment",
    type=distutils.util.strtobool,
    nargs='?',
    const=True,
    default=False,
)
oc_utils.extend_parser_with_oc_arguments(parser)
args = parser.parse_args()

if not args.pull_secret:
    raise ValueError("Can't install cluster without a pull secret, please provide one")

if args.master_count == 1:
    log.warning("Master count is 1, setting workers to 0")
    args.number_of_workers = 0

main()

def download_logs(
    client: InventoryClient,
    cluster: dict,
    dest: str,
    must_gather: bool,
    update_by_events: bool = False,
    retry_interval: int = RETRY_INTERVAL,
):
    if "hosts" not in cluster or len(cluster["hosts"]) == 0:
        cluster["hosts"] = client.get_cluster_hosts(cluster_id=cluster["id"])

    output_folder = get_logs_output_folder(dest, cluster)
    if not is_update_needed(output_folder, update_by_events, client, cluster):
        log.info(f"Skipping, no need to update {output_folder}.")
        return

    recreate_folder(output_folder)
    recreate_folder(os.path.join(output_folder, "cluster_files"))

    try:
        write_metadata_file(client, cluster, os.path.join(output_folder, "metadata.json"))

        with SuppressAndLog(requests.exceptions.RequestException, ConnectionError, KeyboardInterrupt):
            client.download_metrics(os.path.join(output_folder, "metrics.txt"))

        for cluster_file in (
            "bootstrap.ign",
            "master.ign",
            "worker.ign",
            "install-config.yaml",
        ):
            with SuppressAndLog(assisted_service_client.rest.ApiException, KeyboardInterrupt):
                client.download_and_save_file(
                    cluster["id"], cluster_file,
                    os.path.join(output_folder, "cluster_files", cluster_file))

        with SuppressAndLog(assisted_service_client.rest.ApiException, KeyboardInterrupt):
            download_manifests(client, cluster["id"], output_folder)

        infra_env_list = set()
        for host_id, infra_env_id in map(lambda host: (host["id"], host["infra_env_id"]), cluster["hosts"]):
            with SuppressAndLog(assisted_service_client.rest.ApiException, KeyboardInterrupt):
                client.download_host_ignition(
                    infra_env_id, host_id, os.path.join(output_folder, "cluster_files"))

            if infra_env_id not in infra_env_list:
                infra_env_list.add(infra_env_id)
                with SuppressAndLog(assisted_service_client.rest.ApiException, KeyboardInterrupt):
                    client.download_infraenv_events(
                        infra_env_id, get_infraenv_events_path(infra_env_id, output_folder))

        with SuppressAndLog(assisted_service_client.rest.ApiException, KeyboardInterrupt):
            client.download_cluster_events(cluster["id"], get_cluster_events_path(cluster, output_folder))
            shutil.copy2(os.path.join(os.path.dirname(os.path.realpath(__file__)), "events.html"), output_folder)

        with SuppressAndLog(assisted_service_client.rest.ApiException, KeyboardInterrupt):
            are_masters_in_configuring_state = are_host_progress_in_stage(
                cluster["hosts"], [HostsProgressStages.CONFIGURING], 2)
            are_masters_in_join_or_done_state = are_host_progress_in_stage(
                cluster["hosts"], [HostsProgressStages.JOINED, HostsProgressStages.DONE], 2)
            max_retries = MUST_GATHER_MAX_RETRIES if are_masters_in_join_or_done_state else MAX_RETRIES
            is_controller_expected = cluster["status"] == ClusterStatus.INSTALLED or are_masters_in_configuring_state
            min_number_of_logs = min_number_of_log_files(cluster, is_controller_expected)

            for i in range(max_retries):
                cluster_logs_tar = os.path.join(output_folder, f"cluster_{cluster['id']}_logs.tar")

                with suppress(FileNotFoundError):
                    os.remove(cluster_logs_tar)

                client.download_cluster_logs(cluster["id"], cluster_logs_tar)
                try:
                    verify_logs_uploaded(
                        cluster_logs_tar,
                        min_number_of_logs,
                        installation_success=(cluster["status"] == ClusterStatus.INSTALLED),
                        check_oc=are_masters_in_join_or_done_state,
                    )
                    break
                except AssertionError as ex:
                    log.warning("Cluster logs verification failed: %s", ex)

                    # Skip sleeping on the last retry
                    if i < max_retries - 1:
                        log.info(f"Going to retry in {retry_interval} seconds")
                        time.sleep(retry_interval)

        kubeconfig_path = os.path.join(output_folder, "kubeconfig-noingress")

        with SuppressAndLog(assisted_service_client.rest.ApiException):
            client.download_kubeconfig_no_ingress(cluster["id"], kubeconfig_path)

        if must_gather:
            config_etc_hosts(
                cluster["name"],
                cluster["base_dns_domain"],
                client.get_api_vip(cluster, cluster["id"]),
            )
            download_must_gather(kubeconfig_path, output_folder)

    finally:
        run_command(f"chmod -R ugo+rx '{output_folder}'")
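
# SuppressAndLog, used throughout download_logs, is not shown in this section.
# A minimal sketch, assuming it behaves like contextlib.suppress but logs the
# exception it swallows (the logging level and message format are assumptions;
# `log` is the module logger):

from contextlib import AbstractContextManager


class SuppressAndLog(AbstractContextManager):
    def __init__(self, *exceptions):
        self._exceptions = exceptions

    def __exit__(self, exc_type, exc_value, traceback):
        # Returning True suppresses the exception, mirroring contextlib.suppress
        if exc_type is not None and issubclass(exc_type, self._exceptions):
            log.warning("Suppressed exception: %s", exc_value)
            return True
        return False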