def _create_user_file_for_auth(self):
    if self.authenticated:
        create_user_file_cmd = (
            f"htpasswd -b -c {self.config_dir_path}/squid-users {self.PROXY_USER} {self.PROXY_USER_PASS}"
        )
        utils.run_command(create_user_file_cmd, shell=True)
        self.user_file_path = f"{self.config_dir_path}/squid-users"
def add_disk_bootflag(disk_path):
    command = f"virt-format -a {disk_path} --partition=mbr"
    utils.run_command(command, shell=True, env={**os.environ, "LIBGUESTFS_BACKEND": "direct"})
def download_must_gather(kubeconfig: str, dest_dir: str): log.info(f"Downloading must-gather to {dest_dir}") command = ( f"oc --insecure-skip-tls-verify --kubeconfig={kubeconfig} adm must-gather" f" --dest-dir {dest_dir} > {dest_dir}/must-gather.log") try: run_command(command, shell=True, raise_errors=True) except RuntimeError as ex: log.warning(f"Failed to run must gather: {ex}")
def download_live_image(download_path):
    if os.path.exists(download_path):
        logging.info("Image %s already exists, skipping download", download_path)
        return

    logging.info("Downloading iso to %s", download_path)
    # TODO: enable fetching the appropriate rhcos image
    utils.run_command(
        f"curl https://mirror.openshift.com/pub/openshift-v4/dependencies/rhcos/4.8/4.8.2/rhcos-live.x86_64.iso"
        f" --retry 10 --retry-connrefused -o {download_path} --continue-at -"
    )
def get_nodes(self) -> Callable[[BaseTerraformConfig, ClusterConfig], Nodes]:
    """Currently supports only a single instance of nodes."""
    nodes_data = dict()

    @JunitTestCase()
    def get_nodes_func(tf_config: BaseTerraformConfig, cluster_config: ClusterConfig):
        if "nodes" in nodes_data:
            return nodes_data["nodes"]

        nodes_data["configs"] = cluster_config, tf_config

        net_asset = LibvirtNetworkAssets()
        tf_config.net_asset = net_asset.get()
        nodes_data["net_asset"] = net_asset

        controller = TerraformController(tf_config, entity_config=cluster_config)
        nodes = Nodes(controller)
        nodes_data["nodes"] = nodes

        interfaces = self.nat_interfaces(tf_config)
        nat = NatController(interfaces, NatController.get_namespace_index(interfaces[0]))
        nat.add_nat_rules()
        nodes_data["nat"] = nat

        return nodes

    yield get_nodes_func

    _nodes: Nodes = nodes_data.get("nodes")
    _cluster_config, _tf_config = nodes_data.get("configs", (None, None))
    _nat: NatController = nodes_data.get("nat")
    _net_asset: LibvirtNetworkAssets = nodes_data.get("net_asset")

    try:
        if _nodes and global_variables.test_teardown:
            log.info("--- TEARDOWN --- node controller\n")
            _nodes.destroy_all_nodes()
            log.info(f"--- TEARDOWN --- deleting iso file from: {_cluster_config.iso_download_path}\n")
            utils.run_command(f"rm -f {_cluster_config.iso_download_path}", shell=True)
            self.teardown_nat(_nat)
            self.delete_dnsmasq_conf_file(cluster_name=_cluster_config.cluster_name)
    finally:
        if _net_asset:
            _net_asset.release_all()
def extract_installer(release_image: str, dest: str):
    """
    Extracts the installer binary from the release image.

    Args:
        release_image: The release image to extract the installer from.
        dest: The destination to extract the installer to.
    """
    logging.info("Extracting installer from %s to %s", release_image, dest)
    with utils.pull_secret_file() as pull_secret:
        utils.run_command(
            f"oc adm release extract --registry-config '{pull_secret}'"
            f" --command=openshift-install --to={dest} {release_image}"
        )
def prepare_nodes(self, nodes: Nodes, cluster_configuration: ClusterConfig) -> Nodes:
    try:
        nodes.prepare_nodes()
        yield nodes
    finally:
        if global_variables.test_teardown:
            log.info("--- TEARDOWN --- node controller\n")
            nodes.destroy_all_nodes()
            log.info(f"--- TEARDOWN --- deleting iso file from: {cluster_configuration.iso_download_path}\n")
            utils.run_command(f"rm -f {cluster_configuration.iso_download_path}", shell=True)
def prepare_infraenv_nodes(self, infraenv_nodes: Nodes, infra_env_configuration: InfraEnvConfig) -> Nodes:
    try:
        yield infraenv_nodes
    finally:
        if global_variables.test_teardown:
            log.info("--- TEARDOWN --- node controller\n")
            infraenv_nodes.destroy_all_nodes()
            log.info(f"--- TEARDOWN --- deleting iso file from: {infra_env_configuration.iso_download_path}\n")
            utils.run_command(f"rm -f {infra_env_configuration.iso_download_path}", shell=True)
def installer_gather(ip, ssh_key, out_dir):
    stdout, stderr, _ret = utils.run_command(
        f"{INSTALLER_BINARY} gather bootstrap --log-level debug --bootstrap {ip} --master {ip} --key {ssh_key}"
    )

    with open(INSTALLER_GATHER_DEBUG_STDOUT, "w") as f:
        f.write(stdout)

    with open(INSTALLER_GATHER_DEBUG_STDERR, "w") as f:
        f.write(stderr)

    matches = re.compile(r'.*logs captured here "(.*)".*').findall(stderr)

    if len(matches) == 0:
        logging.warning(f"It seems like installer-gather didn't generate any bundles, stderr: {stderr}")
        return

    bundle_file_path, *_ = matches

    logging.info(f"Found installer-gather bundle at path {bundle_file_path}")

    utils.run_command_with_output(f"tar -xzf {bundle_file_path} -C {out_dir}")
    if os.path.exists(bundle_file_path):
        os.remove(bundle_file_path)
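# Illustrative sketch (not part of the original module): how the regex above pulls the
# bundle path out of installer-gather's stderr. The stderr text and path below are made
# up for demonstration only.
def _example_find_gather_bundle():
    import re

    stderr = 'INFO Bootstrap gather logs captured here "/tmp/log-bundle-20210101.tar.gz"'
    matches = re.compile(r'.*logs captured here "(.*)".*').findall(stderr)
    return matches[0] if matches else None  # -> "/tmp/log-bundle-20210101.tar.gz"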
def _does_rule_exist(self) -> bool:
    check_rule = self._build_command_string(IpTableCommandOption.CHECK)
    _, _, exit_code = utils.run_command(check_rule, shell=True, raise_errors=False)
    return exit_code == 0
def __init__(self, config: BaseNodeConfig, entity_config: Union[BaseClusterConfig, BaseInfraEnvConfig]):
    super().__init__(config, entity_config)
    self.libvirt_connection: libvirt.virConnect = libvirt.open("qemu:///system")
    self.private_ssh_key_path: Path = config.private_ssh_key_path
    self._setup_timestamp: str = utils.run_command('date +"%Y-%m-%d %T"')[0]
def _collect_virsh_logs(cls, nodes: Nodes, log_dir_name):
    log.info("Collecting virsh logs\n")
    os.makedirs(log_dir_name, exist_ok=True)
    virsh_log_path = os.path.join(log_dir_name, "libvirt_logs")
    os.makedirs(virsh_log_path, exist_ok=False)

    libvirt_list_path = os.path.join(virsh_log_path, "virsh_list")
    utils.run_command(f"virsh list --all >> {libvirt_list_path}", shell=True)

    libvirt_net_list_path = os.path.join(virsh_log_path, "virsh_net_list")
    utils.run_command(f"virsh net-list --all >> {libvirt_net_list_path}", shell=True)

    network_name = nodes.get_cluster_network()
    virsh_leases_path = os.path.join(virsh_log_path, "net_dhcp_leases")
    utils.run_command(f"virsh net-dhcp-leases {network_name} >> {virsh_leases_path}", shell=True)

    messages_log_path = os.path.join(virsh_log_path, "messages.log")
    try:
        shutil.copy("/var/log/messages", messages_log_path)
    except FileNotFoundError:
        log.warning("Failed to copy /var/log/messages, file does not exist")

    qemu_libvirt_path = os.path.join(virsh_log_path, "qemu_libvirt_logs")
    os.makedirs(qemu_libvirt_path, exist_ok=False)
    for node in nodes:
        try:
            shutil.copy(f"/var/log/libvirt/qemu/{node.name}.log", f"{qemu_libvirt_path}/{node.name}-qemu.log")
        except FileNotFoundError:
            log.warning(f"Failed to copy {node.name} qemu log, file does not exist")

    console_log_path = os.path.join(virsh_log_path, "console_logs")
    os.makedirs(console_log_path, exist_ok=False)
    for node in nodes:
        try:
            shutil.copy(
                f"/var/log/libvirt/qemu/{node.name}-console.log",
                f"{console_log_path}/{node.name}-console.log",
            )
        except FileNotFoundError:
            log.warning(f"Failed to copy {node.name} console log, file does not exist")

    libvirtd_log_path = os.path.join(virsh_log_path, "libvirtd_journal")
    utils.run_command(
        f'journalctl --since "{nodes.setup_time}" '
        f"-u libvirtd -D /run/log/journal >> {libvirtd_log_path}",
        shell=True,
    )
def format_disk(cls, disk_path): log.info("Formatting disk %s", disk_path) if not os.path.exists(disk_path): log.info("Path to %s disk not exists. Skipping", disk_path) return command = f"qemu-img info {disk_path} --output json" output, _, _ = utils.run_command(command, shell=True) image_size = json.loads(output)["virtual-size"] cls.create_disk(disk_path, image_size)
def download_logs_kube_api(api_client: ApiClient, cluster_name: str, namespace: str, dest: str,
                           must_gather: bool, management_kubeconfig: str):
    cluster_deployment = ClusterDeployment(
        kube_api_client=api_client,
        name=cluster_name,
        namespace=namespace,
    )

    agent_cluster_install = AgentClusterInstall(
        kube_api_client=api_client,
        name=cluster_deployment.get()["spec"]["clusterInstallRef"]["name"],
        namespace=namespace,
    )

    output_folder = os.path.join(dest, f"{cluster_name}")
    recreate_folder(output_folder)

    try:
        with SuppressAndLog(requests.exceptions.RequestException, ConnectionError):
            collect_debug_info_from_cluster(cluster_deployment, agent_cluster_install, output_folder)

        if must_gather:
            recreate_folder(os.path.join(output_folder, "must-gather"))
            with SuppressAndLog(Exception):
                # in case of hypershift
                if namespace.startswith("clusters"):
                    log.info("Dumping hypershift files")
                    hypershift = HyperShift(name=cluster_name)
                    hypershift.dump(os.path.join(output_folder, "dump"), management_kubeconfig)
                else:
                    _must_gather_kube_api(cluster_name, cluster_deployment, agent_cluster_install, output_folder)
    finally:
        run_command(f"chmod -R ugo+rx '{output_folder}'")
def format_disk(cls, disk_path): log.info("Formatting disk %s", disk_path) if not os.path.exists(disk_path): log.info("Path to %s disk not exists. Skipping", disk_path) return command = f"qemu-img info {disk_path} | grep 'virtual size'" output = utils.run_command(command, shell=True) image_size = output[0].split(" ")[2] # Fix for libvirt 6.0.0 if image_size.isdigit(): image_size += "G" cls.create_disk(disk_path, image_size)
def download_must_gather(kubeconfig: str, dest_dir: str):
    must_gather_dir = f"{dest_dir}/must-gather-dir"
    os.mkdir(must_gather_dir)

    log.info(f"Downloading must-gather to {must_gather_dir}, kubeconfig {kubeconfig}")
    command = (
        f"oc --insecure-skip-tls-verify --kubeconfig={kubeconfig} adm must-gather"
        f" --dest-dir {must_gather_dir} > {must_gather_dir}/must-gather.log"
    )
    try:
        run_command(command, shell=True, raise_errors=True)
    except RuntimeError as ex:
        log.warning(f"Failed to run must gather: {ex}")

    log.debug("Archiving %s...", must_gather_dir)
    with tarfile.open(f"{dest_dir}/must-gather.tar", "w:gz") as tar:
        tar.add(must_gather_dir, arcname=os.path.sep)

    log.debug("Removing must-gather directory %s after we archived it", must_gather_dir)
    shutil.rmtree(must_gather_dir)
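# Illustrative sketch (not part of the original module): the same archive-then-remove
# pattern used above, on a hypothetical scratch directory. Note that mode "w:gz"
# produces a gzip-compressed archive regardless of the ".tar" file name.
def _example_archive_and_remove(src_dir: str, dest_tar: str):
    import os
    import shutil
    import tarfile

    with tarfile.open(dest_tar, "w:gz") as tar:
        tar.add(src_dir, arcname=os.path.sep)  # store contents relative to the archive root
    shutil.rmtree(src_dir)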
def add_interface(self, node_name, network_name, target_interface):
    """
    Create an interface using the given network name and return the created interface's MAC address.

    Note: Do not use the same network for different tests.
    """
    log.info(f"Creating new interface attached to network: {network_name}, for node: {node_name}")
    net_leases = self.list_leases(network_name)
    mac_addresses = []
    for lease in net_leases:
        mac_addresses.append(lease["mac"])

    command = f"virsh attach-interface {node_name} network {network_name} --target {target_interface} --persistent"
    utils.run_command(command)

    try:
        waiting.wait(
            lambda: len(self.list_leases(network_name)) > len(mac_addresses),
            timeout_seconds=30,
            sleep_seconds=2,
            waiting_for="Wait for network lease",
        )
    except waiting.exceptions.TimeoutExpired:
        log.error("Network lease wasn't found for the added interface")
        raise

    mac_address = ""
    new_net_leases = self.list_leases(network_name)
    for lease in new_net_leases:
        if lease["mac"] not in mac_addresses:
            mac_address = lease["mac"]
            break

    log.info(
        f"Successfully attached interface, network: {network_name}, mac: {mac_address}, for node:"
        f" {node_name}"
    )
    return mac_address
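# Illustrative sketch (not part of the original module): how the new interface's MAC is
# identified above, by diffing the DHCP leases seen before and after attaching. The lease
# dicts below are made-up examples of the expected shape.
def _example_find_new_mac():
    leases_before = [{"mac": "52:54:00:aa:aa:aa"}, {"mac": "52:54:00:bb:bb:bb"}]
    leases_after = leases_before + [{"mac": "52:54:00:cc:cc:cc"}]

    known_macs = [lease["mac"] for lease in leases_before]
    for lease in leases_after:
        if lease["mac"] not in known_macs:
            return lease["mac"]  # -> "52:54:00:cc:cc:cc"
    return ""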
def extract_version(release_image):
    """
    Extracts the version number from the release image.

    Args:
        release_image: The release image to extract the version from.
    """
    logging.info(f"Extracting version number from {release_image}")
    with utils.pull_secret_file() as pull_secret:
        stdout, _, _ = utils.run_command(
            f"oc adm release info --registry-config '{pull_secret}' '{release_image}' -ojson"
        )

    ocp_full_version = json.loads(stdout).get("metadata", {}).get("version", "")
    ocp_semver = semver.VersionInfo.parse(ocp_full_version)
    ocp_version = f"{ocp_semver.major}.{ocp_semver.minor}"
    return ocp_version
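# Illustrative sketch (not part of the original module): how the semver-based truncation
# above reduces a full version string to major.minor. The value "4.10.12" is a made-up
# example, not taken from any real release metadata.
def _example_extract_major_minor():
    import semver

    full_version = "4.10.12"  # hypothetical full version string
    parsed = semver.VersionInfo.parse(full_version)
    return f"{parsed.major}.{parsed.minor}"  # -> "4.10"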
def extract_rhcos_url_from_ocp_installer(installer_binary_path: str):
    """
    Extracts the RHCOS download URL from the installer binary.

    Args:
        installer_binary_path: The path to the installer binary.
    """
    logging.info(f"Extracting RHCOS URL from {installer_binary_path}")
    stdout, _, _ = utils.run_command(f"'{installer_binary_path}' coreos print-stream-json")

    jsonpath = "architectures.x86_64.artifacts.metal.formats.iso.disk.location"

    current_node = json.loads(stdout)
    for element in jsonpath.split("."):
        current_node = current_node.get(element, {})
        if current_node == {}:
            raise ValueError(f"Could not extract RHCOS URL from {installer_binary_path}, malformed JSON")

    logging.info(f"Extracted RHCOS URL: {current_node}")
    return current_node
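# Illustrative sketch (not part of the original module): the dotted-path walk used above,
# applied to a minimal made-up stream document. Only the nesting structure matters here;
# the URL is a placeholder, not a real artifact location.
def _example_dotted_path_lookup():
    stream = {
        "architectures": {
            "x86_64": {
                "artifacts": {
                    "metal": {
                        "formats": {
                            "iso": {"disk": {"location": "https://example.com/rhcos-live.iso"}}
                        }
                    }
                }
            }
        }
    }
    node = stream
    for element in "architectures.x86_64.artifacts.metal.formats.iso.disk.location".split("."):
        node = node.get(element, {})
    return node  # -> "https://example.com/rhcos-live.iso"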
def delete(self) -> None:
    if self._does_rule_exist():
        delete_rule = self._build_command_string(IpTableCommandOption.DELETE)
        log.info(f"Removing iptable rule: {delete_rule}")
        utils.run_command(delete_rule, shell=True)
def day2_nodes_flow(client, terraform_cluster_dir_prefix, tf_folder, cluster, has_ipv_6, num_worker_nodes,
                    install_cluster_flag, day2_type_flag, with_static_network_config, base_cluster_name):
    tf_network_name, total_num_nodes = get_network_num_nodes_from_tf(tf_folder)

    # Running twice as a workaround for an issue with terraform not spawning a new node on first apply.
    for _ in range(2):
        with utils.file_lock_context():
            utils.run_command(
                f"make _apply_terraform CLUSTER_NAME={terraform_cluster_dir_prefix} PLATFORM={consts.Platforms.BARE_METAL}"
            )
    time.sleep(5)

    if day2_type_flag == "ocp":
        num_nodes_to_wait = total_num_nodes
        installed_status = consts.NodesStatus.INSTALLED
    else:
        num_nodes_to_wait = num_worker_nodes
        installed_status = consts.NodesStatus.DAY2_INSTALLED

    wait_till_nodes_are_ready(nodes_count=num_nodes_to_wait, network_name=tf_network_name)
    waiting.wait(
        lambda: are_libvirt_nodes_in_cluster_hosts(client, cluster.id, num_nodes_to_wait),
        timeout_seconds=consts.NODES_REGISTERED_TIMEOUT,
        sleep_seconds=10,
        waiting_for="Nodes to be registered in inventory service",
    )

    set_nodes_hostnames_if_needed(client, tf_folder, with_static_network_config, has_ipv_6, tf_network_name,
                                  cluster.id)

    wait_till_all_hosts_are_in_status(
        client=client,
        cluster_id=cluster.id,
        nodes_count=num_worker_nodes,
        statuses=[consts.NodesStatus.KNOWN],
        interval=30,
    )

    if install_cluster_flag:
        log.info("Start installing all known nodes in the cluster %s", cluster.id)
        kubeconfig = utils.get_kubeconfig_path(base_cluster_name)
        ocp_orig_ready_nodes = get_ocp_cluster_ready_nodes_num(kubeconfig)
        hosts = client.get_cluster_hosts(cluster.id)
        for host in hosts:
            if host["status"] == "known":
                client.install_day2_host(cluster.id, host["id"])

        log.info(
            "Start waiting until all nodes of cluster %s have been installed (reached added-to-existing-cluster state)",
            cluster.id,
        )
        wait_till_all_hosts_are_in_status(
            client=client,
            cluster_id=cluster.id,
            nodes_count=num_nodes_to_wait,
            statuses=[installed_status],
            interval=30,
        )

        log.info("Start waiting until installed nodes have actually been added to the OCP cluster")
        waiting.wait(
            lambda: wait_nodes_join_ocp_cluster(ocp_orig_ready_nodes, num_worker_nodes, day2_type_flag, kubeconfig),
            timeout_seconds=consts.NODES_REGISTERED_TIMEOUT,
            sleep_seconds=30,
            waiting_for="Day2 nodes to be added to OCP cluster",
            expected_exceptions=Exception,
        )
        log.info("%d worker nodes were successfully added to OCP cluster", num_worker_nodes)
def insert(self) -> None:
    if not self._does_rule_exist():
        insert_rule = self._build_command_string(IpTableCommandOption.INSERT)
        log.info(f"Setting iptable rule: {insert_rule}")
        utils.run_command(insert_rule, shell=True)
def _build_server_image(self):
    log.info(f"Creating Image for iPXE Server {self._name}")
    build_flags = f"--build-arg SERVER_IP={self._ip} --build-arg SERVER_PORT={self._port}"
    utils.run_command(f"podman-remote build {self._dir}/server -t {self._name} {build_flags}")
def undefine_interface(self, node_name, mac):
    log.info(f"Undefining an interface mac: {mac}, for node: {node_name}")
    command = f"virsh detach-interface {node_name} --type network --mac {mac}"
    utils.run_command(command, True)
    log.info("Successfully removed interface.")
def download_logs(
    client: InventoryClient,
    cluster: dict,
    dest: str,
    must_gather: bool,
    update_by_events: bool = False,
    retry_interval: int = RETRY_INTERVAL,
):
    if "hosts" not in cluster or len(cluster["hosts"]) == 0:
        cluster["hosts"] = client.get_cluster_hosts(cluster_id=cluster["id"])

    output_folder = get_logs_output_folder(dest, cluster)
    if not is_update_needed(output_folder, update_by_events, client, cluster):
        log.info(f"Skipping, no need to update {output_folder}.")
        return

    recreate_folder(output_folder)
    recreate_folder(os.path.join(output_folder, "cluster_files"))

    try:
        write_metadata_file(client, cluster, os.path.join(output_folder, "metadata.json"))

        with SuppressAndLog(requests.exceptions.RequestException, ConnectionError, KeyboardInterrupt):
            client.download_metrics(os.path.join(output_folder, "metrics.txt"))

        for cluster_file in (
            "bootstrap.ign",
            "master.ign",
            "worker.ign",
            "install-config.yaml",
        ):
            with SuppressAndLog(assisted_service_client.rest.ApiException, KeyboardInterrupt):
                client.download_and_save_file(
                    cluster["id"], cluster_file, os.path.join(output_folder, "cluster_files", cluster_file)
                )

        with SuppressAndLog(assisted_service_client.rest.ApiException, KeyboardInterrupt):
            download_manifests(client, cluster["id"], output_folder)

        infra_env_list = set()
        for host_id, infra_env_id in map(lambda host: (host["id"], host["infra_env_id"]), cluster["hosts"]):
            with SuppressAndLog(assisted_service_client.rest.ApiException, KeyboardInterrupt):
                client.download_host_ignition(infra_env_id, host_id, os.path.join(output_folder, "cluster_files"))
            if infra_env_id not in infra_env_list:
                infra_env_list.add(infra_env_id)
                with SuppressAndLog(assisted_service_client.rest.ApiException, KeyboardInterrupt):
                    client.download_infraenv_events(
                        infra_env_id, get_infraenv_events_path(infra_env_id, output_folder)
                    )

        with SuppressAndLog(assisted_service_client.rest.ApiException, KeyboardInterrupt):
            client.download_cluster_events(cluster["id"], get_cluster_events_path(cluster, output_folder))
            shutil.copy2(
                os.path.join(os.path.dirname(os.path.realpath(__file__)), "events.html"), output_folder
            )

        with SuppressAndLog(assisted_service_client.rest.ApiException, KeyboardInterrupt):
            are_masters_in_configuring_state = are_host_progress_in_stage(
                cluster["hosts"], [HostsProgressStages.CONFIGURING], 2
            )
            are_masters_in_join_or_done_state = are_host_progress_in_stage(
                cluster["hosts"], [HostsProgressStages.JOINED, HostsProgressStages.DONE], 2
            )
            max_retries = MUST_GATHER_MAX_RETRIES if are_masters_in_join_or_done_state else MAX_RETRIES
            is_controller_expected = cluster["status"] == ClusterStatus.INSTALLED or are_masters_in_configuring_state
            min_number_of_logs = min_number_of_log_files(cluster, is_controller_expected)

            for i in range(max_retries):
                cluster_logs_tar = os.path.join(output_folder, f"cluster_{cluster['id']}_logs.tar")

                with suppress(FileNotFoundError):
                    os.remove(cluster_logs_tar)

                client.download_cluster_logs(cluster["id"], cluster_logs_tar)
                try:
                    verify_logs_uploaded(
                        cluster_logs_tar,
                        min_number_of_logs,
                        installation_success=(cluster["status"] == ClusterStatus.INSTALLED),
                        check_oc=are_masters_in_join_or_done_state,
                    )
                    break
                except AssertionError as ex:
                    log.warning("Cluster logs verification failed: %s", ex)

                    # Skip sleeping on last retry
                    if i < MAX_RETRIES - 1:
                        log.info(f"Going to retry in {retry_interval} seconds")
                        time.sleep(retry_interval)

        kubeconfig_path = os.path.join(output_folder, "kubeconfig-noingress")

        with SuppressAndLog(assisted_service_client.rest.ApiException):
            client.download_kubeconfig_no_ingress(cluster["id"], kubeconfig_path)

        if must_gather:
            config_etc_hosts(
                cluster["name"],
                cluster["base_dns_domain"],
                client.get_api_vip(cluster, cluster["id"]),
            )
            download_must_gather(kubeconfig_path, output_folder)

    finally:
        run_command(f"chmod -R ugo+rx '{output_folder}'")
def extract_installer(release_image, dest):
    logging.info("Extracting installer from %s to %s", release_image, dest)
    with utils.pull_secret_file() as pull_secret:
        utils.run_command(
            f"oc adm release extract --registry-config '{pull_secret}'"
            f" --command=openshift-install --to={dest} {release_image}"
        )
def set_thumbprint(self):
    exec_command = f"podman-remote exec -it {self._name} tang-show-keys {self._port}"
    self.thumbprint, _, _ = utils.run_command(exec_command, shell=True)
def start_install_and_wait_for_installed(self):
    cluster_name = self.config.day1_cluster_name

    # Running twice as a workaround for an issue with terraform not spawning a new node on first apply.
    for _ in range(2):
        with utils.file_lock_context():
            utils.run_command(
                f"make _apply_terraform CLUSTER_NAME={cluster_name} PLATFORM={consts.Platforms.BARE_METAL}"
            )
    time.sleep(5)

    num_nodes_to_wait = self.config.day2_workers_count
    installed_status = consts.NodesStatus.DAY2_INSTALLED

    tfvars = utils.get_tfvars(self.config.tf_folder)
    tf_network_name = tfvars["libvirt_network_name"]

    config = TerraformConfig()
    config.nodes_count = num_nodes_to_wait
    libvirt_controller = LibvirtController(config=config, entity_config=ClusterConfig())
    libvirt_controller.wait_till_nodes_are_ready(network_name=tf_network_name)

    # Wait for day2 nodes
    waiting.wait(
        lambda: self.are_libvirt_nodes_in_cluster_hosts(),
        timeout_seconds=consts.NODES_REGISTERED_TIMEOUT,
        sleep_seconds=10,
        waiting_for="Nodes to be registered in inventory service",
    )
    self.set_nodes_hostnames_if_needed(tf_network_name)
    wait_till_all_hosts_are_in_status(
        client=self.api_client,
        cluster_id=self.config.cluster_id,
        nodes_count=self.config.day2_workers_count,
        statuses=[consts.NodesStatus.KNOWN],
        interval=30,
    )

    # Start day2 nodes installation
    log.info("Start installing all known nodes in the cluster %s", self.config.cluster_id)
    kubeconfig = utils.get_kubeconfig_path(self.config.day1_cluster_name)
    ocp_ready_nodes = self.get_ocp_cluster_ready_nodes_num(kubeconfig)
    hosts = self.api_client.get_cluster_hosts(self.config.cluster_id)
    for host in hosts:
        if host["status"] == "known":
            self.api_client.install_day2_host(self.config.infra_env_id, host["id"])

    log.info(
        "Waiting until all nodes of cluster %s have been installed (reached added-to-existing-cluster)",
        self.config.cluster_id,
    )
    wait_till_all_hosts_are_in_status(
        client=self.api_client,
        cluster_id=self.config.cluster_id,
        nodes_count=num_nodes_to_wait,
        statuses=[installed_status],
        interval=30,
    )

    log.info("Waiting until installed nodes have actually been added to the OCP cluster")
    waiting.wait(
        lambda: self.wait_nodes_join_ocp_cluster(ocp_ready_nodes, self.config.day2_workers_count, kubeconfig),
        timeout_seconds=consts.NODES_REGISTERED_TIMEOUT,
        sleep_seconds=30,
        waiting_for="Day2 nodes to be added to OCP cluster",
        expected_exceptions=Exception,
    )
    log.info("%d worker nodes were successfully added to OCP cluster", self.config.day2_workers_count)
def create_disk(disk_path, disk_size):
    command = f"qemu-img create -f qcow2 {disk_path} {disk_size}"
    utils.run_command(command, shell=True)
def _insert_rule(cls, rule_suffix: str) -> None:
    """Insert a new rule"""
    insert_rule = cls._build_rule_string(IpTableCommandOption.INSERT, rule_suffix)
    log.info('Adding rule "%s"', insert_rule)
    utils.run_command(insert_rule, shell=True)