def download_logs(client: InventoryClient, cluster: dict, dest: str, must_gather: bool,
                  retry_interval: int = RETRY_INTERVAL):
    """Download the diagnostic artifacts of ``cluster`` into a local folder.

    The target folder is resolved with ``get_logs_output_folder(dest, cluster)``.
    If it already exists nothing is downloaded.  Otherwise the folder is
    (re)created and filled with the metadata file, ignition files, cluster
    events, the cluster logs archive and the no-ingress kubeconfig.  Each
    download is best-effort: an ``ApiException`` raised by one step is
    suppressed so the following steps still run.

    Args:
        client: Inventory client used for all ``download_*`` calls.
        cluster: Cluster description; the keys ``id``, ``hosts``, ``status``,
            ``name`` and ``base_dns_domain`` are read here.
        dest: Base destination, forwarded to ``get_logs_output_folder``.
        must_gather: When true, ``download_must_gather`` is run after the
            kubeconfig was downloaded successfully.
        retry_interval: Seconds to sleep between two attempts to fetch a
            logs archive that passes ``verify_logs_uploaded``.
    """
    output_folder = get_logs_output_folder(dest, cluster)

    if os.path.isdir(output_folder):
        log.info(f"Skipping. The logs directory {output_folder} already exists.")
        return

    cluster_files_folder = os.path.join(output_folder, "cluster_files")
    recreate_folder(output_folder)
    recreate_folder(cluster_files_folder)

    try:
        # NOTE(review): 'metdata.json' looks like a typo of 'metadata.json'; the name is
        # kept as-is because renaming the output file would change observable behavior.
        write_metadata_file(client, cluster, os.path.join(output_folder, 'metdata.json'))

        with suppress(assisted_service_client.rest.ApiException):
            client.download_ignition_files(cluster['id'], cluster_files_folder)

        for host in cluster['hosts']:
            host_id = host['id']
            with suppress(assisted_service_client.rest.ApiException):
                client.download_host_ignition(cluster['id'], host_id, cluster_files_folder)

        with suppress(assisted_service_client.rest.ApiException):
            client.download_cluster_events(
                cluster['id'], os.path.join(output_folder, f"cluster_{cluster['id']}_events.json"))
            # The events viewer is only copied when the events download did not fail.
            shutil.copy2(
                os.path.join(os.path.dirname(os.path.realpath(__file__)), "events.html"),
                output_folder)

        with suppress(assisted_service_client.rest.ApiException):
            # These values do not change between retries, so compute them once.
            cluster_logs_tar = os.path.join(output_folder, f"cluster_{cluster['id']}_logs.tar")
            is_installed = cluster['status'] == ClusterStatus.INSTALLED
            # One entry per host, plus one extra entry once the cluster is installed.
            min_number_of_logs = len(cluster['hosts']) + 1 if is_installed else len(cluster['hosts'])

            for i in range(MAX_RETRIES):
                # Drop the archive left over from a previous (failed) attempt.
                with suppress(FileNotFoundError):
                    os.remove(cluster_logs_tar)

                client.download_cluster_logs(cluster['id'], cluster_logs_tar)

                try:
                    verify_logs_uploaded(cluster_logs_tar, min_number_of_logs, is_installed)
                    break
                except AssertionError as ex:
                    # Logger.warn is deprecated; warning() is the supported spelling.
                    log.warning(f"Cluster logs verification failed: {ex}")

                    # Skip sleeping on last retry
                    if i < MAX_RETRIES - 1:
                        log.info(f"Going to retry in {retry_interval} seconds")
                        time.sleep(retry_interval)

        kubeconfig_path = os.path.join(output_folder, "kubeconfig-noingress")

        with suppress(assisted_service_client.rest.ApiException):
            client.download_kubeconfig_no_ingress(cluster['id'], kubeconfig_path)

            # must-gather uses the kubeconfig, so it only runs if the download above succeeded.
            if must_gather:
                must_gather_folder = os.path.join(output_folder, "must-gather")
                recreate_folder(must_gather_folder)
                config_etc_hosts(cluster['name'], cluster['base_dns_domain'],
                                 helper_cluster.get_api_vip_from_cluster(client, cluster))
                download_must_gather(kubeconfig_path, must_gather_folder)
    finally:
        # Always make whatever was downloaded readable, even after a failure.
        run_command(f"chmod -R ugo+rx '{output_folder}'")
def update_oc_config(nodes, cluster):
    """Export KUBECONFIG and hand the cluster's API VIP to ``config_etc_hosts``.

    ``KUBECONFIG`` is taken from ``env_variables['kubeconfig_path']``; the API
    VIP is read from the ``'api_vip'`` entry returned by the nodes controller.
    """
    os.environ["KUBECONFIG"] = env_variables['kubeconfig_path']

    api_vip = nodes.controller.get_ingress_and_api_vips()['api_vip']

    infra_utils.config_etc_hosts(
        cluster_name=cluster.name,
        base_dns_domain=env_variables["base_domain"],
        api_vip=api_vip,
    )
def waiting_for_installation_completion(controller):
    """Wait until ``all_operators_up`` succeeds, then gather diagnostics.

    The first address of the first entry in ``controller.master_ips`` is used
    as the API address.  The gathering steps (sosreport, installer-gather and
    must-gather) live in the ``finally`` clause, so they run whether or not
    the wait succeeded.
    """
    first_master_ip = controller.master_ips[0][0]

    try:
        logging.info("Configuring /etc/hosts...")
        utils.config_etc_hosts(
            cluster_name=controller.cluster_name,
            base_dns_domain=controller.cluster_domain,
            api_vip=first_master_ip,
        )

        logging.info("Waiting for installation to complete...")
        waiting.wait(
            all_operators_up,
            sleep_seconds=20,
            timeout_seconds=60 * 60,
            waiting_for="all operators to get up",
        )
        logging.info("Installation completed successfully!")
    finally:
        logging.info("Gathering sosreport data from host...")
        gather_sosreport_data(Nodes(controller, private_ssh_key_path=SSH_KEY)[0])

        logging.info("Gathering information via installer-gather...")
        utils.recreate_folder(INSTALLER_GATHER_DIR, force_recreate=True)
        installer_gather(ip=first_master_ip, ssh_key=SSH_KEY, out_dir=INSTALLER_GATHER_DIR)

        logging.info("Gathering information via must-gather...")
        utils.recreate_folder(MUST_GATHER_DIR)
        download_must_gather(KUBE_CONFIG, MUST_GATHER_DIR)
def update_oc_config(nodes, cluster):
    """Export the cluster's kubeconfig and pass its API address to ``config_etc_hosts``.

    With exactly one master the address comes from
    ``cluster.get_ip_for_single_node`` on the primary machine CIDR; otherwise
    it is the ``'api_vip'`` entry returned by the nodes controller.
    """
    os.environ["KUBECONFIG"] = cluster.kubeconfig_path

    if nodes.masters_count != 1:
        api_vip = nodes.controller.get_ingress_and_api_vips()['api_vip']
    else:
        machine_cidr = cluster.get_primary_machine_cidr()
        api_vip = cluster.get_ip_for_single_node(cluster.api_client, cluster.id, machine_cidr)

    infra_utils.config_etc_hosts(
        cluster_name=cluster.name,
        base_dns_domain=global_variables.base_dns_domain,
        api_vip=api_vip,
    )
def waiting_for_installation_completion(controller):
    """Wait until ``all_operators_up`` succeeds, then call ``log_collection``.

    The first address of the first entry in ``controller.master_ips`` is used
    as the API address.  ``log_collection`` is called from the ``finally``
    clause, so it runs whether or not the wait succeeded.
    """
    first_master_ip = controller.master_ips[0][0]

    try:
        logging.info("Configuring /etc/hosts...")
        utils.config_etc_hosts(
            cluster_name=controller.cluster_name,
            base_dns_domain=controller.cluster_domain,
            api_vip=first_master_ip,
        )

        logging.info("Waiting for installation to complete...")
        waiting.wait(
            all_operators_up,
            sleep_seconds=20,
            timeout_seconds=60 * 60,
            waiting_for="all operators to get up",
        )
        logging.info("Installation completed successfully!")
    finally:
        log_collection(first_master_ip)
def download_logs(client: InventoryClient, cluster: dict, dest: str, must_gather: bool):
    """Download the diagnostic artifacts of ``cluster`` into a local folder.

    Nothing is downloaded when the folder returned by
    ``get_logs_output_folder(dest, cluster)`` already exists.  Otherwise the
    folder is (re)created and filled with the metadata file, ignition files,
    cluster events, the logs archive and the no-ingress kubeconfig.  An
    ``ApiException`` raised by one download is suppressed so later steps still
    run.  With ``must_gather`` set, ``download_must_gather`` runs after a
    successful kubeconfig download.  Finally the folder is chmod-ed.
    """
    output_folder = get_logs_output_folder(dest, cluster)

    if os.path.isdir(output_folder):
        log.info(
            f"Skipping. The logs directory {output_folder} already exists.")
        return

    def in_output(name):
        # Path of ``name`` directly under the output folder.
        return os.path.join(output_folder, name)

    recreate_folder(output_folder)
    recreate_folder(in_output("cluster_files"))

    write_metadata_file(client, cluster, in_output('metdata.json'))

    with suppress(assisted_service_client.rest.ApiException):
        client.download_ignition_files(cluster['id'], in_output("cluster_files"))

    with suppress(assisted_service_client.rest.ApiException):
        events_json = in_output(f"cluster_{cluster['id']}_events.json")
        client.download_cluster_events(cluster['id'], events_json)
        # Copied only when the events download above did not fail.
        script_dir = os.path.dirname(os.path.realpath(__file__))
        shutil.copy2(os.path.join(script_dir, "events.html"), output_folder)

    with suppress(assisted_service_client.rest.ApiException):
        logs_tar = in_output(f"cluster_{cluster['id']}_logs.tar")
        client.download_cluster_logs(cluster['id'], logs_tar)

    kubeconfig_path = in_output("kubeconfig-noingress")

    with suppress(assisted_service_client.rest.ApiException):
        client.download_kubeconfig_no_ingress(cluster['id'], kubeconfig_path)

        # must-gather uses the kubeconfig, so it only runs if the download succeeded.
        if must_gather:
            recreate_folder(in_output("must-gather"))
            config_etc_hosts(cluster['name'], cluster['base_dns_domain'], cluster['api_vip'])
            download_must_gather(kubeconfig_path, in_output("must-gather"))

    run_command("chmod -R ugo+rx '%s'" % output_folder)
def nodes_flow(client, cluster_name, cluster, image_path):
    """Create the nodes for ``cluster_name`` and, with a client, drive them to install.

    Steps, in order:
      1. Prepare a fresh terraform folder from ``consts.TF_TEMPLATE`` and create
         the nodes via ``create_nodes_and_wait_till_registered``.
      2. Without ``client`` stop here.
      3. Unless both API and ingress VIPs are already set on the cluster, wait
         for the hosts and set either the machine CIDR or the VIPs.
      4. Update the hosts, then wait until they are ``KNOWN``.
      5. If ``args.install_cluster`` is set, run the install flow and validate
         DNS; if ``args.wait_for_cvo`` is also set, wait for CVO.
    """
    nodes_details = _create_node_details(cluster_name)
    if cluster:
        nodes_details["cluster_inventory_id"] = cluster.id

    tf_folder = utils.get_tf_folder(cluster_name, args.namespace)
    utils.recreate_folder(tf_folder)
    copy_tree(consts.TF_TEMPLATE, tf_folder)
    tf = terraform_utils.TerraformUtils(working_dir=tf_folder)
    machine_net = MachineNetwork(
        args.ipv4,
        args.ipv6,
        args.vm_network_cidr,
        args.vm_network_cidr6,
        args.ns_index,
    )

    create_nodes_and_wait_till_registered(
        cluster_name=cluster_name,
        inventory_client=client,
        cluster=cluster,
        image_path=image_path,
        storage_path=args.storage_path,
        master_count=args.master_count,
        nodes_details=nodes_details,
        tf=tf,
        machine_net=machine_net,
    )

    # Everything below talks to the inventory service.
    if not client:
        return

    cluster_info = client.cluster_get(cluster.id)
    network_name = nodes_details["libvirt_network_name"]
    macs = utils.get_libvirt_nodes_macs(network_name)

    if cluster_info.api_vip and cluster_info.ingress_vip:
        log.info("VIPs already configured")
    else:
        utils.wait_till_hosts_with_macs_are_in_status(
            client=client,
            cluster_id=cluster.id,
            macs=macs,
            statuses=[
                consts.NodesStatus.INSUFFICIENT,
                consts.NodesStatus.PENDING_FOR_INPUT,
            ],
        )
        if args.vip_dhcp_allocation:
            set_cluster_machine_cidr(client, cluster.id, machine_net)
        else:
            set_cluster_vips(client, cluster.id, machine_net)

    # Hostnames are only pushed explicitly for hosts without an IPv4 address.
    update_hostnames = not machine_net.has_ip_v4
    if update_hostnames:
        log.warning(
            "Work around libvirt for Terrafrom not setting hostnames of IPv6-only hosts"
        )
        libvirt_nodes = _get_libvirt_nodes_from_tf_state(network_name, tf.get_state())
    else:
        libvirt_nodes = utils.get_libvirt_nodes_mac_role_ip_and_name(network_name)

    update_hosts(client, cluster.id, libvirt_nodes, update_hostnames)
    utils.wait_till_hosts_with_macs_are_in_status(
        client=client,
        cluster_id=cluster.id,
        macs=macs,
        statuses=[consts.NodesStatus.KNOWN],
    )

    if not args.install_cluster:
        return

    time.sleep(10)
    install_cluster.run_install_flow(
        client=client,
        cluster_id=cluster.id,
        kubeconfig_path=consts.DEFAULT_CLUSTER_KUBECONFIG_PATH,
        pull_secret=args.pull_secret,
        tf=tf,
    )
    # Validate DNS domains resolvability
    validate_dns(client, cluster.id)

    if not args.wait_for_cvo:
        return

    # Re-read the cluster: the values used below are taken after installation.
    cluster_info = client.cluster_get(cluster.id)
    log.info("Start waiting till CVO status is available")
    config_etc_hosts(cluster_info.name, cluster_info.base_dns_domain, cluster_info.api_vip)
    utils.wait_for_cvo_available()
def download_logs(client: InventoryClient, cluster: dict, dest: str, must_gather: bool,
                  update_by_events: bool = False, retry_interval: int = RETRY_INTERVAL,
                  pull_secret=""):
    """Download the diagnostic artifacts of ``cluster`` into a local folder.

    The target folder is resolved with ``get_logs_output_folder(dest, cluster)``.
    When ``is_update_needed`` reports that no update is required, nothing is
    downloaded.  Otherwise the folder is (re)created and filled with the
    metadata file, service metrics, cluster files, host ignitions, cluster
    events, the cluster logs archive and the no-ingress kubeconfig.  Each
    download is best-effort: the listed exceptions are passed to
    ``suppressAndLog`` so the following steps still run.

    Args:
        client: Inventory client used for all download calls.
        cluster: Cluster description; the keys ``id``, ``hosts``, ``status``,
            ``name`` and ``base_dns_domain`` are read here.  A missing or empty
            ``hosts`` entry is filled in place from ``client.get_cluster_hosts``.
        dest: Base destination, forwarded to ``get_logs_output_folder``.
        must_gather: When true, ``download_must_gather`` is run after the
            kubeconfig was downloaded successfully.
        update_by_events: Forwarded to ``is_update_needed``.
        retry_interval: Seconds to sleep between two attempts to fetch a
            logs archive that passes ``verify_logs_uploaded``.
        pull_secret: Forwarded to ``helper_cluster.get_api_vip_from_cluster``.
    """
    if not cluster.get("hosts"):
        cluster["hosts"] = client.get_cluster_hosts(cluster_id=cluster["id"])

    output_folder = get_logs_output_folder(dest, cluster)
    if not is_update_needed(output_folder, update_by_events, client, cluster):
        log.info(f"Skipping, no need to update {output_folder}.")
        return

    cluster_files_folder = os.path.join(output_folder, "cluster_files")
    recreate_folder(output_folder)
    recreate_folder(cluster_files_folder)

    try:
        write_metadata_file(client, cluster, os.path.join(output_folder, 'metadata.json'))

        with suppressAndLog(AssertionError, ConnectionError, requests.exceptions.ConnectionError):
            client.download_metrics(os.path.join(output_folder, "metrics.txt"))

        for cluster_file in ("bootstrap.ign", "master.ign", "worker.ign", "install-config.yaml"):
            with suppressAndLog(assisted_service_client.rest.ApiException):
                client.download_and_save_file(
                    cluster['id'], cluster_file, os.path.join(cluster_files_folder, cluster_file))

        for host in cluster['hosts']:
            host_id = host['id']
            with suppressAndLog(assisted_service_client.rest.ApiException):
                client.download_host_ignition(cluster['id'], host_id, cluster_files_folder)

        with suppressAndLog(assisted_service_client.rest.ApiException):
            client.download_cluster_events(cluster['id'], get_cluster_events_path(cluster, output_folder))
            # The events viewer is only copied when the events download did not fail.
            shutil.copy2(
                os.path.join(os.path.dirname(os.path.realpath(__file__)), "events.html"),
                output_folder)

        with suppressAndLog(assisted_service_client.rest.ApiException):
            are_masters_in_configuring_state = are_host_progress_in_stage(
                cluster['hosts'], [HostsProgressStages.CONFIGURING], 2)
            are_masters_in_join_state = are_host_progress_in_stage(
                cluster['hosts'], [HostsProgressStages.JOINED], 2)
            max_retries = MUST_GATHER_MAX_RETRIES if are_masters_in_join_state else MAX_RETRIES
            is_installed = cluster['status'] == ClusterStatus.INSTALLED
            is_controller_expected = is_installed or are_masters_in_configuring_state
            min_number_of_logs = min_number_of_log_files(cluster, is_controller_expected)
            # The archive path does not change between retries, so compute it once.
            cluster_logs_tar = os.path.join(output_folder, f"cluster_{cluster['id']}_logs.tar")

            for i in range(max_retries):
                # Drop the archive left over from a previous (failed) attempt.
                with suppress(FileNotFoundError):
                    os.remove(cluster_logs_tar)

                client.download_cluster_logs(cluster['id'], cluster_logs_tar)
                try:
                    verify_logs_uploaded(
                        cluster_logs_tar, min_number_of_logs,
                        installation_success=is_installed,
                        check_oc=are_masters_in_join_state)
                    break
                except AssertionError as ex:
                    # Logger.warn is deprecated; warning() is the supported spelling.
                    log.warning(f"Cluster logs verification failed: {ex}")

                    # Skip sleeping on last retry.  The bound must be the loop's own
                    # ``max_retries`` (not the MAX_RETRIES constant), otherwise the
                    # check is wrong whenever MUST_GATHER_MAX_RETRIES is in effect.
                    if i < max_retries - 1:
                        log.info(f"Going to retry in {retry_interval} seconds")
                        time.sleep(retry_interval)

        kubeconfig_path = os.path.join(output_folder, "kubeconfig-noingress")

        with suppressAndLog(assisted_service_client.rest.ApiException):
            client.download_kubeconfig_no_ingress(cluster['id'], kubeconfig_path)

            # must-gather uses the kubeconfig, so it only runs if the download above succeeded.
            if must_gather:
                must_gather_folder = os.path.join(output_folder, "must-gather")
                recreate_folder(must_gather_folder)
                config_etc_hosts(
                    cluster['name'], cluster['base_dns_domain'],
                    helper_cluster.get_api_vip_from_cluster(client, cluster, pull_secret))
                download_must_gather(kubeconfig_path, must_gather_folder)
    finally:
        # Always make whatever was downloaded readable, even after a failure.
        run_command(f"chmod -R ugo+rx '{output_folder}'")
def nodes_flow(client, cluster_name, cluster):
    """Create the nodes for ``cluster_name`` and, with a client, drive them to install.

    Steps, in order:
      1. Load the tfvars of the cluster's terraform folder (storing the cluster
         id in them when ``cluster`` is given) and create the nodes via
         ``create_nodes_and_wait_till_registered``.
      2. Without ``client`` stop here.
      3. Unless both API and ingress VIPs are already set on the cluster, wait
         for the hosts and then set the single-node IP, the machine CIDR or
         the VIPs, depending on ``args``.
      4. Set host roles, then wait until the hosts are ``KNOWN``.
      5. If ``args.install_cluster`` is set, run the install flow and validate
         DNS; if ``args.wait_for_cvo`` is also set, wait for CVO.
    """
    tf_folder = utils.get_tf_folder(cluster_name, args.namespace)
    nodes_details = utils.get_tfvars(tf_folder)
    if cluster:
        nodes_details["cluster_inventory_id"] = cluster.id
        utils.set_tfvars(tf_folder, nodes_details)

    tf = terraform_utils.TerraformUtils(working_dir=tf_folder)
    machine_net = MachineNetwork(
        args.ipv4,
        args.ipv6,
        args.vm_network_cidr,
        args.vm_network_cidr6,
        args.ns_index,
    )

    create_nodes_and_wait_till_registered(
        inventory_client=client,
        cluster=cluster,
        nodes_details=nodes_details,
        tf=tf,
    )

    # Everything below talks to the inventory service.
    if not client:
        return

    cluster_info = client.cluster_get(cluster.id)
    macs = utils.get_libvirt_nodes_macs(nodes_details["libvirt_network_name"])

    if cluster_info.api_vip and cluster_info.ingress_vip:
        log.info("VIPs already configured")
    else:
        utils.wait_till_hosts_with_macs_are_in_status(
            client=client,
            cluster_id=cluster.id,
            macs=macs,
            statuses=[
                consts.NodesStatus.INSUFFICIENT,
                consts.NodesStatus.PENDING_FOR_INPUT,
                consts.NodesStatus.KNOWN,
            ],
        )

        if args.master_count == 1:
            # IPv4 is used when it is available or when IPv6 is not.
            is_ip4 = machine_net.has_ip_v4 or not machine_net.has_ip_v6
            if is_ip4:
                cidr = args.vm_network_cidr
            else:
                cidr = args.vm_network_cidr6
            tf.change_variables({
                "single_node_ip": helper_cluster.Cluster.get_ip_for_single_node(
                    client, cluster.id, cidr, ipv4_first=is_ip4),
            })
        elif args.vip_dhcp_allocation:
            set_cluster_machine_cidr(client, cluster.id, machine_net)
        else:
            set_cluster_vips(client, cluster.id, machine_net)

    set_hosts_roles(
        client, cluster, nodes_details, machine_net, tf,
        args.master_count, args.with_static_ips)
    utils.wait_till_hosts_with_macs_are_in_status(
        client=client,
        cluster_id=cluster.id,
        macs=macs,
        statuses=[consts.NodesStatus.KNOWN],
    )

    if not args.install_cluster:
        return

    time.sleep(10)
    install_cluster.run_install_flow(
        client=client,
        cluster_id=cluster.id,
        kubeconfig_path=consts.DEFAULT_CLUSTER_KUBECONFIG_PATH,
        pull_secret=args.pull_secret,
        tf=tf,
    )
    # Validate DNS domains resolvability
    validate_dns(client, cluster.id)

    if not args.wait_for_cvo:
        return

    # Re-read the cluster: the values used below are taken after installation.
    cluster_info = client.cluster_get(cluster.id)
    log.info("Start waiting till CVO status is available")
    api_vip = helper_cluster.get_api_vip_from_cluster(client, cluster_info)
    config_etc_hosts(cluster_info.name, cluster_info.base_dns_domain, api_vip)
    utils.wait_for_cvo_available()