Example #1
0
def download_logs(client: InventoryClient, cluster: dict, dest: str,
                  must_gather: bool, retry_interval: int = RETRY_INTERVAL):
    """Download the available log artifacts of ``cluster`` into a folder under ``dest``.

    The function is a no-op when the output folder already exists. Each
    download step is best effort: an ``ApiException`` raised by one step is
    suppressed so the remaining steps still run.

    Args:
        client: Inventory client used for every download call.
        cluster: Cluster description; the keys ``id``, ``hosts``, ``status``,
            ``name`` and ``base_dns_domain`` are read here.
        dest: Base directory handed to ``get_logs_output_folder``.
        must_gather: When true, also run must-gather using the downloaded
            no-ingress kubeconfig.
        retry_interval: Seconds to sleep between attempts to download and
            verify the cluster logs archive.
    """
    output_folder = get_logs_output_folder(dest, cluster)

    if os.path.isdir(output_folder):
        log.info(f"Skipping. The logs directory {output_folder} already exists.")
        return

    cluster_files_folder = os.path.join(output_folder, "cluster_files")
    recreate_folder(output_folder)
    recreate_folder(cluster_files_folder)

    try:
        # NOTE(review): the 'metdata.json' spelling is kept on purpose; existing
        # consumers of the logs folder may look the file up by this exact name.
        write_metadata_file(client, cluster, os.path.join(output_folder, 'metdata.json'))

        cluster_id = cluster['id']

        with suppress(assisted_service_client.rest.ApiException):
            client.download_ignition_files(cluster_id, cluster_files_folder)

        for host in cluster['hosts']:
            # Suppress per host so one failing host does not stop the others.
            with suppress(assisted_service_client.rest.ApiException):
                client.download_host_ignition(cluster_id, host['id'], cluster_files_folder)

        with suppress(assisted_service_client.rest.ApiException):
            client.download_cluster_events(cluster_id, os.path.join(output_folder, f"cluster_{cluster_id}_events.json"))
            shutil.copy2(os.path.join(os.path.dirname(os.path.realpath(__file__)), "events.html"), output_folder)

        with suppress(assisted_service_client.rest.ApiException):
            # These values do not change between attempts, so compute them once.
            cluster_logs_tar = os.path.join(output_folder, f"cluster_{cluster_id}_logs.tar")
            is_installed = cluster['status'] == ClusterStatus.INSTALLED
            # One log per host, plus one extra when the cluster is installed.
            min_number_of_logs = len(cluster['hosts']) + 1 if is_installed else len(cluster['hosts'])

            for i in range(MAX_RETRIES):
                # Drop any archive left over from a previous attempt.
                with suppress(FileNotFoundError):
                    os.remove(cluster_logs_tar)

                client.download_cluster_logs(cluster_id, cluster_logs_tar)

                try:
                    verify_logs_uploaded(cluster_logs_tar, min_number_of_logs, is_installed)
                    break
                except AssertionError as ex:
                    log.warning(f"Cluster logs verification failed: {ex}")

                    # Skip sleeping on last retry
                    if i < MAX_RETRIES - 1:
                        log.info(f"Going to retry in {retry_interval} seconds")
                        time.sleep(retry_interval)

        kubeconfig_path = os.path.join(output_folder, "kubeconfig-noingress")

        with suppress(assisted_service_client.rest.ApiException):
            client.download_kubeconfig_no_ingress(cluster_id, kubeconfig_path)

            # must-gather runs only if the kubeconfig download above succeeded.
            if must_gather:
                must_gather_folder = os.path.join(output_folder, "must-gather")
                recreate_folder(must_gather_folder)
                config_etc_hosts(cluster['name'], cluster['base_dns_domain'],
                                 helper_cluster.get_api_vip_from_cluster(client, cluster))
                download_must_gather(kubeconfig_path, must_gather_folder)
    finally:
        # Runs even when a step fails, so whatever was collected gets the mode change.
        run_command(f"chmod -R ugo+rx '{output_folder}'")
 def update_oc_config(nodes, cluster):
     """Export KUBECONFIG and register the cluster API VIP via ``config_etc_hosts``.

     The kubeconfig path and base domain are read from ``env_variables``; the
     API VIP is taken from the nodes controller.
     """
     os.environ["KUBECONFIG"] = env_variables['kubeconfig_path']
     api_vip = nodes.controller.get_ingress_and_api_vips()['api_vip']
     infra_utils.config_etc_hosts(
         cluster_name=cluster.name,
         base_dns_domain=env_variables["base_domain"],
         api_vip=api_vip,
     )
Example #3
0
def waiting_for_installation_completion(controller):
    """Wait until all operators are up, then gather diagnostics.

    The diagnostics steps live in the ``finally`` clause, so they are
    attempted whether or not the wait succeeded.
    """
    # First entry of the first item in controller.master_ips.
    first_master_ip = controller.master_ips[0][0]

    try:
        logging.info("Configuring /etc/hosts...")
        utils.config_etc_hosts(
            cluster_name=controller.cluster_name,
            base_dns_domain=controller.cluster_domain,
            api_vip=first_master_ip,
        )

        logging.info("Waiting for installation to complete...")
        one_hour_in_seconds = 60 * 60
        waiting.wait(
            all_operators_up,
            sleep_seconds=20,
            timeout_seconds=one_hour_in_seconds,
            waiting_for="all operators to get up",
        )
        logging.info("Installation completed successfully!")

    finally:
        logging.info("Gathering sosreport data from host...")
        all_nodes = Nodes(controller, private_ssh_key_path=SSH_KEY)
        gather_sosreport_data(all_nodes[0])

        logging.info("Gathering information via installer-gather...")
        utils.recreate_folder(INSTALLER_GATHER_DIR, force_recreate=True)
        installer_gather(
            ip=first_master_ip,
            ssh_key=SSH_KEY,
            out_dir=INSTALLER_GATHER_DIR,
        )

        logging.info("Gathering information via must-gather...")
        utils.recreate_folder(MUST_GATHER_DIR)
        download_must_gather(KUBE_CONFIG, MUST_GATHER_DIR)
 def update_oc_config(nodes, cluster):
     """Export KUBECONFIG and register the cluster API VIP via ``config_etc_hosts``.

     With exactly one master, the address comes from
     ``cluster.get_ip_for_single_node`` on the primary machine CIDR;
     otherwise it is the ``api_vip`` reported by the nodes controller.
     """
     os.environ["KUBECONFIG"] = cluster.kubeconfig_path

     is_single_node = nodes.masters_count == 1
     if not is_single_node:
         api_vip = nodes.controller.get_ingress_and_api_vips()['api_vip']
     else:
         primary_cidr = cluster.get_primary_machine_cidr()
         api_vip = cluster.get_ip_for_single_node(
             cluster.api_client, cluster.id, primary_cidr)

     infra_utils.config_etc_hosts(cluster_name=cluster.name,
                                  base_dns_domain=global_variables.base_dns_domain,
                                  api_vip=api_vip)
def waiting_for_installation_completion(controller):
    """Wait until all operators are up, then collect logs.

    ``log_collection`` is called from the ``finally`` clause, so it runs
    whether or not the wait succeeded.
    """
    # First entry of the first item in controller.master_ips.
    first_master_ip = controller.master_ips[0][0]

    try:
        logging.info("Configuring /etc/hosts...")
        utils.config_etc_hosts(
            cluster_name=controller.cluster_name,
            base_dns_domain=controller.cluster_domain,
            api_vip=first_master_ip,
        )

        logging.info("Waiting for installation to complete...")
        one_hour_in_seconds = 60 * 60
        waiting.wait(
            all_operators_up,
            sleep_seconds=20,
            timeout_seconds=one_hour_in_seconds,
            waiting_for="all operators to get up",
        )
        logging.info("Installation completed successfully!")
    finally:
        log_collection(first_master_ip)
Example #6
0
def download_logs(client: InventoryClient, cluster: dict, dest: str,
                  must_gather: bool):
    """Download the log artifacts of ``cluster`` into a folder under ``dest``.

    Returns immediately when the output folder already exists. Each download
    step suppresses ``ApiException`` so a failing step does not stop the
    following ones. must-gather is attempted only when ``must_gather`` is
    true and the kubeconfig download succeeded.
    """
    output_folder = get_logs_output_folder(dest, cluster)

    if os.path.isdir(output_folder):
        log.info(
            f"Skipping. The logs directory {output_folder} already exists.")
        return

    api_error = assisted_service_client.rest.ApiException
    files_dir = os.path.join(output_folder, "cluster_files")
    metadata_path = os.path.join(output_folder, 'metdata.json')

    recreate_folder(output_folder)
    recreate_folder(files_dir)

    write_metadata_file(client, cluster, metadata_path)

    cluster_id = cluster['id']

    with suppress(api_error):
        client.download_ignition_files(cluster_id, files_dir)

    with suppress(api_error):
        events_path = os.path.join(output_folder,
                                   f"cluster_{cluster_id}_events.json")
        client.download_cluster_events(cluster_id, events_path)
        # events.html sits next to this module and is copied alongside the events.
        module_dir = os.path.dirname(os.path.realpath(__file__))
        shutil.copy2(os.path.join(module_dir, "events.html"), output_folder)

    with suppress(api_error):
        logs_tar_path = os.path.join(output_folder,
                                     f"cluster_{cluster_id}_logs.tar")
        client.download_cluster_logs(cluster_id, logs_tar_path)

    kubeconfig_path = os.path.join(output_folder, "kubeconfig-noingress")

    with suppress(api_error):
        client.download_kubeconfig_no_ingress(cluster_id, kubeconfig_path)

        if must_gather:
            must_gather_dir = os.path.join(output_folder, "must-gather")
            recreate_folder(must_gather_dir)
            config_etc_hosts(cluster['name'], cluster['base_dns_domain'],
                             cluster['api_vip'])
            download_must_gather(kubeconfig_path, must_gather_dir)

    run_command("chmod -R ugo+rx '%s'" % output_folder)
Example #7
0
def nodes_flow(client, cluster_name, cluster, image_path):
    """Create the cluster nodes and, when a client is given, drive them to install.

    Steps: prepare the terraform folder from the template, create the nodes
    and wait for registration, configure VIPs (or the machine CIDR) if they
    are not set yet, update the hosts, wait for them to become KNOWN, and
    optionally install the cluster and wait for CVO, depending on ``args``.
    """
    nodes_details = _create_node_details(cluster_name)
    if cluster:
        nodes_details["cluster_inventory_id"] = cluster.id

    tf_folder = utils.get_tf_folder(cluster_name, args.namespace)
    utils.recreate_folder(tf_folder)
    copy_tree(consts.TF_TEMPLATE, tf_folder)
    tf = terraform_utils.TerraformUtils(working_dir=tf_folder)
    machine_net = MachineNetwork(args.ipv4, args.ipv6, args.vm_network_cidr,
                                 args.vm_network_cidr6, args.ns_index)

    create_nodes_and_wait_till_registered(
        cluster_name=cluster_name,
        inventory_client=client,
        cluster=cluster,
        image_path=image_path,
        storage_path=args.storage_path,
        master_count=args.master_count,
        nodes_details=nodes_details,
        tf=tf,
        machine_net=machine_net,
    )

    # Everything below talks to the inventory service.
    if not client:
        return

    cluster_info = client.cluster_get(cluster.id)
    network_name = nodes_details["libvirt_network_name"]
    macs = utils.get_libvirt_nodes_macs(network_name)

    if cluster_info.api_vip and cluster_info.ingress_vip:
        log.info("VIPs already configured")
    else:
        pre_vip_statuses = [
            consts.NodesStatus.INSUFFICIENT,
            consts.NodesStatus.PENDING_FOR_INPUT,
        ]
        utils.wait_till_hosts_with_macs_are_in_status(
            client=client,
            cluster_id=cluster.id,
            macs=macs,
            statuses=pre_vip_statuses,
        )

        if args.vip_dhcp_allocation:
            set_cluster_machine_cidr(client, cluster.id, machine_net)
        else:
            set_cluster_vips(client, cluster.id, machine_net)

    # Hostnames are updated only on the non-IPv4 path.
    update_hostnames = not machine_net.has_ip_v4
    if update_hostnames:
        log.warning(
            "Work around libvirt for Terrafrom not setting hostnames of IPv6-only hosts"
        )
        libvirt_nodes = _get_libvirt_nodes_from_tf_state(
            network_name, tf.get_state())
    else:
        libvirt_nodes = utils.get_libvirt_nodes_mac_role_ip_and_name(
            network_name)

    update_hosts(client, cluster.id, libvirt_nodes, update_hostnames)
    utils.wait_till_hosts_with_macs_are_in_status(
        client=client,
        cluster_id=cluster.id,
        macs=macs,
        statuses=[consts.NodesStatus.KNOWN],
    )

    if not args.install_cluster:
        return

    time.sleep(10)
    install_cluster.run_install_flow(
        client=client,
        cluster_id=cluster.id,
        kubeconfig_path=consts.DEFAULT_CLUSTER_KUBECONFIG_PATH,
        pull_secret=args.pull_secret,
        tf=tf)
    # Validate DNS domains resolvability
    validate_dns(client, cluster.id)

    if not args.wait_for_cvo:
        return

    # Re-read the cluster before the /etc/hosts entry is written.
    cluster_info = client.cluster_get(cluster.id)
    log.info("Start waiting till CVO status is available")
    config_etc_hosts(cluster_info.name, cluster_info.base_dns_domain,
                     cluster_info.api_vip)
    utils.wait_for_cvo_available()
Example #8
0
def download_logs(client: InventoryClient,
                  cluster: dict,
                  dest: str,
                  must_gather: bool,
                  update_by_events: bool = False,
                  retry_interval: int = RETRY_INTERVAL,
                  pull_secret=""):
    """Download the available log artifacts of ``cluster`` into a folder under ``dest``.

    Nothing is downloaded when ``is_update_needed`` reports that the existing
    output folder is up to date. Each download step is best effort: the
    listed exceptions are suppressed (and logged by ``suppressAndLog``) so
    the remaining steps still run.

    Args:
        client: Inventory client used for every download call.
        cluster: Cluster description; ``hosts`` is fetched through ``client``
            when it is missing or empty.
        dest: Base directory handed to ``get_logs_output_folder``.
        must_gather: When true, also run must-gather using the downloaded
            no-ingress kubeconfig.
        update_by_events: Forwarded to ``is_update_needed``.
        retry_interval: Seconds to sleep between attempts to download and
            verify the cluster logs archive.
        pull_secret: Forwarded to ``helper_cluster.get_api_vip_from_cluster``.
    """
    # Covers a missing key, an empty list and None.
    if not cluster.get("hosts"):
        cluster["hosts"] = client.get_cluster_hosts(cluster_id=cluster["id"])

    output_folder = get_logs_output_folder(dest, cluster)
    if not is_update_needed(output_folder, update_by_events, client, cluster):
        log.info(f"Skipping, no need to update {output_folder}.")
        return

    cluster_files_folder = os.path.join(output_folder, "cluster_files")
    recreate_folder(output_folder)
    recreate_folder(cluster_files_folder)

    try:
        write_metadata_file(client, cluster,
                            os.path.join(output_folder, 'metadata.json'))

        with suppressAndLog(AssertionError, ConnectionError,
                            requests.exceptions.ConnectionError):
            client.download_metrics(os.path.join(output_folder, "metrics.txt"))

        for cluster_file in ("bootstrap.ign", "master.ign", "worker.ign",
                             "install-config.yaml"):
            # Suppress per file so one missing file does not stop the others.
            with suppressAndLog(assisted_service_client.rest.ApiException):
                client.download_and_save_file(
                    cluster['id'], cluster_file,
                    os.path.join(cluster_files_folder, cluster_file))

        for host in cluster['hosts']:
            with suppressAndLog(assisted_service_client.rest.ApiException):
                client.download_host_ignition(cluster['id'], host['id'],
                                              cluster_files_folder)

        with suppressAndLog(assisted_service_client.rest.ApiException):
            client.download_cluster_events(
                cluster['id'], get_cluster_events_path(cluster, output_folder))
            shutil.copy2(
                os.path.join(os.path.dirname(os.path.realpath(__file__)),
                             "events.html"), output_folder)

        with suppressAndLog(assisted_service_client.rest.ApiException):
            are_masters_in_configuring_state = are_host_progress_in_stage(
                cluster['hosts'], [HostsProgressStages.CONFIGURING], 2)
            are_masters_in_join_state = are_host_progress_in_stage(
                cluster['hosts'], [HostsProgressStages.JOINED], 2)
            max_retries = MUST_GATHER_MAX_RETRIES if are_masters_in_join_state else MAX_RETRIES
            is_installed = cluster['status'] == ClusterStatus.INSTALLED
            is_controller_expected = is_installed or are_masters_in_configuring_state
            min_number_of_logs = min_number_of_log_files(
                cluster, is_controller_expected)
            # The archive path is the same for every attempt.
            cluster_logs_tar = os.path.join(
                output_folder, f"cluster_{cluster['id']}_logs.tar")

            for i in range(max_retries):
                # Drop any archive left over from a previous attempt.
                with suppress(FileNotFoundError):
                    os.remove(cluster_logs_tar)

                client.download_cluster_logs(cluster['id'], cluster_logs_tar)
                try:
                    verify_logs_uploaded(
                        cluster_logs_tar,
                        min_number_of_logs,
                        installation_success=is_installed,
                        check_oc=are_masters_in_join_state)
                    break
                except AssertionError as ex:
                    log.warning(f"Cluster logs verification failed: {ex}")

                    # Skip sleeping on last retry. Compare against the retry
                    # budget the loop actually uses, not the global default.
                    if i < max_retries - 1:
                        log.info(f"Going to retry in {retry_interval} seconds")
                        time.sleep(retry_interval)

        kubeconfig_path = os.path.join(output_folder, "kubeconfig-noingress")

        with suppressAndLog(assisted_service_client.rest.ApiException):
            client.download_kubeconfig_no_ingress(cluster['id'],
                                                  kubeconfig_path)

            # must-gather runs only if the kubeconfig download above succeeded.
            if must_gather:
                must_gather_folder = os.path.join(output_folder, "must-gather")
                recreate_folder(must_gather_folder)
                config_etc_hosts(
                    cluster['name'], cluster['base_dns_domain'],
                    helper_cluster.get_api_vip_from_cluster(
                        client, cluster, pull_secret))
                download_must_gather(kubeconfig_path, must_gather_folder)

    finally:
        # Runs even when a step fails, so whatever was collected gets the mode change.
        run_command(f"chmod -R ugo+rx '{output_folder}'")
def nodes_flow(client, cluster_name, cluster):
    """Create the cluster nodes and, when a client is given, drive them to install.

    Steps: load the tfvars of the cluster's terraform folder, create the
    nodes and wait for registration, configure the single-node IP, machine
    CIDR or VIPs if VIPs are not set yet, assign host roles, wait for the
    hosts to become KNOWN, and optionally install the cluster and wait for
    CVO, depending on ``args``.
    """
    tf_folder = utils.get_tf_folder(cluster_name, args.namespace)
    nodes_details = utils.get_tfvars(tf_folder)
    if cluster:
        nodes_details["cluster_inventory_id"] = cluster.id
        utils.set_tfvars(tf_folder, nodes_details)

    tf = terraform_utils.TerraformUtils(working_dir=tf_folder)
    machine_net = MachineNetwork(args.ipv4, args.ipv6, args.vm_network_cidr,
                                 args.vm_network_cidr6, args.ns_index)

    create_nodes_and_wait_till_registered(
        inventory_client=client,
        cluster=cluster,
        nodes_details=nodes_details,
        tf=tf,
    )

    # Everything below talks to the inventory service.
    if not client:
        return

    cluster_info = client.cluster_get(cluster.id)
    macs = utils.get_libvirt_nodes_macs(nodes_details["libvirt_network_name"])

    if cluster_info.api_vip and cluster_info.ingress_vip:
        log.info("VIPs already configured")
    else:
        pre_vip_statuses = [
            consts.NodesStatus.INSUFFICIENT,
            consts.NodesStatus.PENDING_FOR_INPUT,
            consts.NodesStatus.KNOWN
        ]
        utils.wait_till_hosts_with_macs_are_in_status(
            client=client,
            cluster_id=cluster.id,
            macs=macs,
            statuses=pre_vip_statuses,
        )

        if args.master_count == 1:
            # IPv4 is used when it is enabled or when IPv6 is not.
            is_ip4 = machine_net.has_ip_v4 or not machine_net.has_ip_v6
            if is_ip4:
                cidr = args.vm_network_cidr
            else:
                cidr = args.vm_network_cidr6
            tf.change_variables({
                "single_node_ip":
                helper_cluster.Cluster.get_ip_for_single_node(
                    client, cluster.id, cidr, ipv4_first=is_ip4)
            })
        elif args.vip_dhcp_allocation:
            set_cluster_machine_cidr(client, cluster.id, machine_net)
        else:
            set_cluster_vips(client, cluster.id, machine_net)

    set_hosts_roles(client, cluster, nodes_details, machine_net, tf,
                    args.master_count, args.with_static_ips)

    utils.wait_till_hosts_with_macs_are_in_status(
        client=client,
        cluster_id=cluster.id,
        macs=macs,
        statuses=[consts.NodesStatus.KNOWN],
    )

    if not args.install_cluster:
        return

    time.sleep(10)
    install_cluster.run_install_flow(
        client=client,
        cluster_id=cluster.id,
        kubeconfig_path=consts.DEFAULT_CLUSTER_KUBECONFIG_PATH,
        pull_secret=args.pull_secret,
        tf=tf)
    # Validate DNS domains resolvability
    validate_dns(client, cluster.id)

    if not args.wait_for_cvo:
        return

    # Re-read the cluster before the /etc/hosts entry is written.
    cluster_info = client.cluster_get(cluster.id)
    log.info("Start waiting till CVO status is available")
    api_vip = helper_cluster.get_api_vip_from_cluster(client, cluster_info)
    config_etc_hosts(cluster_info.name, cluster_info.base_dns_domain,
                     api_vip)
    utils.wait_for_cvo_available()