Example #1
    def installer_gather(self, ip: str, ssh_key: Path, out_dir: str):
        stdout, stderr, _ret = utils.run_command(
            f"{INSTALLER_BINARY} gather bootstrap --log-level debug --bootstrap {ip} --master {ip} --key {str(ssh_key)}"
        )

        with open(INSTALLER_GATHER_DEBUG_STDOUT, "w") as f:
            f.write(stdout)

        with open(INSTALLER_GATHER_DEBUG_STDERR, "w") as f:
            f.write(stderr)

        matches = re.compile(r'.*logs captured here "(.*)".*').findall(stderr)

        if len(matches) == 0:
            log.warning(
                f"It seems like installer-gather didn't generate any bundles, stderr: {stderr}"
            )
            return

        bundle_file_path, *_ = matches

        log.info(f"Found installer-gather bundle at path {bundle_file_path}")

        utils.run_command_with_output(
            f"tar -xzf {bundle_file_path} -C {out_dir}")
        if os.path.exists(bundle_file_path):
            os.remove(bundle_file_path)
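The constants referenced above (INSTALLER_BINARY, INSTALLER_GATHER_DEBUG_STDOUT/STDERR) come from the surrounding module and are not part of the excerpt. Illustrative, hypothetical values for reading the snippet in isolation:

    INSTALLER_BINARY = "openshift-install"
    INSTALLER_GATHER_DEBUG_STDOUT = "installer-gather-debug.stdout.log"
    INSTALLER_GATHER_DEBUG_STDERR = "installer-gather-debug.stderr.log"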
Example #2
    def approve_csrs(kubeconfig_path: str, done: threading.Event):
        log.info(
            "Started background worker to approve CSRs when they appear...")
        while not done.is_set():
            unapproved_csrs = []
            try:
                unapproved_csrs = get_unapproved_csr_names(kubeconfig_path)
            except subprocess.SubprocessError:
                log.debug(
                    "Failed to list csrs. This is usually due to API downtime. Retrying"
                )
            except Exception:
                # We're in a thread so it's a bit awkward to stop everything else...
                # Just continue after logging the unexpected exception
                log.exception("Unknown exception while listing csrs")

            for csr_name in unapproved_csrs:
                log.info(f"Found unapproved CSR {csr_name}, approving...")

                try:
                    approve_csr(kubeconfig_path, csr_name)
                except subprocess.SubprocessError:
                    log.warning(
                        "Failed attempt to approve CSR, this may be due to API downtime. Will retry later"
                    )
                except Exception:
                    # We're in a thread so it's a bit awkward to stop everything else...
                    # Just continue after logging the unexpected exception
                    log.exception(
                        f"Unknown exception while approving the {csr_name} CSR"
                    )

            time.sleep(10)
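The helpers get_unapproved_csr_names and approve_csr are defined elsewhere in the module. A minimal sketch of what the listing side could look like, assuming `oc` is on PATH; the body is illustrative, not the repo's actual implementation:

    import json
    import subprocess

    def get_unapproved_csr_names(kubeconfig_path: str) -> list:
        # Pending CSRs have no "Approved" condition in their status
        out = subprocess.check_output(
            ["oc", "--kubeconfig", kubeconfig_path, "get", "csr", "-o", "json"]
        )
        return [
            csr["metadata"]["name"]
            for csr in json.loads(out)["items"]
            if not any(
                condition["type"] == "Approved"
                for condition in csr.get("status", {}).get("conditions", [])
            )
        ]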
Example #3
def fetch_url_and_write_to_file(url_key, file_name, debug_info, output_folder):
    if url_key in debug_info:
        logs_url = debug_info[url_key]
        content = fetch_url(logs_url)
        output_file = os.path.join(output_folder, file_name)
        with open(output_file, "wb") as _file:
            _file.write(content)
    else:
        log.warning(f"{url_key} is not available")
Example #4
def download_must_gather(kubeconfig: str, dest_dir: str):
    log.info(f"Downloading must-gather to {dest_dir}")
    command = (
        f"oc --insecure-skip-tls-verify --kubeconfig={kubeconfig} adm must-gather"
        f" --dest-dir {dest_dir} > {dest_dir}/must-gather.log")
    try:
        run_command(command, shell=True, raise_errors=True)
    except RuntimeError as ex:
        log.warning(f"Failed to run must gather: {ex}")
Example #5
def try_get_cluster():
    if args.cluster_id:
        try:
            client = ClientFactory.create_client(url=utils.get_assisted_service_url_by_args(args=args),
                                                 offline_token=utils.get_env("OFFLINE_TOKEN"))

            return client, client.cluster_get(cluster_id=args.cluster_id)

        except ApiException as e:
            log.warning(f"Can't retrieve cluster_id={args.cluster_id}, {e}")

    return None, None
Example #6
    def handle_trigger(self, conditions_string: List[str],
                       values: Dict[str, Any]) -> None:
        for k, v in values.items():
            if not hasattr(self, k):
                continue

            if not self.is_user_set(k):
                log.debug(
                    f"{self.__class__.__name__} - Trigger set `{k}` to `{v}`, Condition: {conditions_string}"
                )
                self._set(k, v)
            else:
                log.warning(
                    f"Skipping setting {k} to value {v} due that it already been set by the user"
                )
Example #7
    def worker_ready() -> bool:
        try:
            node_readiness_map = get_nodes_readiness(KUBE_CONFIG)
        except subprocess.SubprocessError:
            log.debug("Failed to list nodes. This is usually due to API downtime. Retrying")
            return False

        if f"{CLUSTER_PREFIX}-master-0" not in node_readiness_map:
            log.warning("Couldn't find master in node status list, this should not happen")
            return False

        if f"{CLUSTER_PREFIX}-worker-0" not in node_readiness_map:
            return False

        return all(node_status for node_status in node_readiness_map.values())
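worker_ready reads like a predicate for a polling helper. A usage sketch with the `waiting` library that other examples here already use; the timeout values are illustrative:

    import waiting

    waiting.wait(
        worker_ready,
        timeout_seconds=30 * 60,
        sleep_seconds=30,
        waiting_for="worker node to become Ready",
    )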
Example #8
def collect_debug_info_from_cluster(cluster_deployment, agent_cluster_install):
    cluster_name = cluster_deployment.ref.name
    output_folder = f"build/{cluster_name}"
    recreate_folder(output_folder)
    aci = agent_cluster_install.get()
    debug_info = aci["status"]["debugInfo"]

    try:
        log.info("Collecting debugInfo (events/logs) from cluster")
        fetch_url_and_write_to_file("eventsURL", "events.json", debug_info,
                                    output_folder)
        fetch_url_and_write_to_file("logsURL", "logs.tar", debug_info,
                                    output_folder)
    except Exception as err:
        log.warning(
            f"Failed to collect debug info for cluster {cluster_name} ({err})")
Example #9
    def start_node(self, node_name, check_ips=True):
        log.info("Going to power-on %s, check ips flag %s", node_name,
                 check_ips)
        node = self.libvirt_connection.lookupByName(node_name)

        if not node.isActive():
            try:
                node.create()
                if check_ips:
                    self._wait_till_domain_has_ips(node)
            except waiting.exceptions.TimeoutExpired:
                log.warning("Node %s failed to recive IP, retrying", node_name)
                self.shutdown_node(node_name)
                node.create()
                if check_ips:
                    self._wait_till_domain_has_ips(node)
Example #10
    def _connect_to_load_balancer(self, load_balancer_ip: str) -> bool:
        family = socket.AF_INET6 if ":" in load_balancer_ip else socket.AF_INET
        try:
            with socket.socket(family, socket.SOCK_STREAM) as s:
                s.connect((load_balancer_ip, consts.DEFAULT_LOAD_BALANCER_PORT))
                log.info(
                    f"Successfully connected to load balancer "
                    f"{load_balancer_ip}:{consts.DEFAULT_LOAD_BALANCER_PORT}")
                return True
        except Exception as e:
            log.warning(
                "Could not connect to load balancer endpoint %s: %s",
                self._render_socket_endpoint(
                    load_balancer_ip, consts.DEFAULT_LOAD_BALANCER_PORT),
                e,
            )
            return False
Example #11
def set_hosts_roles(client, cluster, nodes_details, machine_net, tf, master_count, static_network_mode):
    networks_names = (
        nodes_details["libvirt_network_name"],
        nodes_details["libvirt_secondary_network_name"]
    )

    # don't set roles in bootstrap-in-place (bip) mode
    if not machine_net.has_ip_v6:
        libvirt_nodes = get_libvirt_nodes_mac_role_ip_and_name(networks_names[0])
        libvirt_nodes.update(get_libvirt_nodes_mac_role_ip_and_name(networks_names[1]))
        if static_network_mode:
            log.info("Setting hostnames when running in static network config mode")
            update_hostnames = True
        else:
            update_hostnames = False
    else:
        log.warning("Work around libvirt for Terrafrom not setting hostnames of IPv6 hosts")
        libvirt_nodes = utils.get_libvirt_nodes_from_tf_state(networks_names, tf.get_state())
        update_hostnames = True

    utils.update_hosts(client, cluster.id, libvirt_nodes, update_hostnames=update_hostnames,
                       update_roles=master_count > 1)
Example #12
    def ssh_connection(self):
        if not self.ips:
            raise RuntimeError(f"No available IPs for node {self.name}")

        log.info("Trying to access through IP addresses: %s",
                 ", ".join(self.ips))
        exception = None
        for ip in self.ips:
            try:
                connection = ssh.SshConnection(
                    ip,
                    private_ssh_key_path=self.private_ssh_key_path,
                    username=self.username)
                connection.connect()
                return connection

            except (TimeoutError, SCPException) as e:
                log.warning("Could not SSH through IP %s: %s", ip, str(e))
                exception = e

        if exception is not None:
            raise exception
Example #13
    def worker_installation(self, controller: TerraformController, cluster_configuration: ClusterConfig):
        controller.start_node(node_name=f"{CLUSTER_PREFIX}-worker-0")

        # Start a background worker to approve CSRs
        approve_csr_worker_done = threading.Event()
        approve_csr_worker = threading.Thread(
            target=self.approve_csrs,
            args=(KUBE_CONFIG, approve_csr_worker_done),
            # Don't hang if this thread is still running for some reason
            daemon=True,
        )

        approve_csr_worker.start()

        try:
            self.waiting_for_added_worker(controller)
        finally:
            approve_csr_worker_done.set()

        approve_csr_worker.join(timeout=10)
        if approve_csr_worker.is_alive():
            log.warning("CSR thread is still running for some reason")
Example #14
    def __init__(self, inventory_url: str, type: str, offline_token: str):

        self.client = ClientFactory.create_client(url=inventory_url,
                                                  offline_token=offline_token)

        with open("src/manage/manageable_options.yaml", "r") as f:
            options = yaml.load(f, Loader=yaml.FullLoader)

        manage_config = options.get(type, None)

        if not manage_config:
            raise ValueError(
                f"{type} is not a valid manageable_options option")

        days_back = manage_config["days_back"]
        measure_field = manage_config["measure_field"]

        clusters = self.get_clusters()
        clusters_to_process = []

        for cluster in clusters:
            if is_older_then(cluster[measure_field], days_back):
                clusters_to_process.append(cluster["id"])

        len_of_clusters_to_process = len(clusters_to_process)

        log.info(f"Running {type} of {len_of_clusters_to_process} clusters")

        if not query_yes_no():
            return

        method = getattr(self.client, manage_config["method"])

        for cluster_id in clusters_to_process:
            try:
                method(cluster_id=cluster_id)
            except ApiException as e:
                log.warning(f"Can't process cluster_id={cluster_id}, {e}")
Example #15
def download_must_gather(kubeconfig: str, dest_dir: str):
    must_gather_dir = f"{dest_dir}/must-gather-dir"
    os.mkdir(must_gather_dir)

    log.info(
        f"Downloading must-gather to {must_gather_dir}, kubeconfig {kubeconfig}"
    )
    command = (
        f"oc --insecure-skip-tls-verify --kubeconfig={kubeconfig} adm must-gather"
        f" --dest-dir {must_gather_dir} > {must_gather_dir}/must-gather.log")
    try:
        run_command(command, shell=True, raise_errors=True)

    except RuntimeError as ex:
        log.warning(f"Failed to run must gather: {ex}")

    log.debug("Archiving %s...", must_gather_dir)
    with tarfile.open(f"{dest_dir}/must-gather.tar", "w:gz") as tar:
        tar.add(must_gather_dir, arcname=os.path.sep)

    log.debug("Removing must-gather directory %s after we archived it",
              must_gather_dir)
    shutil.rmtree(must_gather_dir)
Example #16
    def attach_disk_flags(persistent):
        modified_nodes = set()

        def attach(node, disk_size, bootable=False, with_wwn=False):
            nonlocal modified_nodes
            node.attach_test_disk(disk_size,
                                  bootable=bootable,
                                  persistent=persistent,
                                  with_wwn=with_wwn)
            modified_nodes.add(node)

        yield attach
        if global_variables.test_teardown:
            for modified_node in modified_nodes:
                try:
                    modified_node.detach_all_test_disks()
                    log.info(
                        f"Successfully detach test disks from node {modified_node.name}"
                    )
                except (libvirt.libvirtError, FileNotFoundError):
                    log.warning(
                        f"Failed to detach test disks from node {modified_node.name}"
                    )
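attach_disk_flags reads like the body of a pytest yield-fixture factory. A hypothetical registration sketch, assuming pytest is the test runner used by these examples:

    import pytest

    @pytest.fixture
    def attach_disk():
        yield from attach_disk_flags(persistent=False)

    @pytest.fixture
    def attach_disk_persistent():
        yield from attach_disk_flags(persistent=True)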
Example #17
    def _collect_virsh_logs(cls, nodes: Nodes, log_dir_name):
        log.info("Collecting virsh logs\n")
        os.makedirs(log_dir_name, exist_ok=True)
        virsh_log_path = os.path.join(log_dir_name, "libvirt_logs")
        os.makedirs(virsh_log_path, exist_ok=False)

        libvirt_list_path = os.path.join(virsh_log_path, "virsh_list")
        utils.run_command(f"virsh list --all >> {libvirt_list_path}",
                          shell=True)

        libvirt_net_list_path = os.path.join(virsh_log_path, "virsh_net_list")
        utils.run_command(f"virsh net-list --all >> {libvirt_net_list_path}",
                          shell=True)

        network_name = nodes.get_cluster_network()
        virsh_leases_path = os.path.join(virsh_log_path, "net_dhcp_leases")
        utils.run_command(
            f"virsh net-dhcp-leases {network_name} >> {virsh_leases_path}",
            shell=True)

        messages_log_path = os.path.join(virsh_log_path, "messages.log")
        try:
            shutil.copy("/var/log/messages", messages_log_path)
        except FileNotFoundError:
            log.warning(
                "Failed to copy /var/log/messages, file does not exist")

        qemu_libvirt_path = os.path.join(virsh_log_path, "qemu_libvirt_logs")
        os.makedirs(qemu_libvirt_path, exist_ok=False)
        for node in nodes:
            try:
                shutil.copy(f"/var/log/libvirt/qemu/{node.name}.log",
                            f"{qemu_libvirt_path}/{node.name}-qemu.log")
            except FileNotFoundError:
                log.warning(
                    f"Failed to copy {node.name} qemu log, file does not exist"
                )

        console_log_path = os.path.join(virsh_log_path, "console_logs")
        os.makedirs(console_log_path, exist_ok=False)
        for node in nodes:
            try:
                shutil.copy(f"/var/log/libvirt/qemu/{node.name}-console.log",
                            f"{console_log_path}/{node.name}-console.log")
            except FileNotFoundError:
                log.warning(
                    f"Failed to copy {node.name} console log, file does not exist"
                )

        libvirtd_log_path = os.path.join(virsh_log_path, "libvirtd_journal")
        utils.run_command(
            f'journalctl --since "{nodes.setup_time}" '
            f"-u libvirtd -D /run/log/journal >> {libvirtd_log_path}",
            shell=True,
        )
Example #18
def folder_exists(file_path):
    folder = Path(file_path).parent
    if not folder.exists():
        log.warning("Directory %s doesn't exist. Please create it", folder)
        return False
    return True
Example #19
    parser.add_argument(
        "--hyperthreading",
        help="nodes cpu hyperthreading mode",
        type=str,
        nargs='?',
        const='all',
        default=None,
    )
    parser.add_argument(
        "--kube-api",
        help='Should kube-api interface be used for cluster deployment',
        type=distutils.util.strtobool,
        nargs='?',
        const=True,
        default=False,
    )

    oc_utils.extend_parser_with_oc_arguments(parser)
    args = parser.parse_args()
    if not args.pull_secret:
        raise ValueError("Can't install cluster without pull secret, please provide one")

    if args.master_count == 1:
        log.warning("Master count is 1, setting workers to 0")
        args.number_of_workers = 0

    main()
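Note: distutils (used above for strtobool) was removed from the standard library in Python 3.12. A drop-in replacement sketch if this script needs to run on newer interpreters:

    def strtobool(value: str) -> int:
        # Mirrors distutils.util.strtobool: 1 for truthy, 0 for falsy strings
        value = value.lower()
        if value in ("y", "yes", "t", "true", "on", "1"):
            return 1
        if value in ("n", "no", "f", "false", "off", "0"):
            return 0
        raise ValueError(f"invalid truth value {value!r}")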
Example #20
def download_logs(
    client: InventoryClient,
    cluster: dict,
    dest: str,
    must_gather: bool,
    update_by_events: bool = False,
    retry_interval: int = RETRY_INTERVAL,
):
    if "hosts" not in cluster or len(cluster["hosts"]) == 0:
        cluster["hosts"] = client.get_cluster_hosts(cluster_id=cluster["id"])

    output_folder = get_logs_output_folder(dest, cluster)
    if not is_update_needed(output_folder, update_by_events, client, cluster):
        log.info(f"Skipping, no need to update {output_folder}.")
        return

    recreate_folder(output_folder)
    recreate_folder(os.path.join(output_folder, "cluster_files"))

    try:
        write_metadata_file(client, cluster,
                            os.path.join(output_folder, "metadata.json"))

        with SuppressAndLog(requests.exceptions.RequestException,
                            ConnectionError, KeyboardInterrupt):
            client.download_metrics(os.path.join(output_folder, "metrics.txt"))

        for cluster_file in (
                "bootstrap.ign",
                "master.ign",
                "worker.ign",
                "install-config.yaml",
        ):
            with SuppressAndLog(assisted_service_client.rest.ApiException,
                                KeyboardInterrupt):
                client.download_and_save_file(
                    cluster["id"], cluster_file,
                    os.path.join(output_folder, "cluster_files", cluster_file))

        with SuppressAndLog(assisted_service_client.rest.ApiException,
                            KeyboardInterrupt):
            download_manifests(client, cluster["id"], output_folder)

        infra_env_list = set()
        for host in cluster["hosts"]:
            host_id, infra_env_id = host["id"], host["infra_env_id"]
            with SuppressAndLog(assisted_service_client.rest.ApiException,
                                KeyboardInterrupt):
                client.download_host_ignition(
                    infra_env_id, host_id,
                    os.path.join(output_folder, "cluster_files"))
            if infra_env_id not in infra_env_list:
                infra_env_list.add(infra_env_id)
                with SuppressAndLog(assisted_service_client.rest.ApiException,
                                    KeyboardInterrupt):
                    client.download_infraenv_events(
                        infra_env_id,
                        get_infraenv_events_path(infra_env_id, output_folder))

        with SuppressAndLog(assisted_service_client.rest.ApiException,
                            KeyboardInterrupt):
            client.download_cluster_events(
                cluster["id"], get_cluster_events_path(cluster, output_folder))
            shutil.copy2(
                os.path.join(os.path.dirname(os.path.realpath(__file__)),
                             "events.html"), output_folder)

        with SuppressAndLog(assisted_service_client.rest.ApiException,
                            KeyboardInterrupt):
            are_masters_in_configuring_state = are_host_progress_in_stage(
                cluster["hosts"], [HostsProgressStages.CONFIGURING], 2)
            are_masters_in_join_or_done_state = are_host_progress_in_stage(
                cluster["hosts"],
                [HostsProgressStages.JOINED, HostsProgressStages.DONE], 2)
            max_retries = MUST_GATHER_MAX_RETRIES if are_masters_in_join_or_done_state else MAX_RETRIES
            is_controller_expected = cluster[
                "status"] == ClusterStatus.INSTALLED or are_masters_in_configuring_state
            min_number_of_logs = min_number_of_log_files(
                cluster, is_controller_expected)

            for i in range(max_retries):
                cluster_logs_tar = os.path.join(
                    output_folder, f"cluster_{cluster['id']}_logs.tar")

                with suppress(FileNotFoundError):
                    os.remove(cluster_logs_tar)

                client.download_cluster_logs(cluster["id"], cluster_logs_tar)
                try:
                    verify_logs_uploaded(
                        cluster_logs_tar,
                        min_number_of_logs,
                        installation_success=(
                            cluster["status"] == ClusterStatus.INSTALLED),
                        check_oc=are_masters_in_join_or_done_state,
                    )
                    break
                except AssertionError as ex:
                    log.warning("Cluster logs verification failed: %s", ex)

                    # Skip sleeping on last retry
                    if i < MAX_RETRIES - 1:
                        log.info(f"Going to retry in {retry_interval} seconds")
                        time.sleep(retry_interval)

        kubeconfig_path = os.path.join(output_folder, "kubeconfig-noingress")

        with SuppressAndLog(assisted_service_client.rest.ApiException):
            client.download_kubeconfig_no_ingress(cluster["id"],
                                                  kubeconfig_path)

            if must_gather:
                config_etc_hosts(
                    cluster["name"],
                    cluster["base_dns_domain"],
                    client.get_api_vip(cluster, cluster["id"]),
                )
                download_must_gather(kubeconfig_path, output_folder)

    finally:
        run_command(f"chmod -R ugo+rx '{output_folder}'")