def test_dynamic_provisioning_glusterfile_glusterpod_failure(self):
    """Check that app pod IO survives the deletion of a gluster pod.

    A PVC is provisioned and mounted into an app pod, background IO is
    started against the mount, and then one of the gluster pods reported
    for that PVC is deleted. The test waits for a replacement gluster pod
    on the same host to become ready and asserts that the background IO
    finished with exit code 0.

    Skipped when the cluster is not containerized Gluster.
    """
    # Gluster pods exist only in the containerized deployment
    if not self.is_containerized_gluster():
        self.skipTest("Only containerized Gluster clusters are supported.")

    mount_path = "/mnt"
    datafile_path = '%s/fake_file_for_%s' % (mount_path, self.id())

    # Storage class first, then the PVC provisioned from it
    self.create_storage_class()
    pvc_name = self.create_and_wait_for_pvc()

    # App pod with the PVC mounted; cleanups run in reverse order, so the
    # pod is deleted first and only then awaited to be gone
    pod_name = oc_create_tiny_pod_with_volume(
        self.node, pvc_name, "test-pvc-mount-on-app-pod",
        mount_path=mount_path)
    self.addCleanup(wait_for_resource_absence, self.node, 'pod', pod_name)
    self.addCleanup(oc_delete, self.node, 'pod', pod_name)
    wait_for_pod_be_ready(self.node, pod_name, timeout=60, wait_step=2)

    # Start IO in the background so that it overlaps with the pod deletion
    io_cmd = "oc rsh %s dd if=/dev/urandom of=%s bs=1000K count=900" % (
        pod_name, datafile_path)
    async_io = g.run_async(self.node, io_cmd, "root")

    # Take the first gluster pod reported for the PVC (4+ nodes case)
    gluster_pod_data = get_gluster_pod_names_by_pvc_name(
        self.node, pvc_name)[0]

    # Delete that gluster pod, then poll for a non-terminating glusterfs
    # pod on the same host
    oc_delete(self.node, 'pod', gluster_pod_data["pod_name"])
    cmd = ("oc get pods -o wide | grep glusterfs | grep %s | "
           "grep -v Terminating | awk '{print $1}'"
           ) % gluster_pod_data["host_name"]
    for waiter in Waiter(600, 15):
        # Only the first line of the output is treated as the pod name
        first_line = self.cmd_run(cmd).strip().split("\n")[0]
        new_gluster_pod_name = first_line.strip()
        if new_gluster_pod_name:
            break
    if waiter.expired:
        error_msg = "exceeded timeout, new gluster pod not created"
        g.log.error(error_msg)
        raise ExecutionError(error_msg)
    g.log.info("new gluster pod name is %s" % new_gluster_pod_name)
    wait_for_pod_be_ready(self.node, new_gluster_pod_name)

    # The background IO must have completed successfully
    ret, _, _ = async_io.async_communicate()
    self.assertEqual(ret, 0, "IO %s failed on %s" % (io_cmd, self.node))
def wait_for_claim(ocp_node, pvc_name, timeout=60, interval=2):
    """Poll a PVC until its status phase is 'Bound'.

    Args:
        ocp_node: node passed through to ``oc_get_pvc``.
        pvc_name: name of the claim to look up.
        timeout: value handed to ``Waiter`` as its timeout.
        interval: value handed to ``Waiter`` as its interval.
    Returns:
        The object returned by ``oc_get_pvc`` once it reports the
        'Bound' phase.
    Raises:
        AssertionError: if the waiter is exhausted before the claim is
            bound.
    """
    for _ in Waiter(timeout, interval):
        claim = oc_get_pvc(ocp_node, pvc_name)
        if not claim:
            # Nothing returned for the claim yet; keep polling
            continue
        if claim.get('status', {}).get('phase') == 'Bound':
            return claim
    raise AssertionError('wait_for_claim on pvc %s timed out' % (pvc_name, ))
def wait_for_sc_unused(ocp_node, sc_name, timeout=60, interval=1):
    """Poll until no PV references the given storage class.

    Args:
        ocp_node: node passed through to ``oc_get_all_pvs``.
        sc_name: storage class name compared against each PV's
            ``spec.storageClassName``.
        timeout: value handed to ``Waiter`` as its timeout.
        interval: value handed to ``Waiter`` as its interval.
    Raises:
        AssertionError: if some PV still references the storage class
            when the waiter is exhausted.
    """
    for _ in Waiter(timeout, interval):
        pvs = oc_get_all_pvs(ocp_node)
        # A falsy result or a missing/empty 'items' key means "no PVs"
        pv_items = (pvs and pvs.get('items')) or []
        still_used = False
        for pv in pv_items:
            if pv.get('spec', {}).get('storageClassName') == sc_name:
                still_used = True
                break
        if not still_used:
            return
    raise AssertionError('wait_for_sc_unused on %s timed out' % (sc_name, ))
 def wait_to_settle(self, timeout=120, interval=1):
     """Wait until the volume count returns to ``self.volcount``.

     Args:
         timeout: value handed to ``Waiter`` as its timeout.
         interval: value handed to ``Waiter`` as its interval.
     Raises:
         AssertionError: if ``self._count_vols()`` never equals
             ``self.volcount`` before the waiter is exhausted.
     """
     # This was originally going to be a tearDown, but oddly enough
     # tearDown is called *before* the cleanup functions, so it
     # could never succeed. This needs to be added as a cleanup
     # function first so that we run after our test's other cleanup
     # functions but before we go on to the next test in order
     # to prevent the async cleanups in kubernetes from stepping
     # on the next test's "toes".
     #
     # The interval is forwarded to the Waiter; it used to be accepted
     # by this method but silently ignored.
     for _ in Waiter(timeout, interval):
         if self._count_vols() == self.volcount:
             return
     raise AssertionError('wait for volume count to settle timed out')
    def _node_reboot(self):
        """Reboot the first configured gluster server and wait for recovery.

        The host is rebooted through ``g.run``; the method then waits for
        an rpyc connection to the host to succeed again, waits for the
        gluster pods to be ready and finally checks the gluster services
        on every pod in ``self.gluster_pod_list``.

        Raises:
            AssertionError: if the reboot command does not return 255.
            ExecutionError: if the host is not reachable within the
                waiter's limit.
        """
        first_server = self.gluster_servers[0]
        storage_hostname = g.config["gluster_servers"][first_server]["storage"]

        reboot_cmd = (
            "sleep 3; /sbin/shutdown -r now 'Reboot triggered by Glusto'")
        ret, _, reboot_err = g.run(storage_hostname, reboot_cmd)

        # Registered right after the reboot attempt so that the pods are
        # waited for even when one of the checks below fails
        self.addCleanup(self._wait_for_gluster_pod_to_be_ready)

        # NOTE(review): 255 is presumably the status reported when the
        # connection drops because of the reboot - confirm
        if ret != 255:
            err_msg = "failed to reboot host %s error: %s" % (
                storage_hostname, reboot_err)
            g.log.error(err_msg)
            raise AssertionError(err_msg)

        try:
            g.ssh_close_connection(storage_hostname)
        except Exception as close_err:
            g.log.error("failed to close connection with host %s"
                        " with error: %s" % (storage_hostname, close_err))
            raise

        # added sleep as node will restart after 3 sec
        time.sleep(3)

        # Poll until an rpyc connection to the host can be opened again
        for waiter in Waiter(timeout=600, interval=10):
            try:
                if not g.rpyc_get_connection(storage_hostname, user="******"):
                    continue
                g.rpyc_close_connection(storage_hostname, user="******")
                break
            except Exception as conn_err:
                g.log.info(
                    "exception while getting connection: '%s'" % conn_err)

        if waiter.expired:
            error_msg = ("exceeded timeout 600 sec, node '%s' is "
                         "not reachable" % storage_hostname)
            g.log.error(error_msg)
            raise ExecutionError(error_msg)

        # wait for the gluster pod to be in 'Running' state
        self._wait_for_gluster_pod_to_be_ready()

        # Every listed service must be running on every gluster pod
        for pod in self.gluster_pod_list:
            for svc in ("glusterd", "gluster-blockd", "tcmu-runner"):
                g.log.info("gluster_pod - '%s' : gluster_service '%s'" % (
                    pod, svc))
                check_service_status_on_pod(self.oc_node, pod, svc, "running")
 def background_ops():
     """Create and delete PVCs in a loop until ``done`` is set.

     Each pass creates a uniquely named claim, refreshes its PVC and PV
     info and deletes it again, with short random sleeps in between.
     The loop also ends when the one-hour waiter is exhausted.

     NOTE(review): ``short_tc_name``, ``tname``, ``ocp_node`` and ``done``
     are not defined here - presumably they come from the enclosing
     scope; confirm.
     """
     name_prefix = make_unique_label(short_tc_name)
     for idx, _ in enumerate(Waiter(60 * 60)):
         # Random 0.1-1.0 s pause before creating the claim
         time.sleep(random.randint(1, 10) * 0.1)
         claim = ClaimInfo(
             name='{}-{}'.format(name_prefix, idx),
             storageclass=tname,
             size=2)
         claim.create_pvc(ocp_node)
         time.sleep(1)
         claim.update_pvc_info(ocp_node, timeout=300)
         claim.update_pv_info(ocp_node)
         # Random 0.1-1.0 s pause before deleting the claim
         time.sleep(random.randint(1, 10) * 0.1)
         claim.delete_pvc(ocp_node)
         if done.is_set():
             return
 def verify_all_paths_are_up_in_multipath(
         self, mpath_name, hacount, node, timeout=30, interval=5):
     """Assert that a multipath device has ``hacount`` healthy paths.

     Runs ``multipath -ll <mpath_name>`` on ``node`` until the number of
     output lines containing 'active ready running' equals ``hacount`` or
     the waiter is exhausted. The last output is then asserted to have
     exactly ``hacount`` such lines and to contain none of the states
     'failed', 'faulty' or 'undef'.

     Args:
         mpath_name: multipath device name passed to ``multipath -ll``.
         hacount: expected number of 'active ready running' lines.
         node: node the command is run on.
         timeout: value handed to ``Waiter`` as its timeout.
         interval: value handed to ``Waiter`` as its interval.
     """
     for _ in Waiter(timeout, interval):
         out = command.cmd_run('multipath -ll %s' % mpath_name, node)
         count = sum(
             1 for line in out.split('\n') if 'active ready running' in line)
         if hacount == count:
             break

     # The assertions are made against the last output that was collected
     count_msg = (
         "Paths are not up equal to hacount %s in mpath %s on Node %s" % (
             hacount, out, node))
     self.assertEqual(hacount, count, count_msg)

     state_msg = "All paths are not up in mpath %s on Node %s" % (out, node)
     for bad_state in ('failed', 'faulty', 'undef'):
         self.assertNotIn(bad_state, out, state_msg)
    def _wait_for_gluster_pod_to_be_ready(self):
        """Wait for every pod in ``self.gluster_pod_list`` to become ready.

        Each pod is probed repeatedly with a short ``wait_for_pod_be_ready``
        call; ``ExecutionError`` raised by a probe is logged and the probe
        is retried.

        Raises:
            ExecutionError: if a pod is still not ready when its waiter
                is exhausted.
        """
        for pod in self.gluster_pod_list:
            for waiter in Waiter(timeout=600, interval=10):
                try:
                    if wait_for_pod_be_ready(
                            self.oc_node, pod, timeout=1, wait_step=1):
                        break
                except ExecutionError as probe_err:
                    g.log.info("exception %s while validating gluster "
                               "pod %s" % (probe_err, pod))

            if waiter.expired:
                error_msg = ("exceeded timeout 600 sec, pod '%s' is "
                             "not in 'running' state" % pod)
                g.log.error(error_msg)
                raise ExecutionError(error_msg)
# Example #9
# 0
    def create_heketi_volume_with_name_and_wait(self,
                                                name,
                                                size,
                                                raise_on_cleanup_error=True,
                                                timeout=600,
                                                wait_step=10,
                                                **kwargs):
        """Create a named heketi volume, waiting for it if creation errors.

        When ``heketi_volume_create`` raises, the volume list is polled
        for a volume with the requested name, unless the error text shows
        an allocation failure, in which case the error is re-raised at
        once. The volume is registered for deletion at cleanup time.

        Args:
            name: name of the volume to create.
            size: size passed to ``heketi_volume_create``.
            raise_on_cleanup_error: forwarded as ``raise_on_error`` to the
                ``heketi_volume_delete`` cleanup.
            timeout: value handed to ``Waiter`` as its timeout while
                polling for the volume.
            wait_step: value handed to ``Waiter`` as its interval.
            **kwargs: forwarded to ``heketi_volume_create``; ``json`` is
                also used for the follow-up ``heketi_volume_info`` call.
        Returns:
            The volume info from ``heketi_volume_create``, or from
            ``heketi_volume_info`` when the volume was found by polling.
        Raises:
            Exception: the original creation error, if it is an allocation
                failure or the volume did not appear within ``timeout``.
        """
        # Named 'as_json' so the local does not shadow the 'json' module
        as_json = kwargs.get("json", False)

        try:
            h_volume_info = heketi_volume_create(self.heketi_client_node,
                                                 self.heketi_server_url,
                                                 size,
                                                 name=name,
                                                 **kwargs)
        except Exception as e:
            err_text = six.text_type(e)
            # Allocation failures are re-raised without polling
            if ('more required' in err_text
                    or 'Failed to allocate new volume' in err_text):
                raise

            # Poll the volume list for the requested name
            for w in Waiter(timeout, wait_step):
                h_volumes = heketi_volume_list(self.heketi_client_node,
                                               self.heketi_server_url)
                h_volume_match = re.search(HEKETI_VOLUME_REGEX % name,
                                           h_volumes)
                if h_volume_match:
                    h_volume_info = heketi_volume_info(self.heketi_client_node,
                                                       self.heketi_server_url,
                                                       h_volume_match.group(1),
                                                       json=as_json)
                    break

            if w.expired:
                # Report the timeout that was actually used
                g.log.info(
                    "Heketi volume with name %s not created in %s sec" % (
                        name, timeout))
                raise

        self.addCleanup(heketi_volume_delete,
                        self.heketi_client_node,
                        self.heketi_server_url,
                        h_volume_info["id"],
                        raise_on_error=raise_on_cleanup_error)

        return h_volume_info
# Example #10
# 0
    def wait_for_hostname(self, vm_name, timeout=600, interval=10):
        """Wait for hostname to get assigned to a VM.

        Args:
            vm_name (str): Name of the VM.
            timeout (int): value handed to ``Waiter`` as its timeout.
            interval (int): value handed to ``Waiter`` as its interval.
        Returns:
            str: hostname of the VM.
        Raises:
            CloudProviderError: if no VM with the given name exists or
                if the VM has no hostname when the waiter is exhausted.
        """
        for _ in Waiter(timeout, interval):
            vmlist = (
                self.vsphere_client.content.viewManager.CreateContainerView(
                    self.vsphere_client.content.rootFolder,
                    [vim.VirtualMachine], True))
            vm = next((v for v in vmlist.view if v.name == vm_name), None)
            if vm is None:
                # Raise the documented error type instead of an IndexError
                # from indexing an empty match list
                msg = 'VM %s not found' % vm_name
                g.log.error(msg)
                raise exceptions.CloudProviderError(msg)
            hostname = vm.summary.guest.hostName
            if hostname:
                return hostname
        msg = 'VM %s did not got assigned hostname' % vm_name
        g.log.error(msg)
        raise exceptions.CloudProviderError(msg)
    def test_heketi_server_operations_cleanup_on_idle_setup(self):
        """Run heketi db clean up on an idle setup.

        The test first waits for the server operations list to become
        empty and is skipped if operations are still listed when the
        waiter is exhausted. It then runs the cleanup command and asserts
        that both its output and the operations list are empty.
        """
        h_node = self.heketi_client_node
        h_url = self.heketi_server_url
        err_msg = "There should not be any pending operations list {}"

        # Give already pending operations a chance to drain
        for waiter_add in Waiter(300, 20):
            initial_ops = heketi_ops.heketi_server_operations_list(
                h_node, h_url)
            if not initial_ops:
                break
        if waiter_add.expired and initial_ops:
            self.skipTest(err_msg.format(initial_ops))

        # A falsy result of the cleanup command is treated as success
        cleanup = heketi_ops.heketi_server_operation_cleanup(h_node, h_url)
        failure_msg = "Cleanup command failed with message {}".format(cleanup)
        self.assertFalse(cleanup, failure_msg)

        # Nothing may be pending after the cleanup
        final_ops = heketi_ops.heketi_server_operations_list(h_node, h_url)
        self.assertFalse(final_ops, err_msg.format(final_ops))
    def test_dynamic_provisioning_glusterfile_gluster_pod_or_node_failure(
            self):
        """Check that app pod IO survives a gluster pod or node failure.

        A PVC is provisioned and mounted into an app pod and background IO
        is started against the mount. On containerized Gluster one of the
        gluster pods reported for the PVC is deleted and its replacement
        is awaited; otherwise the gluster node hosting the PVC is
        rebooted. The background IO must finish with exit code 0.
        """
        mount_path = "/mnt"
        datafile_path = '%s/fake_file_for_%s' % (mount_path, self.id())

        # Create secret and storage class
        self.create_storage_class()

        # Create PVC
        pvc_name = self.create_and_wait_for_pvc()

        # Create app POD with attached volume
        pod_name = oc_create_tiny_pod_with_volume(
            self.node,
            pvc_name,
            "test-pvc-mount-on-app-pod",
            mount_path=mount_path,
            image=self.io_container_image_cirros)
        # Cleanups run in reverse order: delete the pod, then wait for it
        # to be gone
        self.addCleanup(wait_for_resource_absence, self.node, 'pod', pod_name)
        self.addCleanup(oc_delete, self.node, 'pod', pod_name)

        # Wait for app POD be up and running
        wait_for_pod_be_ready(self.node, pod_name, timeout=60, wait_step=2)

        # Run IO in background
        io_cmd = "oc rsh %s dd if=/dev/urandom of=%s bs=1000K count=900" % (
            pod_name, datafile_path)
        async_io = g.run_async(self.node, io_cmd, "root")

        # Check for containerized Gluster
        if self.is_containerized_gluster():
            # Pick up one of the hosts which stores PV brick (4+ nodes case)
            gluster_pod_data = get_gluster_pod_names_by_pvc_name(
                self.node, pvc_name)[0]

            # Delete glusterfs POD from chosen host and wait for
            # spawn of new one
            oc_delete(self.node, 'pod', gluster_pod_data["pod_name"])
            cmd = ("oc get pods -o wide | grep glusterfs | grep %s | "
                   "grep -v Terminating | awk '{print $1}'") % (
                       gluster_pod_data["pod_hostname"])
            for w in Waiter(600, 15):
                new_gluster_pod_name = self.cmd_run(cmd)
                if new_gluster_pod_name:
                    break
            if w.expired:
                error_msg = "exceeded timeout, new gluster pod not created"
                g.log.error(error_msg)
                raise AssertionError(error_msg)
            g.log.info("new gluster pod name is %s" % new_gluster_pod_name)
            wait_for_pod_be_ready(self.node, new_gluster_pod_name)
        else:
            pvc_hosting_node_ip = get_gluster_host_ips_by_pvc_name(
                self.node, pvc_name)[0]
            heketi_nodes = heketi_node_list(self.heketi_client_node,
                                            self.heketi_server_url)

            # Find the heketi node whose storage IP hosts the PVC. Every
            # inspected IP is remembered so that the failure message is
            # well-defined even when heketi reports no nodes at all.
            node_ip_for_reboot = None
            checked_heketi_node_ips = []
            for heketi_node in heketi_nodes:
                heketi_node_ip = heketi_node_info(
                    self.heketi_client_node,
                    self.heketi_server_url,
                    heketi_node,
                    json=True)["hostnames"]["storage"][0]
                checked_heketi_node_ips.append(heketi_node_ip)
                if heketi_node_ip == pvc_hosting_node_ip:
                    node_ip_for_reboot = heketi_node_ip
                    break

            if not node_ip_for_reboot:
                raise AssertionError(
                    "Gluster node IP %s not matched with any of heketi "
                    "nodes %s" %
                    (pvc_hosting_node_ip, checked_heketi_node_ips))

            node_reboot_by_command(node_ip_for_reboot)

        # Check that async IO was not interrupted
        ret, _, _ = async_io.async_communicate()
        self.assertEqual(ret, 0, "IO %s failed on %s" % (io_cmd, self.node))
    def test_targetcli_failure_during_block_pvc_creation(self):
        """Create PVCs while targetcli is being killed on one Gluster node.

        All heketi nodes beyond the first three are disabled, a background
        loop keeps killing 'targetcli' on the first heketi node (inside its
        Gluster pod when such a pod is found, directly on the node
        otherwise) and a batch of PVCs is created meanwhile. The test
        expects more PVCs to report 'ProvisioningFailed' than
        'ProvisioningSucceeded'. Afterwards the kill loop is terminated,
        the targetcli prefs file is restored from its backup if it ended
        up with zero size, and all PVCs are awaited to become bound.
        """
        h_node, h_server = self.heketi_client_node, self.heketi_server_url

        # Disable redundant nodes and leave just 3 nodes online
        h_node_id_list = heketi_node_list(h_node, h_server)
        self.assertGreater(len(h_node_id_list), 2)
        for node_id in h_node_id_list[3:]:
            heketi_node_disable(h_node, h_server, node_id)
            self.addCleanup(heketi_node_enable, h_node, h_server, node_id)

        # Gather info about the Gluster node we are going to use for killing
        # targetcli processes.
        chosen_g_node_id = h_node_id_list[0]
        chosen_g_node_info = heketi_node_info(h_node,
                                              h_server,
                                              chosen_g_node_id,
                                              json=True)
        chosen_g_node_ip = chosen_g_node_info['hostnames']['storage'][0]
        chosen_g_node_hostname = chosen_g_node_info['hostnames']['manage'][0]
        chosen_g_node_ip_and_hostname = set(
            (chosen_g_node_ip, chosen_g_node_hostname))

        # Look up Gluster pods; each row is (name, hostIP, podIP, nodeName)
        g_pods = oc_get_custom_resource(
            self.node,
            'pod', [
                ':.metadata.name', ':.status.hostIP', ':.status.podIP',
                ':.spec.nodeName'
            ],
            selector='glusterfs-node=pod')
        if g_pods and g_pods[0]:
            # Gluster pods exist: run the commands through 'oc exec' in the
            # pod whose IPs / node name match the chosen heketi node
            for g_pod in g_pods:
                if chosen_g_node_ip_and_hostname.intersection(set(g_pod[1:])):
                    host_to_run_cmds = self.node
                    g_pod_prefix, g_pod = 'oc exec %s -- ' % g_pod[0], g_pod[0]
                    break
            else:
                # The loop finished without 'break': no pod matched
                err_msg = (
                    'Failed to find Gluster pod filtering it by following IPs '
                    'and hostnames: %s\nFound following Gluster pods: %s') % (
                        chosen_g_node_ip_and_hostname, g_pods)
                g.log.error(err_msg)
                raise AssertionError(err_msg)
        else:
            # No Gluster pods found: run the commands directly on the node
            host_to_run_cmds, g_pod_prefix, g_pod = chosen_g_node_ip, '', ''

        # Schedule deletion of targetcli process
        file_for_bkp, pvc_number = "~/.targetcli/prefs.bin", 10
        # Back up the prefs file first; the backup is removed at cleanup
        self.cmd_run("%scp %s %s_backup" %
                     (g_pod_prefix, file_for_bkp, file_for_bkp),
                     hostname=host_to_run_cmds)
        self.addCleanup(self.cmd_run,
                        "%srm -f %s_backup" % (g_pod_prefix, file_for_bkp),
                        hostname=host_to_run_cmds)
        # Endless shell loop, started asynchronously and terminated in the
        # 'finally' section below
        kill_targetcli_services_cmd = (
            "while true; do "
            "  %spkill targetcli || echo 'failed to kill targetcli process'; "
            "done" % g_pod_prefix)
        loop_for_killing_targetcli_process = g.run_async(
            host_to_run_cmds, kill_targetcli_services_cmd, "root")
        try:
            # Create bunch of PVCs
            sc_name, pvc_names = self.create_storage_class(), []
            for i in range(pvc_number):
                pvc_names.append(oc_create_pvc(self.node, sc_name, pvc_size=1))
            self.addCleanup(wait_for_resources_absence, self.node, 'pvc',
                            pvc_names)
            self.addCleanup(oc_delete, self.node, 'pvc', ' '.join(pvc_names))

            # Check that we get expected number of provisioning errors
            timeout, wait_step, succeeded_pvcs, failed_pvcs = 120, 1, [], []
            _waiter, err_msg = Waiter(timeout=timeout, interval=wait_step), ""
            for pvc_name in pvc_names:
                # NOTE(review): presumably resets the attempt counter so
                # the same Waiter can be reused for every PVC - confirm
                _waiter._attempt = 0
                for w in _waiter:
                    events = get_events(self.node,
                                        pvc_name,
                                        obj_type="PersistentVolumeClaim")
                    # Classify the PVC by the first terminal event found
                    for event in events:
                        if event['reason'] == 'ProvisioningSucceeded':
                            succeeded_pvcs.append(pvc_name)
                            break
                        elif event['reason'] == 'ProvisioningFailed':
                            failed_pvcs.append(pvc_name)
                            break
                    else:
                        # No terminal event yet: keep polling this PVC
                        continue
                    # A terminal event was found: stop polling this PVC
                    break
                if w.expired:
                    err_msg = (
                        "Failed to get neither 'ProvisioningSucceeded' nor "
                        "'ProvisioningFailed' statuses for all the PVCs in "
                        "time. Timeout was %ss, interval was %ss." %
                        (timeout, wait_step))
                    g.log.error(err_msg)
                    raise AssertionError(err_msg)
            # More PVCs must have failed than succeeded
            self.assertGreater(len(failed_pvcs), len(succeeded_pvcs))
        finally:
            # Restore targetcli workability
            loop_for_killing_targetcli_process._proc.terminate()

            # Revert breakage back which can be caused by BZ-1769426
            check_bkp_file_size_cmd = ("%sls -lah %s | awk '{print $5}'" %
                                       (g_pod_prefix, file_for_bkp))
            bkp_file_size = self.cmd_run(check_bkp_file_size_cmd,
                                         hostname=host_to_run_cmds).strip()
            if bkp_file_size == "0":
                # The prefs file is empty: put the backup back in place
                self.cmd_run("%smv %s_backup %s" %
                             (g_pod_prefix, file_for_bkp, file_for_bkp),
                             hostname=host_to_run_cmds)
                breakage_err_msg = (
                    "File located at '%s' was corrupted (zero size) on the "
                    "%s. Looks like BZ-1769426 took effect. \n"
                    "Don't worry, it has been restored after test failure." %
                    (file_for_bkp, "'%s' Gluster pod" % g_pod
                     if g_pod else "'%s' Gluster node" % chosen_g_node_ip))
                g.log.error(breakage_err_msg)
                # NOTE(review): 'err_msg' is unbound here if the 'try' body
                # raised before the waiter was set up (e.g. during storage
                # class or PVC creation) - confirm and guard if needed
                if err_msg:
                    breakage_err_msg = "%s\n%s" % (err_msg, breakage_err_msg)
                raise AssertionError(breakage_err_msg)

        # Wait for all the PVCs to be in bound state
        wait_for_pvcs_be_bound(self.node, pvc_names, timeout=300, wait_step=5)