def _fetch_initial_metrics(self, vol_name_prefix=None,
                           volume_expansion=False):

    # Create PVC and wait for it to be in 'Bound' state
    sc_name = self.create_storage_class(
        vol_name_prefix=vol_name_prefix,
        allow_volume_expansion=volume_expansion)
    if vol_name_prefix:
        pvc_name = self.create_and_wait_for_pvc(
            pvc_name_prefix=vol_name_prefix, sc_name=sc_name)
    else:
        pvc_name = self.create_and_wait_for_pvc(sc_name=sc_name)

    # Create DC and attach with pvc
    self.dc_name, pod_name = self.create_dc_with_pvc(pvc_name)
    for w in waiter.Waiter(120, 10):
        initial_metrics = self._get_and_manipulate_metric_data(
            self.metrics, pvc_name)
        if bool(initial_metrics) and len(initial_metrics) == 6:
            break
    if w.expired:
        raise AssertionError("Unable to fetch metrics for the pvc")
    return pvc_name, pod_name, initial_metrics

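# A minimal sketch of the poll-until-success idiom used by the helpers in this
# listing, assuming only the waiter.Waiter iterable and its 'expired' attribute
# already used above; 'check_fn' is a hypothetical caller-supplied callable and
# is not part of the original code.
def _poll_until(check_fn, timeout=120, interval=10):
    for w in waiter.Waiter(timeout, interval):
        # Break out as soon as the caller-supplied condition holds
        if check_fn():
            return True
    if w.expired:
        raise AssertionError(
            "condition was not met within {} seconds".format(timeout))
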
def test_block_vol_offline_expand(self):
    """Test blockvol expansion while PVC is not in use"""
    node = self.ocp_master_node[0]

    pvc_name, dc_name, bvol_info = (
        self._block_vol_expand_common_offline_vs_online(False))

    # Create and wait for job to be completed
    jobname = oc_create_offline_block_volume_expand_job(node, pvc_name)
    self.addCleanup(oc_delete, node, 'job', jobname)
    for w in waiter.Waiter(300, 5):
        if is_job_complete(node, jobname):
            break
    if w.expired:
        raise AssertionError(
            "block expand job {} is not completed".format(jobname))

    # Verify expanded size
    scale_dc_pod_amount_and_wait(node, dc_name[0], pod_amount=1)
    pod_name = get_pod_name_from_dc(node, dc_name[0])
    ret, size, _ = oc_rsh(
        node, pod_name,
        'df -kh /mnt | sed "/Filesystem/d" | awk \'{print $2}\' '
        '| sed "s/G//"')
    self.assertFalse(ret, "Failed to get size from client side")
    self.assertEqual(
        int(float(size)), bvol_info["size"], "new size is not "
        "reflected at mount point after block volume expand")

def _perform_io_and_fetch_metrics(
        self, pod_name, pvc_name, filename, dirname,
        metric_data, operation):
    """Create 1000 files and dirs and validate against the old metrics"""
    openshift_ops.switch_oc_project(
        self._master, self.storage_project_name)
    if operation == "create":
        cmds = ("touch /mnt/{}{{1..1000}}".format(filename),
                "mkdir /mnt/{}{{1..1000}}".format(dirname))
    else:
        cmds = ("rm -rf /mnt/large_file",
                "rm -rf /mnt/{}{{1..1000}}".format(filename),
                "rm -rf /mnt/{}{{1..1000}}".format(dirname))
    for cmd in cmds:
        self.cmd_run("oc rsh {} {}".format(pod_name, cmd))

    # Fetch the new metrics and compare the inodes used and bytes used
    for w in waiter.Waiter(120, 10):
        after_io_metrics = self._get_and_manipulate_metric_data(
            self.metrics, pvc_name)
        if operation == "create":
            if (int(after_io_metrics['kubelet_volume_stats_inodes_used'])
                    > int(metric_data['kubelet_volume_stats_inodes_used'])
                    and int(
                        after_io_metrics['kubelet_volume_stats_used_bytes'])
                    > int(metric_data['kubelet_volume_stats_used_bytes'])):
                break
        else:
            if int(metric_data['kubelet_volume_stats_used_bytes']) > int(
                    after_io_metrics['kubelet_volume_stats_used_bytes']):
                break
    if w.expired:
        raise AssertionError(
            "After data is modified, metrics like bytes used and inodes "
            "used are not reflected in prometheus")

def _set_log_level(self, node, level, msg, exec_time):
    delete_log_level = r'sed -i "/\(^log_level.*=.*[0-9]\)/d" {}'
    set_log_level = r'sed -i "\$alog_level = {}" {}'
    check_log_msg = r'sed -n "/.*\({}\).*/{{p;}}" {} | tail -1'

    # Set log level
    openshift_ops.cmd_run_on_gluster_pod_or_node(
        self.node, set_log_level.format(level, TCMU_CONF),
        gluster_node=node)
    self.addCleanup(
        openshift_ops.cmd_run_on_gluster_pod_or_node,
        self.node, delete_log_level.format(TCMU_CONF), gluster_node=node)

    # Validate log level
    log_msg = "log level now is {}".format(msg)
    for w in waiter.Waiter(120, 3):
        out = openshift_ops.cmd_run_on_gluster_pod_or_node(
            self.node, check_log_msg.format(log_msg, TCMU_RUNNER_LOG),
            gluster_node=node)
        match = re.match(LOG_REGEX, out)
        if (match and exec_time < datetime.datetime.strptime(
                match.group(1), self.timeformat)):
            break
    if w.expired:
        raise exceptions.ExecutionError(
            "Log level '{}:{}' of tcmu did not get changed on node"
            " {}".format(level, msg, node))

    openshift_ops.cmd_run_on_gluster_pod_or_node(
        self.node, delete_log_level.format(TCMU_CONF), gluster_node=node)

def _wait_for_docker_service_status(self, pod_host_ip, status, state):
    for w in waiter.Waiter(30, 3):
        out = command.cmd_run(DOCKER_SERVICE.format("status"), pod_host_ip)
        for line in out.splitlines():
            status_match = re.search(SERVICE_STATUS_REGEX, line)
            if (status_match and status_match.group(1) == status
                    and status_match.group(2) == state):
                return True

def _check_for_pending_operations(self, h_node, h_url):
    # Check for pending operations
    for w in waiter.Waiter(timeout=120, interval=10):
        h_db_check = heketi_db_check(h_node, h_url)
        h_db_check_vol = h_db_check.get("blockvolumes")
        if h_db_check_vol.get("pending"):
            break
    if w.expired:
        raise exceptions.ExecutionError(
            "No pending operations found during blockvolumes creation "
            "{}".format(h_db_check_vol.get("pending")))

def _rebalance_completion(self, volume_name):
    """Rebalance start and completion after expansion."""
    ret, _, err = rebalance_ops.rebalance_start(
        'auto_get_gluster_endpoint', volume_name)
    self.assertFalse(
        ret, "Rebalance for {} volume not started with error {}".format(
            volume_name, err))

    for w in waiter.Waiter(240, 10):
        reb_status = rebalance_ops.get_rebalance_status(
            'auto_get_gluster_endpoint', volume_name)
        if reb_status["aggregate"]["statusStr"] == "completed":
            break
    if w.expired:
        raise AssertionError(
            "Failed to complete the rebalance in 240 seconds")

def test_prometheus_basic_validation(self):
    """Validate basic volume metrics using prometheus"""
    # Fetch the metrics and store initial_metrics as a dictionary
    pvc_name, pod_name, initial_metrics = self._fetch_initial_metrics(
        volume_expansion=False)

    # Create 1000 files and fetch the metrics to verify the data is updated
    self._perform_io_and_fetch_metrics(
        pod_name=pod_name, pvc_name=pvc_name,
        filename="filename1", dirname="dirname1",
        metric_data=initial_metrics, operation="create")

    # Write IO of half the size of the volume and validate from the
    # prometheus pod that the size change is reflected
    size_to_write = int(
        initial_metrics['kubelet_volume_stats_capacity_bytes']) // 2
    openshift_ops.switch_oc_project(
        self._master, self.storage_project_name)
    cmd = ("dd if=/dev/urandom of=/mnt/large_file bs={} count=1024".format(
        size_to_write // 1024))
    ret, _, err = openshift_ops.oc_rsh(self._master, pod_name, cmd)
    self.assertFalse(ret, 'Failed to write file due to err {}'.format(err))

    # Fetch the metrics and validate the data change is reflected
    for w in waiter.Waiter(120, 10):
        half_io_metrics = self._get_and_manipulate_metric_data(
            ['kubelet_volume_stats_used_bytes'], pvc_name)
        if bool(half_io_metrics) and (int(
                half_io_metrics['kubelet_volume_stats_used_bytes'])
                > size_to_write):
            break
    if w.expired:
        raise AssertionError(
            "After data is written on the pvc, metrics like inodes used "
            "and bytes used are not reflected in prometheus")

    # Delete the files from the volume and wait for the
    # updated details to be reflected in prometheus
    self._perform_io_and_fetch_metrics(
        pod_name=pod_name, pvc_name=pvc_name,
        filename="filename1", dirname="dirname1",
        metric_data=half_io_metrics, operation="delete")

def _power_off_node_and_wait_node_to_be_not_ready(self, hostname):
    # Bring down the glusterfs node
    vm_name = node_ops.find_vm_name_by_ip_or_hostname(hostname)
    self.addCleanup(
        self._wait_for_gluster_pod_after_node_reboot, hostname)
    self.addCleanup(node_ops.power_on_vm_by_name, vm_name)
    node_ops.power_off_vm_by_name(vm_name)

    # Wait for the glusterfs node to become NotReady
    custom = r'":.status.conditions[?(@.type==\"Ready\")]".status'
    for w in waiter.Waiter(300, 20):
        status = openshift_ops.oc_get_custom_resource(
            self.ocp_client, 'node', custom, hostname)
        if status[0] in ['False', 'Unknown']:
            break
    if w.expired:
        raise exceptions.ExecutionError(
            "Failed to bring down node {}".format(hostname))

def wait_to_heal_complete(timeout=300, wait_step=5):
    """Monitors heal for volumes on gluster"""
    gluster_vol_list = get_volume_list("auto_get_gluster_endpoint")
    if not gluster_vol_list:
        raise AssertionError("failed to get gluster volume list")

    _waiter = waiter.Waiter(timeout=timeout, interval=wait_step)
    for gluster_vol in gluster_vol_list:
        for w in _waiter:
            if is_heal_complete("auto_get_gluster_endpoint", gluster_vol):
                break

        if w.expired:
            err_msg = ("reached timeout waiting for all the gluster "
                       "volumes to reach the 'healed' state.")
            g.log.error(err_msg)
            raise AssertionError(err_msg)

def power_on_vm_by_name(name, timeout=600, interval=10):
    """Power on the virtual machine and wait for SSH to be ready within
    the given timeout.

    Args:
        name (str): name of the VM which needs to be powered on.
    Returns:
        None
    Raises:
        CloudProviderError: In case of any failures.
    """
    cloudProvider = _get_cloud_provider()
    g.log.info('powering on the VM "%s"' % name)
    cloudProvider.power_on_vm_by_name(name)
    g.log.info('Powered on the VM "%s" successfully' % name)

    # Wait for hostname to get assigned
    _waiter = waiter.Waiter(timeout, interval)
    err = ""
    for w in _waiter:
        try:
            hostname = cloudProvider.wait_for_hostname(name, 1, 1)
            # NOTE(vponomar): Reset attempts for waiter to avoid redundant
            # sleep equal to 'interval' on the next usage.
            _waiter._attempt = 0
            break
        except Exception as e:
            err = e
            g.log.info(e)
    if w.expired:
        raise exceptions.CloudProviderError(err)

    # Wait for the ssh connection to be ready
    for w in _waiter:
        try:
            wait_for_ssh_connection(hostname, 1, 1)
            break
        except Exception as e:
            g.log.info(e)
            err = e
    if w.expired:
        raise exceptions.CloudProviderError(err)

def wait_to_heal_complete(
        timeout=300, wait_step=5, g_node="auto_get_gluster_endpoint"):
    """Monitors heal for volumes on gluster"""
    gluster_vol_list = get_volume_list(g_node)
    if not gluster_vol_list:
        raise AssertionError("failed to get gluster volume list")

    _waiter = waiter.Waiter(timeout=timeout, interval=wait_step)
    for gluster_vol in gluster_vol_list:
        for w in _waiter:
            if is_heal_complete(g_node, gluster_vol):
                # NOTE(vponomar): Reset attempts for waiter to avoid
                # redundant sleep equal to 'interval' on the next usage.
                _waiter._attempt = 0
                break

        if w.expired:
            err_msg = ("reached timeout waiting for all the gluster "
                       "volumes to reach the 'healed' state.")
            g.log.error(err_msg)
            raise AssertionError(err_msg)

def wait_for_ssh_connection(hostname, timeout=600, interval=10):
    """Wait for the ssh connection to be ready within the given timeout.

    Args:
        hostname (str): hostname of a machine.
    Returns:
        None
    Raises:
        CloudProviderError: In case of any failures.
    """
    for w in waiter.Waiter(timeout, interval):
        try:
            # Run a random command to verify the ssh connection
            g.run(hostname, 'ls')
            return
        except (exceptions.ExecutionError, ExecutionError):
            g.log.info("Waiting for ssh connection on host '%s'" % hostname)

    msg = 'Not able to connect with the %s' % hostname
    g.log.error(msg)
    raise exceptions.CloudProviderError(msg)

def node_reboot_by_command(node, timeout=600, wait_step=10):
    """Reboot node and wait for it to start within the given timeout.

    Args:
        node (str) : Node which needs to be rebooted.
        timeout (int) : Seconds to wait for the node to start.
        wait_step (int): Interval in seconds to wait before checking
                         status of node again.
    """
    cmd = "sleep 3; /sbin/shutdown -r now 'Reboot triggered by Glusto'"
    ret, out, err = g.run(node, cmd)
    if ret != 255:
        err_msg = "failed to reboot host '%s' error %s" % (node, err)
        g.log.error(err_msg)
        raise AssertionError(err_msg)

    try:
        g.ssh_close_connection(node)
    except Exception as e:
        g.log.error("failed to close connection with host %s "
                    "with error: %s" % (node, e))
        raise

    # Added sleep as the node will restart after 3 seconds
    time.sleep(3)

    for w in waiter.Waiter(timeout=timeout, interval=wait_step):
        try:
            if g.rpyc_get_connection(node, user="******"):
                g.rpyc_close_connection(node, user="******")
                return
        except Exception as err:
            g.log.info("exception while getting connection: '%s'" % err)

    if w.expired:
        error_msg = ("exceeded timeout %s sec, node '%s' is "
                     "not reachable" % (timeout, node))
        g.log.error(error_msg)
        raise exceptions.ExecutionError(error_msg)

def test_run_workload_with_logging(self):
    """Validate logs are being generated after running workload"""

    # Get the size of used space of logs
    es_pod = openshift_ops.get_pod_name_from_dc(
        self._master, self._logging_es_dc)
    mount_point = "/elasticsearch/persistent"
    cmd_space_check = ('df -kh --output=used {} | sed "/Used/d" |'
                       'sed "s/G//"'.format(mount_point))
    ret, initial_used_percent, err = openshift_ops.oc_rsh(
        self._master, es_pod, cmd_space_check)
    err_msg = "Failed to fetch the size of used space, error {}"
    self.assertFalse(ret, err_msg.format(err))

    # Create 20 pvcs and app pods with io
    openshift_ops.switch_oc_project(
        self._master, self.storage_project_name)
    pvc_count, batch_count = 5, 4
    for _ in range(batch_count):
        pvcs = self.create_and_wait_for_pvcs(pvc_amount=pvc_count)
        self.create_dcs_with_pvc(pvcs)
    self.addCleanup(
        openshift_ops.switch_oc_project, self._master,
        self.storage_project_name)

    # Get and verify the final used space of logs
    openshift_ops.switch_oc_project(
        self._master, self._logging_project_name)
    for w in waiter.Waiter(600, 30):
        ret, final_used_percent, err = openshift_ops.oc_rsh(
            self._master, es_pod, cmd_space_check)
        self.assertFalse(ret, err_msg.format(err))
        if int(initial_used_percent) < int(final_used_percent):
            break
    if w.expired:
        raise AssertionError(
            "Initial used space {} for logs is not less than final "
            "used space {}".format(
                initial_used_percent, final_used_percent))

def wait_to_heal_complete(vol_name=None, g_node="auto_get_gluster_endpoint",
                          timeout=300, wait_step=5):
    """Monitors heal for volumes on gluster

    Args:
        vol_name (str): Name of the gluster volume; defaults to None,
            in which case heal is checked for all the volumes
        g_node (str): Name of the gluster node; defaults to
            auto_get_gluster_endpoint
        timeout (int): Time to wait for the heal check to complete;
            default is 300
        wait_step (int): Time to wait before triggering the heal check
            command for the next iteration
    Raises:
        AssertionError: In case heal is not complete
    """
    if not vol_name:
        gluster_vol_list = get_volume_list(g_node)
        if not gluster_vol_list:
            raise AssertionError("failed to get gluster volume list")
    else:
        gluster_vol_list = [vol_name]

    _waiter = waiter.Waiter(timeout=timeout, interval=wait_step)
    for gluster_vol in gluster_vol_list:
        for w in _waiter:
            if is_heal_complete(g_node, gluster_vol):
                # NOTE(vponomar): Reset attempts for waiter to avoid
                # redundant sleep equal to 'interval' on the next usage.
                _waiter._attempt = 0
                break

        if w.expired:
            err_msg = ("reached timeout waiting for all the gluster "
                       "volumes to reach the 'healed' state.")
            g.log.error(err_msg)
            raise AssertionError(err_msg)

def test_run_workload_with_metrics(self):
    """Validate metrics data is being generated after running workload"""

    # Get the size of used space of metrics data
    cassandra_pod = get_pod_name_from_rc(
        self.master, self.metrics_rc_hawkular_cassandra)
    mount_point = "/cassandra_data"
    cmd_space_check = ('df -k --output=used {} | sed "/Used/d" |'
                       'sed "s/G//"'.format(mount_point))
    ret, initial_used_percent, err = oc_rsh(
        self.master, cassandra_pod, cmd_space_check)
    err_msg = "Failed to fetch the size of used space, error {}"
    self.assertFalse(ret, err_msg.format(err))

    # Create 20 PVCs and app pods with IO
    switch_oc_project(self.master, self.storage_project_name)
    pvc_count, batch_count = 5, 4
    for _ in range(batch_count):
        pvcs = self.create_and_wait_for_pvcs(pvc_amount=pvc_count)
        self.create_dcs_with_pvc(pvcs)
    self.addCleanup(
        switch_oc_project, self.master, self.storage_project_name)

    # Get and verify the final used space of metrics data
    switch_oc_project(self.master, self.metrics_project_name)
    for w in waiter.Waiter(600, 30):
        ret, final_used_percent, err = oc_rsh(
            self.master, cassandra_pod, cmd_space_check)
        self.assertFalse(ret, err_msg.format(err))
        if int(initial_used_percent) < int(final_used_percent):
            break
    if w.expired:
        raise AssertionError(
            "Initial used space {} is not less than final "
            "used space {}".format(
                initial_used_percent, final_used_percent))

def test_prometheus_pv_resize(self):
    """Validate prometheus metrics with pv resize"""
    # Fetch the metrics and store initial_metrics as a dictionary
    pvc_name, pod_name, initial_metrics = self._fetch_initial_metrics(
        vol_name_prefix="for-pv-resize", volume_expansion=True)

    # Write data on the pvc and confirm it is reflected in prometheus
    self._perform_io_and_fetch_metrics(
        pod_name=pod_name, pvc_name=pvc_name,
        filename="filename1", dirname="dirname1",
        metric_data=initial_metrics, operation="create")

    # Resize the pvc to 2GiB
    openshift_ops.switch_oc_project(
        self._master, self.storage_project_name)
    pvc_size = 2
    openshift_ops.resize_pvc(self._master, pvc_name, pvc_size)
    openshift_ops.wait_for_events(
        self._master, obj_name=pvc_name,
        event_reason='VolumeResizeSuccessful')
    openshift_ops.verify_pvc_size(self._master, pvc_name, pvc_size)
    pv_name = openshift_ops.get_pv_name_from_pvc(self._master, pvc_name)
    openshift_ops.verify_pv_size(self._master, pv_name, pvc_size)

    heketi_volume_name = heketi_ops.heketi_volume_list_by_name_prefix(
        self.heketi_client_node, self.heketi_server_url,
        "for-pv-resize", json=True)[0][2]
    self.assertIsNotNone(
        heketi_volume_name, "Failed to fetch volume with prefix {}".
        format("for-pv-resize"))

    openshift_ops.oc_delete(self._master, 'pod', pod_name)
    openshift_ops.wait_for_resource_absence(self._master, 'pod', pod_name)
    pod_name = openshift_ops.get_pod_name_from_dc(
        self._master, self.dc_name)
    openshift_ops.wait_for_pod_be_ready(self._master, pod_name)

    # Check whether the metrics are updated or not
    for w in waiter.Waiter(120, 10):
        resize_metrics = self._get_and_manipulate_metric_data(
            self.metrics, pvc_name)
        if bool(resize_metrics) and int(resize_metrics[
                'kubelet_volume_stats_capacity_bytes']) > int(
                initial_metrics['kubelet_volume_stats_capacity_bytes']):
            break
    if w.expired:
        raise AssertionError("Failed to reflect PVC size after resizing")

    openshift_ops.switch_oc_project(
        self._master, self.storage_project_name)
    time.sleep(240)

    # Lookup, trigger rebalance and wait for its completion
    for _ in range(100):
        self.cmd_run("oc rsh {} ls /mnt/".format(pod_name))
    self._rebalance_completion(heketi_volume_name)

    # Write data on the resized pvc and compare with resize_metrics
    self._perform_io_and_fetch_metrics(
        pod_name=pod_name, pvc_name=pvc_name,
        filename="secondfilename", dirname="seconddirname",
        metric_data=resize_metrics, operation="create")

def restart_gluster_vol_brick_processes(ocp_client_node, file_vol,
                                        gluster_nodes):
    """Restarts brick process of a file volume.

    Args:
        ocp_client_node (str): Node to execute OCP commands on.
        file_vol (str): file volume name.
        gluster_nodes (str/list): One or several IPv4 addresses of Gluster
            nodes, where 'file_vol' brick processes must be recreated.
    """
    if not isinstance(gluster_nodes, (list, set, tuple)):
        gluster_nodes = [gluster_nodes]

    # Get Gluster vol brick PIDs
    gluster_volume_status = get_gluster_vol_status(file_vol)
    pids = []
    for gluster_node in gluster_nodes:
        pid = None
        for g_node, g_node_data in gluster_volume_status.items():
            if g_node != gluster_node:
                continue
            for process_name, process_data in g_node_data.items():
                if not process_name.startswith("/var"):
                    continue
                pid = process_data["pid"]
                # When a brick is down, the PID of the brick is returned
                # as -1, which is an unexpected situation. So, add an
                # appropriate assertion.
                assert pid != "-1", (
                    "Got unexpected PID (-1) for '%s' gluster vol on '%s' "
                    "node." % (file_vol, gluster_node))
        assert pid, ("Could not find 'pid' in Gluster vol data for '%s' "
                     "Gluster node. Data: %s" % (
                         gluster_node, gluster_volume_status))
        pids.append((gluster_node, pid))

    # Restart Gluster vol brick processes using found PIDs
    for gluster_node, pid in pids:
        cmd = "kill -9 %s" % pid
        cmd_run_on_gluster_pod_or_node(ocp_client_node, cmd, gluster_node)

    # Wait for Gluster vol brick processes to be recreated
    for gluster_node, pid in pids:
        killed_pid_cmd = (
            "ps -eaf | grep %s | grep -v grep | awk '{print $2}'" % pid)
        _waiter = waiter.Waiter(timeout=60, interval=2)
        for w in _waiter:
            result = cmd_run_on_gluster_pod_or_node(
                ocp_client_node, killed_pid_cmd, gluster_node)
            if result.strip() == pid:
                continue
            g.log.info("Brick process '%s' was killed successfully on '%s'"
                       % (pid, gluster_node))
            break
        if w.expired:
            error_msg = ("Process ID '%s' still exists on '%s' after "
                         "waiting for it 60 seconds to get killed." % (
                             pid, gluster_node))
            g.log.error(error_msg)
            raise exceptions.ExecutionError(error_msg)

    # Start volume after gluster vol brick processes recreation
    ret, out, err = volume_start(
        "auto_get_gluster_endpoint", file_vol, force=True)
    if ret != 0:
        err_msg = "Failed to start gluster volume %s on %s. error: %s" % (
            file_vol, gluster_node, err)
        g.log.error(err_msg)
        raise AssertionError(err_msg)

def test_heketi_server_stale_operations_during_heketi_pod_reboot(self):
    """
    Validate failed/stale entries in db and perform a cleanup of
    those entries
    """
    volume_id_list, async_obj, ocp_node = [], [], self.ocp_master_node[0]
    h_node, h_server = self.heketi_client_node, self.heketi_server_url
    for i in range(0, 8):
        volume_info = heketi_ops.heketi_volume_create(
            h_node, h_server, 1, json=True)
        volume_id_list.append(volume_info["id"])
        self.addCleanup(
            heketi_ops.heketi_volume_delete, h_node, h_server,
            volume_info["id"], raise_on_error=False)

    def run_async(cmd, hostname, raise_on_error=True):
        async_op = g.run_async(host=hostname, command=cmd)
        async_obj.append(async_op)
        return async_op

    # Temporarily replace g.run with g.async_run in heketi_volume_delete
    # to be able to run it in the background.
    for vol_id in volume_id_list:
        with mock.patch.object(command, 'cmd_run', side_effect=run_async):
            heketi_ops.heketi_volume_delete(h_node, h_server, vol_id)

    # Restart heketi pod and check pod is running
    heketi_pod_name = openshift_ops.get_pod_name_from_dc(
        ocp_node, self.heketi_dc_name)
    openshift_ops.oc_delete(
        ocp_node, 'pod', heketi_pod_name,
        collect_logs=self.heketi_logs_before_delete)
    self.addCleanup(self._heketi_pod_delete_cleanup, ocp_node)
    openshift_ops.wait_for_resource_absence(
        ocp_node, 'pod', heketi_pod_name)
    heketi_pod_name = openshift_ops.get_pod_name_from_dc(
        ocp_node, self.heketi_dc_name)
    openshift_ops.wait_for_pod_be_ready(ocp_node, heketi_pod_name)
    self.assertTrue(
        heketi_ops.hello_heketi(h_node, h_server),
        "Heketi server {} is not alive".format(h_server))

    # Wait for pending operations to be generated
    for w in waiter.Waiter(timeout=30, interval=3):
        h_db_check = heketi_ops.heketi_db_check(h_node, h_server)
        h_db_check_vol = h_db_check.get("volumes")
        h_db_check_bricks = h_db_check.get("bricks")
        if ((h_db_check_vol.get("pending"))
                and (h_db_check_bricks.get("pending"))):
            break
    if w.expired:
        raise exceptions.ExecutionError(
            "No pending operations found during volumes deletion "
            "volumes:{}, Bricks:{} ".format(
                h_db_check_vol.get("pending"),
                h_db_check_bricks.get("pending")))

    # Verify pending bricks are multiples of 3
    self.assertFalse(
        h_db_check_bricks.get("pending") % 3,
        "Expecting bricks pending count to be multiple of 3 but "
        "found {}".format(h_db_check_bricks.get("pending")))

    # Verify and wait for pending operations to complete
    for w in waiter.Waiter(timeout=120, interval=10):
        h_db_check = heketi_ops.heketi_db_check(h_node, h_server)
        h_db_check_vol = h_db_check.get("volumes")
        h_db_check_bricks = h_db_check.get("bricks")
        if ((not h_db_check_bricks.get("pending"))
                and (not h_db_check_vol.get("pending"))):
            break
    if w.expired:
        raise AssertionError("Failed to delete volumes after 120 secs")

def test_heketi_metrics_validation_after_node(self, condition):
    """Validate heketi metrics after adding and removing a node"""
    # Get additional node
    additional_host_info = g.config.get("additional_gluster_servers")
    if not additional_host_info:
        self.skipTest(
            "Skipping this test case as additional gluster server is "
            "not provided in config file")

    additional_host_info = list(additional_host_info.values())[0]
    storage_hostname = additional_host_info.get("manage")
    storage_ip = additional_host_info.get("storage")
    if not (storage_hostname and storage_ip):
        self.skipTest(
            "Config options 'additional_gluster_servers.manage' "
            "and 'additional_gluster_servers.storage' must be set.")

    h_client, h_server = self.heketi_client_node, self.heketi_server_url
    initial_node_count, final_node_count = 0, 0

    # Get initial node count from prometheus metrics
    metric_result = self._fetch_metric_from_promtheus_pod(
        metric='heketi_nodes_count')
    initial_node_count = reduce(
        lambda x, y: x + y,
        [result.get('value')[1] for result in metric_result])

    # Switch to storage project
    openshift_ops.switch_oc_project(
        self._master, self.storage_project_name)

    # Configure node before adding it
    self.configure_node_to_run_gluster(storage_hostname)

    # Get cluster list
    cluster_info = heketi_ops.heketi_cluster_list(
        h_client, h_server, json=True)

    # Add node to the cluster
    heketi_node_info = heketi_ops.heketi_node_add(
        h_client, h_server, len(self.gluster_servers),
        cluster_info.get('clusters')[0],
        storage_hostname, storage_ip, json=True)
    heketi_node_id = heketi_node_info.get("id")
    self.addCleanup(
        heketi_ops.heketi_node_delete, h_client, h_server,
        heketi_node_id, raise_on_error=False)
    self.addCleanup(
        heketi_ops.heketi_node_remove, h_client, h_server,
        heketi_node_id, raise_on_error=False)
    self.addCleanup(
        heketi_ops.heketi_node_disable, h_client, h_server,
        heketi_node_id, raise_on_error=False)
    self.addCleanup(
        openshift_ops.switch_oc_project,
        self._master, self.storage_project_name)

    if condition == 'delete':
        # Switch to openshift-monitoring project
        openshift_ops.switch_oc_project(
            self.ocp_master_node[0], self._prometheus_project_name)

        # Get updated node count from prometheus metrics
        for w in waiter.Waiter(timeout=60, interval=10):
            metric_result = self._fetch_metric_from_promtheus_pod(
                metric='heketi_nodes_count')
            node_count = reduce(
                lambda x, y: x + y,
                [result.get('value')[1] for result in metric_result])
            if node_count != initial_node_count:
                break
        if w.expired:
            raise exceptions.ExecutionError(
                "Failed to get updated node details from prometheus")

        # Remove node from cluster
        heketi_ops.heketi_node_disable(h_client, h_server, heketi_node_id)
        heketi_ops.heketi_node_remove(h_client, h_server, heketi_node_id)
        for device in heketi_node_info.get('devices'):
            heketi_ops.heketi_device_delete(
                h_client, h_server, device.get('id'))
        heketi_ops.heketi_node_delete(h_client, h_server, heketi_node_id)

    # Switch to openshift-monitoring project
    openshift_ops.switch_oc_project(
        self.ocp_master_node[0], self._prometheus_project_name)

    # Get final node count from prometheus metrics
    for w in waiter.Waiter(timeout=60, interval=10):
        metric_result = self._fetch_metric_from_promtheus_pod(
            metric='heketi_nodes_count')
        final_node_count = reduce(
            lambda x, y: x + y,
            [result.get('value')[1] for result in metric_result])

        if condition == 'delete':
            if final_node_count < node_count:
                break
        else:
            if final_node_count > initial_node_count:
                break
    if w.expired:
        raise exceptions.ExecutionError(
            "Failed to update node details in prometheus")

def test_verify_delete_heketi_volumes_pending_entries_in_db(
        self, vol_type):
    """Verify pending entries of blockvolumes/volumes and bricks in
    heketi db during blockvolume/volume delete operation.
    """
    # Create large volumes to observe the pending operations
    vol_count, volume_ids, async_obj = 10, [], []
    h_node, h_url = self.heketi_client_node, self.heketi_server_url

    # Verify file/block volumes pending operation before creation
    h_db_check_before = heketi_db_check(h_node, h_url)
    h_db_check_bricks_before = h_db_check_before.get("bricks")
    h_db_check_vol_before = (h_db_check_before.get(
        "{}volumes".format(vol_type)))

    # Get existing heketi volume list
    existing_volumes = heketi_volume_list(h_node, h_url, json=True)

    # Add cleanup function to clean stale volumes created during test
    self.addCleanup(
        self._cleanup_heketi_volumes, existing_volumes.get("volumes"))

    # Delete heketi pod to clean db operations
    if (h_db_check_bricks_before.get("pending")
            or h_db_check_vol_before.get("pending")):
        self._respin_heketi_pod()

    # Calculate heketi volume size
    free_space, nodenum = get_total_free_space(h_node, h_url)
    free_space_available = int(free_space / nodenum)
    if free_space_available > vol_count:
        h_volume_size = int(free_space_available / vol_count)
        if h_volume_size > 50:
            h_volume_size = 50
    else:
        h_volume_size, vol_count = 1, free_space_available

    # Create BHV in case blockvolume size is greater than default BHV size
    if vol_type:
        default_bhv_size = get_default_block_hosting_volume_size(
            h_node, self.heketi_dc_name)
        if default_bhv_size < h_volume_size:
            h_volume_name = "autotest-{}".format(utils.get_random_str())
            bhv_info = self.create_heketi_volume_with_name_and_wait(
                h_volume_name, free_space_available,
                raise_on_cleanup_error=False, block=True, json=True)
            free_space_available -= (
                int(bhv_info.get("blockinfo").get("reservedsize")) + 1)
            h_volume_size = int(free_space_available / vol_count)

    # Create file/block volumes
    for _ in range(vol_count):
        vol_id = eval("heketi_{}volume_create".format(vol_type))(
            h_node, h_url, h_volume_size, json=True).get("id")
        volume_ids.append(vol_id)
        self.addCleanup(
            eval("heketi_{}volume_delete".format(vol_type)),
            h_node, h_url, vol_id, raise_on_error=False)

    def run_async(cmd, hostname, raise_on_error=True):
        async_op = g.run_async(host=hostname, command=cmd)
        async_obj.append(async_op)
        return async_op

    bhv_list = []
    for vol_id in volume_ids:
        # Get BHV ids to delete in case of block volumes
        if vol_type:
            vol_info = (heketi_blockvolume_info(
                h_node, h_url, vol_id, json=True))
            if not vol_info.get("blockhostingvolume") in bhv_list:
                bhv_list.append(vol_info.get("blockhostingvolume"))

        # Temporarily replace g.run with g.async_run in
        # heketi_volume_delete and heketi_blockvolume_delete func to be
        # able to run it in the background.
        with mock.patch.object(command, 'cmd_run', side_effect=run_async):
            eval("heketi_{}volume_delete".format(vol_type))(
                h_node, h_url, vol_id)

    # Wait for pending operations to be generated
    for w in waiter.Waiter(timeout=30, interval=3):
        h_db_check = heketi_db_check(h_node, h_url)
        h_db_check_vol = h_db_check.get("{}volumes".format(vol_type))
        if h_db_check_vol.get("pending"):
            h_db_check_bricks = h_db_check.get("bricks")
            break
    if w.expired:
        raise exceptions.ExecutionError(
            "No pending operations found during {}volumes deletion "
            "{}".format(vol_type, h_db_check_vol.get("pending")))

    # Verify bricks pending operation during creation
    if not vol_type:
        self.assertTrue(
            h_db_check_bricks.get("pending"),
            "Expecting at least one bricks pending count")
        self.assertFalse(
            h_db_check_bricks.get("pending") % 3,
            "Expecting bricks pending count to be multiple of 3 but "
            "found {}".format(h_db_check_bricks.get("pending")))

    # Verify file/block volume pending operation during delete
    for w in waiter.Waiter(timeout=120, interval=10):
        h_db_check = heketi_db_check(h_node, h_url)
        h_db_check_vol = h_db_check.get("{}volumes".format(vol_type))
        h_db_check_bricks = h_db_check.get("bricks")
        if ((not h_db_check_bricks.get("pending"))
                and (not h_db_check_vol.get("pending"))):
            break
    if w.expired:
        raise AssertionError(
            "Failed to delete {}volumes after 120 secs".format(vol_type))

    # Check that all background processes got exited
    for obj in async_obj:
        ret, out, err = obj.async_communicate()
        self.assertFalse(
            ret, "Failed to delete {}volume due to error: {}".format(
                vol_type, err))

    # Delete BHV created during block volume creation
    if vol_type:
        for bhv_id in bhv_list:
            heketi_volume_delete(h_node, h_url, bhv_id)

    # Verify bricks and volume pending operations
    h_db_check_after = heketi_db_check(h_node, h_url)
    h_db_check_bricks_after = h_db_check_after.get("bricks")
    h_db_check_vol_after = (h_db_check_after.get(
        "{}volumes".format(vol_type)))
    act_brick_count = h_db_check_bricks_after.get("pending")
    act_vol_count = h_db_check_vol_after.get("pending")

    # Verify bricks pending operation after delete
    err_msg = "{} operations are pending for {} after {}volume deletion"
    if not vol_type:
        self.assertFalse(
            act_brick_count,
            err_msg.format(act_brick_count, "brick", vol_type))

    # Verify file/block volumes pending operation after delete
    self.assertFalse(
        act_vol_count, err_msg.format(act_vol_count, "volume", vol_type))

    act_brick_count = h_db_check_bricks_after.get("total")
    act_vol_count = h_db_check_vol_after.get("total")
    exp_brick_count = h_db_check_bricks_before.get("total")
    exp_vol_count = h_db_check_vol_before.get("total")
    err_msg = "Actual {} and expected {} {} counts are not matched"

    # Verify if initial and final file/block volumes are same
    self.assertEqual(
        act_vol_count, exp_vol_count,
        err_msg.format(act_vol_count, exp_vol_count, "volume"))

    # Verify if initial and final bricks are same
    self.assertEqual(
        act_brick_count, exp_brick_count,
        err_msg.format(act_brick_count, exp_brick_count, "brick"))

def test_verify_create_heketi_volumes_pending_entries_in_db(
        self, vol_type):
    """Verify pending entries of file/block volumes in db during
    volumes creation from heketi side
    """
    # Create large volumes to observe the pending operations
    vol_count, h_vol_creation_async_op = 3, []
    h_node, h_url = self.heketi_client_node, self.heketi_server_url

    # Verify file/block volumes pending operation before creation
    h_db_check_before = heketi_db_check(h_node, h_url)
    h_db_check_vol_before = (h_db_check_before.get(
        "{}volumes".format(vol_type)))

    # Delete heketi pod to clean db operations
    if (h_db_check_vol_before.get("pending")
            or h_db_check_before.get("bricks").get("pending")):
        self._respin_heketi_pod()

    # Calculate heketi volume size
    free_space, nodenum = get_total_free_space(h_node, h_url)
    free_space_available = int(free_space / nodenum)
    if free_space_available > vol_count:
        h_volume_size = int(free_space_available / vol_count)
        if h_volume_size > 30:
            h_volume_size = 30
    else:
        h_volume_size, vol_count = 1, free_space_available

    # Get existing heketi volume list
    existing_volumes = heketi_volume_list(h_node, h_url, json=True)

    # Add cleanup function to clean stale volumes created during test
    self.addCleanup(
        self._cleanup_heketi_volumes, existing_volumes.get("volumes"))

    # Create BHV in case blockvolume size is greater than default BHV size
    if vol_type:
        default_bhv_size = get_default_block_hosting_volume_size(
            h_node, self.heketi_dc_name)
        if default_bhv_size < h_volume_size:
            h_volume_name = "autotest-{}".format(utils.get_random_str())
            bhv_info = self.create_heketi_volume_with_name_and_wait(
                h_volume_name, free_space_available,
                raise_on_cleanup_error=False, block=True, json=True)
            free_space_available -= (
                int(bhv_info.get("blockinfo").get("reservedsize")) + 1)
            h_volume_size = int(free_space_available / vol_count)

    # Temporarily replace g.run with g.async_run in
    # heketi_blockvolume_create func to be able to run it in the
    # background. Also, avoid parsing the output as it won't be json at
    # that moment. Parse it after reading the async operation results.
    def run_async(cmd, hostname, raise_on_error=True):
        return g.run_async(host=hostname, command=cmd)

    for count in range(vol_count):
        with mock.patch.object(json, 'loads', side_effect=(lambda j: j)):
            with mock.patch.object(
                    command, 'cmd_run', side_effect=run_async):
                h_vol_creation_async_op.append(
                    eval("heketi_{}volume_create".format(vol_type))(
                        h_node, h_url, h_volume_size, json=True))

    # Check for pending operations
    for w in waiter.Waiter(timeout=120, interval=10):
        h_db_check = heketi_db_check(h_node, h_url)
        h_db_check_vol = h_db_check.get("{}volumes".format(vol_type))
        if h_db_check_vol.get("pending"):
            h_db_check_bricks = h_db_check.get("bricks")
            break
    if w.expired:
        raise exceptions.ExecutionError(
            "No pending operations found during {}volumes creation "
            "{}".format(vol_type, h_db_check_vol.get("pending")))

    # Verify bricks pending operation during creation
    if not vol_type:
        self.assertTrue(
            h_db_check_bricks.get("pending"),
            "Expecting at least one bricks pending count")
        self.assertFalse(
            h_db_check_bricks.get("pending") % 3,
            "Expecting bricks pending count to be multiple of 3 but "
            "found {}".format(h_db_check_bricks.get("pending")))

    # Wait for all counts of pending operations to be zero
    for w in waiter.Waiter(timeout=300, interval=10):
        h_db_check = heketi_db_check(h_node, h_url)
        h_db_check_vol = h_db_check.get("{}volumes".format(vol_type))
        if not h_db_check_vol.get("pending"):
            break
    if w.expired:
        raise exceptions.ExecutionError(
            "Expecting no pending operations after 300 sec but "
            "found {} operation".format(h_db_check_vol.get("pending")))

    # Get heketi server DB details
    h_db_check_after = heketi_db_check(h_node, h_url)
    h_db_check_vol_after = (h_db_check_after.get(
        "{}volumes".format(vol_type)))
    h_db_check_bricks_after = h_db_check_after.get("bricks")

    # Verify if initial and final file/block volumes are same
    act_vol_count = h_db_check_vol_after.get("total")
    exp_vol_count = h_db_check_vol_before.get("total") + vol_count
    err_msg = (
        "Actual {} and expected {} {}volume counts are not matched".format(
            act_vol_count, exp_vol_count, vol_type))
    self.assertEqual(act_vol_count, exp_vol_count, err_msg)

    # Verify if initial and final bricks are same for file volume
    volumes = heketi_volume_list(h_node, h_url, json=True).get("volumes")
    new_volumes = list(set(volumes) - set(existing_volumes))
    exp_brick_count = 0
    for volume in new_volumes:
        vol_info = heketi_volume_info(h_node, h_url, volume, json=True)
        exp_brick_count += len(vol_info.get("bricks"))

    err_msg = "Actual {} and expected {} bricks counts are not matched"
    act_brick_count = h_db_check_bricks_after.get("total")
    self.assertEqual(
        act_brick_count, exp_brick_count,
        err_msg.format(act_brick_count, exp_brick_count))

def test_verify_pending_entries_in_db(self):
    """Verify pending entries of volumes and bricks in db during
    volume creation from heketi side
    """
    h_volume_size = 100
    h_db_chk_bfr_v_creation = heketi_db_check(
        self.heketi_client_node, self.heketi_server_url)

    if (h_db_chk_bfr_v_creation["bricks"]["pending"] != 0
            or h_db_chk_bfr_v_creation["volumes"]["pending"] != 0):
        self.skipTest(
            "Skip TC due to unexpected bricks/volumes pending operations")

    # Verify bricks and volume pending operation before creation
    self.assertEqual(h_db_chk_bfr_v_creation["bricks"]["pending"], 0)
    self.assertEqual(h_db_chk_bfr_v_creation["volumes"]["pending"], 0)

    # Temporarily replace g.run with g.async_run in heketi_volume_create
    # func to be able to run it in the background. Also, avoid parsing
    # the output as it won't be json at that moment. Parse it after
    # reading the async operation results.
    def run_async(cmd, hostname, raise_on_error=True):
        return g.run_async(host=hostname, command=cmd)

    with mock.patch.object(json, 'loads', side_effect=(lambda j: j)):
        with mock.patch.object(command, 'cmd_run', side_effect=run_async):
            h_vol_creation_async_op = heketi_volume_create(
                self.heketi_client_node,
                self.heketi_server_url, h_volume_size, json=True)

    for w in waiter.Waiter(timeout=5, interval=1):
        h_db_chk_during_v_creation = heketi_db_check(
            self.heketi_client_node, self.heketi_server_url)
        if h_db_chk_during_v_creation["bricks"]["pending"] != 0:
            break
    if w.expired:
        err_msg = "No pending operation in Heketi db"
        g.log.error(err_msg)
        raise exceptions.ExecutionError(err_msg)

    retcode, stdout, stderr = h_vol_creation_async_op.async_communicate()
    heketi_vol = json.loads(stdout)
    volume_id = heketi_vol["id"]
    self.addCleanup(
        heketi_volume_delete, self.heketi_client_node,
        self.heketi_server_url, volume_id, raise_on_error=True)

    # Verify volume pending operation during creation
    self.assertFalse(h_db_chk_during_v_creation["bricks"]["pending"] % 3)
    self.assertEqual(
        h_db_chk_bfr_v_creation["volumes"]["pending"] + 1,
        h_db_chk_during_v_creation["volumes"]["pending"])

    h_db_chk_after_v_creation = heketi_db_check(
        self.heketi_client_node, self.heketi_server_url)

    # Verify bricks and volume pending operation after creation
    self.assertEqual(h_db_chk_after_v_creation["bricks"]["pending"], 0)
    self.assertEqual(h_db_chk_after_v_creation["volumes"]["pending"], 0)

def test_heketi_manual_cleanup_operation_in_bhv(self):
    """Validate that heketi db cleanup resolves the mismatch in the free
    size of the block hosting volume caused by failed block device
    create operations.
    """
    bhv_size_before, bhv_size_after, vol_count = [], [], 5
    ocp_node, g_node = self.ocp_master_node[0], self.gluster_servers[0]
    h_node, h_url = self.heketi_client_node, self.heketi_server_url

    # Get existing heketi volume list
    existing_volumes = heketi_volume_list(h_node, h_url, json=True)

    # Add function to clean stale volumes created during test
    self.addCleanup(
        self._cleanup_heketi_volumes, existing_volumes.get("volumes"))

    # Get nodes id list
    node_id_list = heketi_node_list(h_node, h_url)

    # Disable 4th and other nodes
    for node_id in node_id_list[3:]:
        heketi_node_disable(h_node, h_url, node_id)
        self.addCleanup(heketi_node_enable, h_node, h_url, node_id)

    # Calculate heketi volume size
    free_space, nodenum = get_total_free_space(h_node, h_url)
    free_space_available = int(free_space / nodenum)
    if free_space_available > vol_count:
        h_volume_size = int(free_space_available / vol_count)
        if h_volume_size > 50:
            h_volume_size = 50
    else:
        h_volume_size, vol_count = 1, free_space_available

    # Create BHV in case blockvolume size is greater than default BHV size
    default_bhv_size = get_default_block_hosting_volume_size(
        h_node, self.heketi_dc_name)
    if default_bhv_size < h_volume_size:
        h_volume_name = "autotest-{}".format(utils.get_random_str())
        bhv_info = self.create_heketi_volume_with_name_and_wait(
            h_volume_name, free_space_available,
            raise_on_cleanup_error=False, block=True, json=True)
        free_space_available -= (
            int(bhv_info.get("blockinfo").get("reservedsize")) + 1)
        h_volume_size = int(free_space_available / vol_count)

    # Get BHV list
    h_bhv_list = get_block_hosting_volume_list(h_node, h_url).keys()
    self.assertTrue(h_bhv_list, "Failed to get the BHV list")

    # Get BHV size
    for bhv in h_bhv_list:
        vol_info = heketi_volume_info(h_node, h_url, bhv, json=True)
        bhv_vol_size_before = vol_info.get("freesize")
        bhv_size_before.append(bhv_vol_size_before)

    # Kill tcmu-runner service
    services = ("tcmu-runner", "gluster-block-target", "gluster-blockd")
    kill_service_on_gluster_pod_or_node(ocp_node, "tcmu-runner", g_node)

    # Add cleanups to restart the services
    for service in services:
        state = ('exited' if service == 'gluster-block-target'
                 else 'running')
        self.addCleanup(
            wait_for_service_status_on_gluster_pod_or_node,
            ocp_node, service, 'active', state, g_node)
        self.addCleanup(
            restart_service_on_gluster_pod_or_node,
            ocp_node, service, g_node)

    def run_async(cmd, hostname, raise_on_error=True):
        return g.run_async(host=hostname, command=cmd)

    # Create stale block volumes in async
    for count in range(vol_count):
        with mock.patch.object(json, 'loads', side_effect=(lambda j: j)):
            with mock.patch.object(
                    command, 'cmd_run', side_effect=run_async):
                heketi_blockvolume_create(
                    h_node, h_url, h_volume_size, json=True)

    # Wait for pending operations to get generated
    self._check_for_pending_operations(h_node, h_url)

    # Restart the services
    for service in services:
        state = ('exited' if service == 'gluster-block-target'
                 else 'running')
        restart_service_on_gluster_pod_or_node(ocp_node, service, g_node)
        wait_for_service_status_on_gluster_pod_or_node(
            ocp_node, service, 'active', state, g_node)

    # Cleanup pending operations
    heketi_server_operation_cleanup(h_node, h_url)

    # Wait for pending operations to get cleaned up
    for w in waiter.Waiter(timeout=120, interval=10):
        # Get BHV size
        for bhv in h_bhv_list:
            vol_info = heketi_volume_info(h_node, h_url, bhv, json=True)
            bhv_vol_size_after = vol_info.get("freesize")
            bhv_size_after.append(bhv_vol_size_after)

        if (set(bhv_size_before) == set(bhv_size_after)):
            break
    if w.expired:
        raise exceptions.ExecutionError(
            "Failed to validate volume size Actual:{},"
            " Expected:{}".format(
                set(bhv_size_before), set(bhv_size_after)))

def test_pv_resize_when_heketi_down(self):
    """Create a PVC and try to expand it when heketi is down; it should
    fail. After heketi is up, expanding the PVC should work.
    """
    self.create_storage_class(allow_volume_expansion=True)
    pvc_name = self.create_and_wait_for_pvc()
    dc_name, pod_name = self.create_dc_with_pvc(pvc_name)

    pv_name = get_pv_name_from_pvc(self.node, pvc_name)
    custom = (r':metadata.annotations.'
              r'"gluster\.kubernetes\.io\/heketi-volume-id"')
    vol_id = oc_get_custom_resource(self.node, 'pv', custom, pv_name)[0]

    h_vol_info = heketi_ops.heketi_volume_info(
        self.heketi_client_node, self.heketi_server_url, vol_id, json=True)

    # Bring the heketi POD down
    scale_dc_pod_amount_and_wait(
        self.node, self.heketi_dc_name, pod_amount=0)
    self.addCleanup(
        scale_dc_pod_amount_and_wait, self.node,
        self.heketi_dc_name, pod_amount=1)

    cmd = 'dd if=/dev/urandom of=/mnt/%s bs=614400k count=1'
    ret, out, err = oc_rsh(self.node, pod_name, cmd % 'file1')
    self.assertFalse(ret, 'Not able to write file with err: %s' % err)
    wait_for_pod_be_ready(self.node, pod_name, 10, 5)

    resize_pvc(self.node, pvc_name, 2)
    wait_for_events(
        self.node, pvc_name, obj_type='PersistentVolumeClaim',
        event_type='Warning', event_reason='VolumeResizeFailed')

    # Verify volume was not expanded
    vol_info = get_gluster_vol_info_by_pvc_name(self.node, pvc_name)
    self.assertEqual(vol_info['gluster_vol_id'], h_vol_info['name'])
    self.assertEqual(
        len(vol_info['bricks']['brick']), len(h_vol_info['bricks']))

    # Bring the heketi POD up
    scale_dc_pod_amount_and_wait(
        self.node, self.heketi_dc_name, pod_amount=1)

    # Verify volume expansion
    verify_pvc_size(self.node, pvc_name, 2)
    vol_info = get_gluster_vol_info_by_pvc_name(self.node, pvc_name)
    self.assertFalse(len(vol_info['bricks']['brick']) % 3)
    self.assertLess(
        len(h_vol_info['bricks']), len(vol_info['bricks']['brick']))

    # Wait for remount after expansion
    for w in waiter.Waiter(timeout=30, interval=5):
        ret, out, err = oc_rsh(
            self.node, pod_name,
            "df -Ph /mnt | awk '{print $2}' | tail -1")
        self.assertFalse(
            ret, 'Failed with err: %s and Output: %s' % (err, out))
        if out.strip() == '2.0G':
            break

    # Write data making sure we have more space than before
    ret, out, err = oc_rsh(self.node, pod_name, cmd % 'file2')
    self.assertFalse(ret, 'Not able to write file with err: %s' % err)

    # Verify pod is running
    wait_for_pod_be_ready(self.node, pod_name, 10, 5)

def test_heketi_server_db_pending_entries_for_volume_operations(
        self, vol_type):
    """Verify pending entries of blockvolumes/volumes and bricks in db
    during heketi blockvolume/volume delete operation.
    """
    # Create large volumes to observe the pending operations
    h_volume_size, volume_ids, async_obj = 95, [], []
    h_node, h_url = self.heketi_client_node, self.heketi_server_url

    h_db_check_before = heketi_db_check(h_node, h_url)
    h_db_check_bricks_before = h_db_check_before["bricks"]
    h_db_check_vol_before = h_db_check_before["{}volumes".format(vol_type)]

    # Check file/block volume pending operations before creation.
    if h_db_check_vol_before["pending"]:
        self.skipTest(
            "Skip TC due to unexpected {}volumes pending operations".
            format(vol_type))

    # Check bricks pending operations before creation.
    if h_db_check_bricks_before["pending"]:
        self.skipTest(
            "Skip TC due to unexpected bricks pending operations for"
            " {}volume".format(vol_type))

    # Create 5 file/block volumes to find out pending operations
    for count in range(5):
        vol_info = eval("heketi_{}volume_create".format(vol_type))(
            h_node, h_url, h_volume_size, json=True)
        volume_ids.append(vol_info["id"])
        self.addCleanup(
            eval("heketi_{}volume_delete".format(vol_type)),
            h_node, h_url, vol_info["id"], raise_on_error=False)

    h_db_check_after = heketi_db_check(h_node, h_url)
    h_db_check_bricks_after = h_db_check_after["bricks"]
    h_db_check_vol_after = h_db_check_after["{}volumes".format(vol_type)]

    # Verify file/block volumes pending operation after creation
    err_msg = ("Expecting heketi db {}volume pending operation to be "
               "0 but found {}")
    self.assertFalse(
        h_db_check_vol_after["pending"],
        err_msg.format(vol_type, h_db_check_vol_after["pending"]))

    # Verify bricks pending operation after volume creation
    err_msg = ("Expecting heketi db bricks pending operation to be "
               "0 but found {} after {}volume creation")
    self.assertFalse(
        h_db_check_bricks_after["pending"],
        err_msg.format(h_db_check_bricks_after["pending"], vol_type))

    def run_async(cmd, hostname, raise_on_error=True):
        async_op = g.run_async(host=hostname, command=cmd)
        async_obj.append(async_op)
        return async_op

    for vol_id in volume_ids:
        # Temporarily replace g.run with g.async_run in
        # heketi_volume_delete and heketi_blockvolume_delete func to be
        # able to run it in the background.
        with mock.patch.object(command, 'cmd_run', side_effect=run_async):
            eval("heketi_{}volume_delete".format(vol_type))(
                h_node, h_url, vol_id)

    for w in waiter.Waiter(timeout=10, interval=1):
        h_db_check = heketi_db_check(h_node, h_url)
        h_db_check_bricks = h_db_check["bricks"]
        h_db_check_vol = h_db_check["{}volumes".format(vol_type)]
        if h_db_check_vol["pending"] != 0:
            break
    if w.expired:
        err_msg = ("Expected pending operations but found {} operations"
                   " for {}volume in Heketi db")
        g.log.error(err_msg.format(h_db_check_vol["pending"], vol_type))
        raise exceptions.ExecutionError(
            err_msg.format(h_db_check_vol["pending"], vol_type))

    # Verify pending operation during file/block volumes delete
    err_msg = ("Expecting pending operations for {}volume during"
               " deletion")
    self.assertTrue(h_db_check_vol["pending"], err_msg.format(vol_type))

    # Verify brick pending operation during delete
    err_msg = "Expecting bricks pending in multiple of 3 but found {}"
    if vol_type == '':
        self.assertFalse(
            h_db_check_bricks["pending"] % 3,
            err_msg.format(h_db_check_bricks["pending"]))

    # Verify volume/blockvolume pending operation during delete
    for w in waiter.Waiter(timeout=100, interval=5):
        h_db_check_vol_after = heketi_db_check(h_node, h_url)
        h_db_check_bricks_after = h_db_check_vol_after["bricks"]
        h_db_check_vol_after = h_db_check_vol_after["{}volumes".format(
            vol_type)]

        # Verify if file/block volumes and bricks are properly deleted
        if (((not vol_type) and (not h_db_check_bricks_after["pending"]))
                or (not h_db_check_vol_after["pending"])):
            break
    if w.expired:
        err_msg = "Failed to delete {}volumes after waiting for 100 secs"
        raise AssertionError(err_msg.format(vol_type))

    # Check that all background processes got exited
    for obj in async_obj:
        ret, out, err = obj.async_communicate()
        self.assertFalse(ret, err)

    # Verify bricks pending operation after delete
    if vol_type == "":
        err_msg = ("Expecting 0 bricks pending operations after deletion"
                   " but found {} after {}volume deletion")
        self.assertFalse(
            h_db_check_bricks_after["pending"],
            err_msg.format(h_db_check_bricks_after["pending"], vol_type))

    # Verify volumes/blockvolumes pending operation after delete
    err_msg = ("Expecting 0 {}volume pending operations after deletion"
               " but found {}")
    self.assertFalse(
        h_db_check_vol_after["pending"],
        err_msg.format(vol_type, h_db_check_vol_after["pending"]))

    # Verify if initial and final volumes/blockvolumes are same
    err_msg = (
        "Total volume before {} and after {} creation not matched".format(
            h_db_check_vol_after["total"], h_db_check_vol_before["total"]))
    self.assertEqual(
        h_db_check_vol_after["total"],
        h_db_check_vol_before["total"], err_msg)

    # Verify if initial and final bricks are same
    err_msg = (
        "Total bricks before {} and after {} creation not matched".format(
            h_db_check_bricks_after["total"],
            h_db_check_bricks_before["total"]))
    self.assertEqual(
        h_db_check_bricks_after["total"],
        h_db_check_bricks_before["total"], err_msg)

def test_heketi_prometheus_usedbytes_brickcount_on_device_delete(
        self, operation):
    """Validate used bytes and device count on heketi and prometheus"""
    h_node, h_server = self.heketi_client_node, self.heketi_server_url

    # Get list of additional devices for one of the Gluster nodes
    gluster_server_0 = list(self.gluster_servers_info.values())[0]
    manage_hostname = gluster_server_0.get("manage")
    self.assertTrue(
        manage_hostname, "IP Address is not specified for "
        "node {}".format(gluster_server_0))
    device_name = gluster_server_0.get("additional_devices")[0]
    self.assertTrue(
        device_name, "Additional devices are not specified for "
        "node {}".format(gluster_server_0))

    # Get node ID of the Gluster hostname
    node_list = heketi_ops.heketi_topology_info(
        h_node, h_server, json=True).get("clusters")[0].get("nodes")
    self.assertTrue(
        node_list, "Cluster info command returned empty list of nodes")
    node_id = [
        node.get("id") for node in node_list
        if manage_hostname == node.get("hostnames").get("manage")[0]]
    self.assertTrue(
        node_id, "Failed to get node_id for {}".format(manage_hostname))
    node_id = node_id[0]

    # Add heketi device
    heketi_ops.heketi_device_add(h_node, h_server, device_name, node_id)
    node_info_after_addition = heketi_ops.heketi_node_info(
        h_node, h_server, node_id, json=True)
    device_id, bricks = None, None
    for device in node_info_after_addition.get("devices"):
        if device.get("name") == device_name:
            device_id, bricks = (
                device.get("id"), len(device.get("bricks")))
            break

    # Verify zero bricks on the device
    msg = (
        "Number of bricks on the device {} of the nodes should be "
        "zero".format(device_name))
    self.assertFalse(bricks, msg)
    self.addCleanup(
        heketi_ops.heketi_device_delete, h_node, h_server, device_id,
        raise_on_error=False)
    self.addCleanup(
        heketi_ops.heketi_device_remove, h_node, h_server, device_id,
        raise_on_error=False)
    self.addCleanup(
        heketi_ops.heketi_device_disable, h_node, h_server, device_id,
        raise_on_error=False)

    # Disable, remove and delete heketi device
    heketi_ops.heketi_device_disable(h_node, h_server, device_id)
    heketi_ops.heketi_device_remove(h_node, h_server, device_id)
    heketi_ops.heketi_device_delete(h_node, h_server, device_id)

    # Verify device deletion
    node_info_after_deletion = (
        heketi_ops.heketi_node_info(h_node, h_server, node_id))
    msg = ("Device {} should not be shown in node info of the node {} "
           "after the device deletion".format(device_id, node_id))
    self.assertNotIn(device_id, node_info_after_deletion, msg)

    if operation == "usedbytes":
        # Validate heketi and prometheus device used bytes
        for w in waiter.Waiter(timeout=60, interval=10):
            device_used_bytes_prometheus = 0
            device_used_bytes_metrics = 0
            openshift_ops.switch_oc_project(
                self.ocp_master_node[0], 'openshift-monitoring')
            metric_result = self._fetch_metric_from_promtheus_pod(
                metric='heketi_device_used_bytes')
            for result in metric_result:
                if (node_id == result.get('cluster')
                        and device_name == result.get('device')):
                    device_used_bytes_prometheus += (
                        int(result.get('value')[1]))

            openshift_ops.switch_oc_project(
                self.ocp_master_node[0], 'glusterfs')
            metrics = heketi_ops.get_heketi_metrics(h_node, h_server)
            heketi_device_count_metric = (
                metrics.get('heketi_device_used_bytes'))
            for result in heketi_device_count_metric:
                if (node_id == result.get('cluster')
                        and device_name == result.get('device')):
                    device_used_bytes_metrics = int(result.get('value'))

            if device_used_bytes_prometheus == device_used_bytes_metrics:
                break
        if w.expired:
            raise exceptions.ExecutionError(
                "Failed to update device details in prometheus")

    elif operation == "brickcount":
        # Validate heketi and prometheus device brick count
        for w in waiter.Waiter(timeout=60, interval=10):
            device_brick_count_prometheus = 0
            device_brick_count_metrics = 0
            metrics = heketi_ops.get_heketi_metrics(h_node, h_server)
            heketi_device_count_metric = metrics.get(
                'heketi_device_brick_count')
            for result in heketi_device_count_metric:
                device_brick_count_metrics += int(result.get('value'))

            openshift_ops.switch_oc_project(
                self.ocp_master_node[0], 'openshift-monitoring')
            metric_result = self._fetch_metric_from_promtheus_pod(
                metric='heketi_device_brick_count')
            for result in metric_result:
                device_brick_count_prometheus += (
                    int(result.get('value')[1]))

            if device_brick_count_prometheus == device_brick_count_metrics:
                break
        if w.expired:
            raise exceptions.ExecutionError(
                "Failed to update device details in prometheus")

def enable_pvc_resize(master_node):
    '''This function edits the /etc/origin/master/master-config.yaml file
    to enable the pv_resize feature and restarts the atomic-openshift
    service on the master node.

    Args:
        master_node (str): hostname of the master node on which the
            master-config.yaml file should be edited
    Returns:
        bool: True if successful, otherwise raises an exception
    '''
    version = get_openshift_version()
    if version < "3.9":
        msg = ("pv resize is not available in openshift "
               "version %s " % version)
        g.log.error(msg)
        raise NotSupportedException(msg)

    with tempfile.NamedTemporaryFile(delete=False) as temp:
        temp_filename = temp.name

    try:
        g.download(master_node, MASTER_CONFIG_FILEPATH, temp_filename)
    except Exception as e:
        err_msg = (
            "Failed to download '{}' from master node '{}' due to "
            "exception\n{}".format(
                MASTER_CONFIG_FILEPATH, master_node, six.text_type(e)))
        raise ExecutionError(err_msg)

    with open(temp_filename, 'r') as f:
        data = yaml.load(f, Loader=yaml.FullLoader)
        dict_add = data['admissionConfig']['pluginConfig']
        if "PersistentVolumeClaimResize" in dict_add:
            g.log.info("master-config.yaml file is already edited")
            return True
        dict_add['PersistentVolumeClaimResize'] = {
            'configuration': {
                'apiVersion': 'v1',
                'disable': 'false',
                'kind': 'DefaultAdmissionConfig'}}
        data['admissionConfig']['pluginConfig'] = dict_add
        kube_config = data['kubernetesMasterConfig']
        for key in ('apiServerArguments', 'controllerArguments'):
            kube_config[key] = (
                kube_config.get(key)
                if isinstance(kube_config.get(key), dict) else {})
            value = ['ExpandPersistentVolumes=true']
            kube_config[key]['feature-gates'] = value

    with open(temp_filename, 'w+') as f:
        yaml.dump(data, f, default_flow_style=False)

    try:
        g.upload(master_node, temp_filename, MASTER_CONFIG_FILEPATH)
    except Exception as e:
        err_msg = (
            "Failed to upload '{}' to master node '{}' due to "
            "exception\n{}".format(
                master_node, MASTER_CONFIG_FILEPATH, six.text_type(e)))
        raise ExecutionError(err_msg)
    os.unlink(temp_filename)

    if version == "3.9":
        cmd = ("systemctl restart atomic-openshift-master-api "
               "atomic-openshift-master-controllers")
    else:
        cmd = ("/usr/local/bin/master-restart api && "
               "/usr/local/bin/master-restart controllers")
    ret, out, err = g.run(master_node, cmd, "root")
    if ret != 0:
        err_msg = "Failed to execute cmd %s on %s\nout: %s\nerr: %s" % (
            cmd, master_node, out, err)
        g.log.error(err_msg)
        raise ExecutionError(err_msg)

    # Wait for the API service to be ready after the restart
    for w in waiter.Waiter(timeout=120, interval=1):
        try:
            cmd_run("oc get nodes", master_node)
            return True
        except AssertionError:
            continue
    err_msg = "Exceeded 120s timeout waiting for OCP API to start responding."
    g.log.error(err_msg)
    raise ExecutionError(err_msg)

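# For reference, a sketch of the admissionConfig stanza that enable_pvc_resize
# above is expected to produce in master-config.yaml. It is derived only from
# the dicts written by the function; the exact layout of the rest of the real
# file may differ:
#
#   admissionConfig:
#     pluginConfig:
#       PersistentVolumeClaimResize:
#         configuration:
#           apiVersion: v1
#           disable: 'false'
#           kind: DefaultAdmissionConfig
#   kubernetesMasterConfig:
#     apiServerArguments:
#       feature-gates:
#       - ExpandPersistentVolumes=true
#     controllerArguments:
#       feature-gates:
#       - ExpandPersistentVolumes=true
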
def test_heketi_metrics_validation_with_node_reboot(self):
    """Validate heketi metrics after node reboot using prometheus"""
    initial_metrics, final_metrics = {}, {}

    # Use storage project
    openshift_ops.switch_oc_project(
        self._master, self.storage_project_name)

    # Get initial metrics result
    h_node, h_server = self.heketi_client_node, self.heketi_server_url
    initial_metrics = tuple(
        heketi_ops.get_heketi_metrics(h_node, h_server).get(metric)[0]
        for metric in self.metrics)

    # Use prometheus project
    openshift_ops.switch_oc_project(
        self._master, self._prometheus_project_name)

    # Get initial prometheus result
    initial_prometheus = self._get_and_manipulate_metric_data(
        self.metrics)

    # Get hosting node IP of heketi pod
    openshift_ops.switch_oc_project(
        self._master, self.storage_project_name)
    heketi_pod = openshift_ops.get_pod_name_from_dc(
        self._master, self.heketi_dc_name)
    heketi_node = openshift_ops.oc_get_custom_resource(
        self._master, 'pod', '.:spec.nodeName', heketi_pod)[0]

    # Reboot the node on which heketi pod is scheduled
    self.addCleanup(
        self._check_heketi_and_gluster_pod_after_node_reboot, heketi_node)
    node_ops.node_reboot_by_command(heketi_node)

    # Wait for the node to become NotReady
    custom = r'":.status.conditions[?(@.type==\"Ready\")]".status'
    for w in waiter.Waiter(300, 10):
        status = openshift_ops.oc_get_custom_resource(
            self._master, 'node', custom, heketi_node)
        if status[0] == 'False':
            break
    if w.expired:
        raise exceptions.ExecutionError(
            "Failed to bring down node {}".format(heketi_node))

    # Wait for the node to become ready
    openshift_ops.wait_for_ocp_node_be_ready(self._master, heketi_node)

    # Wait for heketi and glusterfs pods to become ready
    self._check_heketi_and_gluster_pod_after_node_reboot(heketi_node)

    # Use prometheus project
    openshift_ops.switch_oc_project(
        self._master, self._prometheus_project_name)

    # Get final metrics result
    final_metrics = tuple(
        heketi_ops.get_heketi_metrics(h_node, h_server).get(metric)[0]
        for metric in self.metrics)

    # Get final prometheus result
    final_prometheus = self._get_and_manipulate_metric_data(
        self.metrics)

    err_msg = "Initial value {} is not same as final value {}"
    self.assertEqual(
        initial_metrics, final_metrics,
        err_msg.format(initial_metrics, final_metrics))
    self.assertEqual(
        initial_prometheus, final_prometheus,
        err_msg.format(initial_prometheus, final_prometheus))