def test_set_custom_setting(self, averecmd_params):  # noqa: F811
    """INTERNAL USE ONLY"""
    # Optional hook: apply a custom cluster setting only when the
    # internal environment variable is present and non-empty.
    setting = os.environ.get('INTERNAL_CUSTOM_SETTING', None)
    if not setting:
        return
    run_averecmd(**averecmd_params,
                 method="support.setCustomSetting",
                 args=setting)
def test_node_health(self, averecmd_params, node_names, test_vars):  # noqa: F811
    """Get the node IPs and store them in test_vars.

    Polls node.get for each node (up to 60 seconds per node, 10-second
    intervals) until the node reports state "up", collecting its cluster
    IPs (and first vserver client-facing IP, if any) along the way.
    Asserts that every node eventually came up.
    """
    log = logging.getLogger("test_node_health")
    node_ips = {}  # maps node name -> list of IPs
    for node in node_names:
        deadline = time() + 60
        while time() <= deadline:
            result = run_averecmd(**averecmd_params,
                                  method="node.get", args=node)
            node_state = result[node]["state"]
            log.info('Node {0} has state "{1}"'.format(node, node_state))
            if node_state == "up":
                # Node is healthy; record its IPs while we have the data.
                ips = [entry["IP"] for entry in result[node]["clusterIPs"]]
                vserver_entries = result[node]["clientFacingIPs"]["vserver"]
                if vserver_entries:
                    ips.append(vserver_entries[0]["IP"])
                node_ips[node] = ips
                break
            sleep(10)
        assert node_state == "up"
    if node_ips:
        test_vars["node_ips"] = node_ips
def test_node_health(self, averecmd_params):  # noqa: F811
    """Check that cluster is reporting that all nodes are up.

    Each node is polled for up to 60 seconds (10-second intervals)
    before the "up" assertion is made.
    """
    log = logging.getLogger("test_node_health")
    for node in run_averecmd(**averecmd_params, method="node.list"):
        deadline = time() + 60
        while time() <= deadline:
            state = run_averecmd(**averecmd_params,
                                 method="node.get",
                                 args=node)[node]["state"]
            log.info('Node {0} has state "{1}"'.format(node, state))
            if state == "up":
                break
            sleep(10)
        assert state == "up"
def test_for_cores(self, averecmd_params):  # noqa: F811
    """
    Check the cluster for cores. If a core is found, collect/send a GSI.
    """
    log = logging.getLogger("test_for_cores")
    node_cores = run_averecmd(**averecmd_params,
                              method="support.listCores",
                              args="cluster")
    # Any non-empty per-node core list means at least one core exists.
    cores_found = any(node_cores.values())
    if cores_found:
        log.error("Cores found: {}".format(node_cores))
        upload_gsi(averecmd_params)  # collect/upload a "normal" GSI bundle
    assert not cores_found
def node_names(cluster_ips, ssh_con, test_vars):
    """Queries the cluster to get a list of node names.

    Tries each known cluster IP in turn for resiliency; the first IP
    that answers node.list is cached in test_vars["averecmd_ip"] and
    the node list in test_vars["nodes"]. Asserts if every IP failed.
    """
    log = logging.getLogger("node_names")
    last_ex = None
    for candidate_ip in cluster_ips:
        try:
            nodes = run_averecmd(ssh_client=ssh_con,
                                 password=os.environ["AVERE_ADMIN_PW"],
                                 node_ip=candidate_ip,
                                 method="node.list")
        except Exception as e:
            log.error("node.list failed for IP {}".format(candidate_ip))
            log.error(e)
            last_ex = e
        else:
            test_vars["averecmd_ip"] = candidate_ip  # this IP worked for averecmd
            test_vars["nodes"] = nodes
            return test_vars["nodes"]
    assert not last_ex
def node_ips(cluster_ips, ssh_con, test_vars):
    """Queries the cluster to get a list of IPs for each node.

    Tries each known cluster IP in turn for resiliency; the first IP
    that answers cluster.get is cached in test_vars["averecmd_ip"], and
    the expanded first/last node-IP range in
    test_vars["cluster_node_ips"]. Asserts if every IP failed.
    """
    log = logging.getLogger("node_ips")
    last_ex = None
    for candidate_ip in cluster_ips:
        try:
            result = run_averecmd(ssh_client=ssh_con,
                                  password=os.environ["AVERE_ADMIN_PW"],
                                  node_ip=candidate_ip,
                                  method="cluster.get")
            test_vars["averecmd_ip"] = candidate_ip  # this IP worked for averecmd
            ip_block = result["clusterIPs"][0]
            ip_range = "{0}-{1}".format(ip_block["firstIP"],
                                        ip_block["lastIP"])
            test_vars["cluster_node_ips"] = split_ip_range(ip_range)
            return test_vars["cluster_node_ips"]
        except Exception as e:
            log.error("cluster.get failed for IP {}".format(candidate_ip))
            log.error(e)
            last_ex = e
    assert not last_ex
def test_ha_enabled(self, averecmd_params):  # noqa: F811
    """Check that high-availability (HA) is enabled."""
    cluster_info = run_averecmd(**averecmd_params, method="cluster.get")
    assert cluster_info["ha"] == "enabled"
def test_artifacts_collect(self, averecmd_params, scp_con, test_vars):  # noqa: F811, E501
    """
    Collect test artifacts (node logs, rolling trace) from each node.
    Artifacts are stored to local directories.

    Flow: copy controller-side logs locally, push SSH keys to the
    controller, then for each cluster node open an SSH tunnel through
    the controller and SCP the node's logs/traces into a per-node
    local directory. Per-node collection failures are logged but do
    not stop collection from other nodes; the last failure (if any)
    is re-raised at the end so the test still fails.
    """
    # NOTE(review): logger name says "test_collect_artifacts" while the
    # function is "test_artifacts_collect" — presumably historical.
    log = logging.getLogger("test_collect_artifacts")
    artifacts_dir = "vfxt_artifacts_" + test_vars["atd_obj"].deploy_id
    os.makedirs(artifacts_dir, exist_ok=True)

    log.debug("Copying logs from controller to {}".format(artifacts_dir))
    for lf in [
        "vfxt.log",
        "enablecloudtrace.log",
        "create_cluster_command.log"
    ]:
        scp_con.get("~/" + lf, artifacts_dir)

    # The controller needs the key pair to tunnel into the nodes below.
    log.debug("Copying SSH keys to the controller")
    scp_con.put(test_vars["ssh_priv_key"], "~/.ssh/.")
    scp_con.put(test_vars["ssh_pub_key"], "~/.ssh/.")

    nodes = run_averecmd(**averecmd_params, method="node.list")
    log.debug("Nodes found: {}".format(nodes))
    last_error = None  # last per-node exception; re-raised after the loop
    for node in nodes:
        node_dir = artifacts_dir + "/" + node
        node_dir_log = node_dir + "/log"
        node_dir_trace = node_dir + "/trace"
        log.debug("node_dir_log = {}, node_dir_trace = {}".format(
            node_dir_log, node_dir_trace))

        # make local directories to store downloaded artifacts
        os.makedirs(node_dir_trace, exist_ok=True)
        os.makedirs(node_dir_log, exist_ok=True)

        # get this node's primary cluster IP address
        node_ip = run_averecmd(**averecmd_params,
                               method="node.get",
                               args=node)[node]["primaryClusterIP"]["IP"]
        log.debug("Tunneling to node {} using IP {}".format(node, node_ip))

        # get_unused_local_port actually uses the port to know it's
        # available before making it available again and returning the
        # port number. Rarely, there is a race where the open() call
        # below fails because the port is not yet fully available
        # again. In those cases, try getting a new port.
        for port_attempt in range(1, 11):
            tunnel_local_port = get_unused_local_port()
            # Forward a local port through the controller to the node's
            # SSH port (22), then connect to the node via the tunnel.
            with Connection(test_vars["public_ip"],
                            user=test_vars["controller_user"],
                            connect_kwargs={
                                "key_filename": test_vars["ssh_priv_key"],
                            }).forward_local(local_port=tunnel_local_port,
                                             remote_port=22,
                                             remote_host=node_ip):
                node_c = Connection("127.0.0.1",
                                    user="******",
                                    port=tunnel_local_port,
                                    connect_kwargs={
                                        "password": os.environ["AVERE_ADMIN_PW"]
                                    })
                try:
                    node_c.open()
                    # If port_attempt > 1, last_error had the exception
                    # from the last iteration. Clear it.
                    last_error = None
                except NoValidConnectionsError as ex:
                    last_error = ex
                    exp_err = "Unable to connect to port {} on 127.0.0.1".format(
                        tunnel_local_port)
                    if exp_err not in str(ex):
                        # Unexpected connection failure — not the port
                        # race described above, so fail immediately.
                        raise
                    else:
                        log.warning("{0} (attempt #{1}, retrying)".format(
                            exp_err, str(port_attempt)))
                        continue  # iterate
                scp_client = SCPClient(node_c.transport)
                try:
                    # Calls below catch exceptions and report them to the
                    # error log, but then continue. This is because a
                    # failure to collect artifacts on one node should not
                    # prevent collection from other nodes. After collection
                    # has completed, the last exception will be raised.

                    # list of files and directories to download
                    to_collect = [
                        "/var/log/messages",
                        "/var/log/xmlrpc.log",

                        # assumes rolling trace was enabled during deploy
                        "/support/trace/rolling",

                        # TODO: 2019-0219: turned off for now
                        # "/support/gsi",
                        # "/support/cores",
                    ]
                    for tc in to_collect:
                        log.debug("SCP'ing {} from node {} to {}".format(
                            tc, node, node_dir_log))
                        try:
                            scp_client.get(tc, node_dir_log, recursive=True)
                        except Exception as ex:
                            log.error("({}) Exception caught: {}".format(
                                node, ex))
                            last_error = ex
                finally:
                    scp_client.close()
                log.debug("Connections to node {} closed".format(node))
                break  # no need to iterate again
    if last_error:
        log.error("See previous error(s) above. Raising last exception.")
        raise last_error