def ssh_con_fabric(test_vars):
    """Create an SSH connection to the controller.

    This is a generator: it yields exactly one open connection and releases
    the connections it created once it is resumed or closed.

    If ``test_vars["public_ip"]`` equals ``test_vars["controller_ip"]``, the
    connection to the public IP is yielded directly. Otherwise the public IP
    is used as a jumpbox: a free local port is forwarded through it to port
    22 of the controller, and a connection to ``127.0.0.1`` on that local
    port is yielded. Opening the tunnelled connection is attempted up to 10
    times, each time with a newly chosen local port.

    Args:
        test_vars: mapping providing "public_ip", "controller_ip",
            "controller_user" and "ssh_priv_key".

    Yields:
        The open connection to the controller.

    Raises:
        NoValidConnectionsError: immediately if the failure is not the
            expected "Unable to connect to port ..." error, or after the
            last attempt if every attempt failed with that error.
    """
    log = logging.getLogger("ssh_con_fabric")

    # SSH connection/client to the public IP.
    pub_client = Connection(test_vars["public_ip"],
                            user=test_vars["controller_user"],
                            connect_kwargs={
                                "key_filename": test_vars["ssh_priv_key"],
                            })

    msg_base = "SSH connection to controller ({})".format(
        test_vars["controller_ip"])

    # Same IP: no jumpbox is involved, connect to the controller directly.
    if test_vars["public_ip"] == test_vars["controller_ip"]:
        log.debug("Opening {}".format(msg_base))
        pub_client.open()
        try:
            yield pub_client
        finally:
            log.debug("Closing {}".format(msg_base))
            pub_client.close()
        return

    # The controller's IP differs from the public IP, so we are using a
    # jumpbox to get into the VNET containing the controller. Create an SSH
    # tunnel before connecting to the controller.
    tunnel_remote_port = 22
    last_error = None
    try:
        for port_attempt in range(1, 11):
            tunnel_local_port = get_unused_local_port()

            # Rebuild the message on every attempt so the jumpbox suffix is
            # not appended repeatedly across retries.
            msg_con = msg_base + " via jumpbox ({0}), local port {1}".format(
                test_vars["public_ip"], tunnel_local_port)

            log.debug("Opening {}".format(msg_con))
            with pub_client.forward_local(
                    local_port=tunnel_local_port,
                    remote_port=tunnel_remote_port,
                    remote_host=test_vars["controller_ip"]):
                client = Connection("127.0.0.1",
                                    user=test_vars["controller_user"],
                                    port=tunnel_local_port,
                                    connect_kwargs={
                                        "key_filename":
                                            test_vars["ssh_priv_key"],
                                    })
                try:
                    client.open()
                except NoValidConnectionsError as ex:
                    exp_err = (
                        "Unable to connect to port {} on 127.0.0.1".format(
                            tunnel_local_port))
                    if exp_err not in str(ex):
                        raise
                    last_error = ex
                    log.warning("{0} (attempt #{1}, retrying)".format(
                        exp_err, str(port_attempt)))
                    continue  # try again with a new local port

                try:
                    yield client
                finally:
                    # Close while the tunnel is still up.
                    client.close()
            log.debug("{} closed".format(msg_con))
            return  # no need to iterate again

        # Every attempt failed with the expected error; surface the last one
        # instead of silently finishing without yielding a connection.
        log.error("Unable to open {} after 10 attempts".format(msg_base))
        raise last_error
    finally:
        # forward_local opened the jumpbox connection; release it.
        pub_client.close()
def ssh_con_fabric(test_vars):
    """Create an SSH connection to the controller.

    This is a generator: it yields exactly one open connection and releases
    the connections it created once it is resumed or closed.

    If ``test_vars["public_ip"]`` equals ``test_vars["controller_ip"]``, the
    connection to the public IP is yielded directly. Otherwise the public IP
    is used as a jumpbox: a free local port is forwarded through it to port
    22 of the controller, and a connection to ``127.0.0.1`` on that local
    port is yielded.

    Args:
        test_vars: mapping providing "public_ip", "controller_ip",
            "controller_user" and "ssh_priv_key".

    Yields:
        The open connection to the controller.
    """
    log = logging.getLogger("ssh_con_fabric")

    # SSH connection/client to the public IP.
    pub_client = Connection(test_vars["public_ip"],
                            user=test_vars["controller_user"],
                            connect_kwargs={
                                "key_filename": test_vars["ssh_priv_key"],
                            })

    msg_con = "SSH connection to controller ({})".format(
        test_vars["controller_ip"])

    # Same IP: no jumpbox is involved, connect to the controller directly.
    if test_vars["public_ip"] == test_vars["controller_ip"]:
        log.debug("Opening {}".format(msg_con))
        pub_client.open()
        try:
            yield pub_client
        finally:
            # Close even if the consumer abandons the generator early.
            log.debug("Closing {}".format(msg_con))
            pub_client.close()
        return

    # The controller's IP differs from the public IP, so we are using a
    # jumpbox to get into the VNET containing the controller. Create an SSH
    # tunnel before connecting to the controller.
    tunnel_local_port = get_unused_local_port()
    tunnel_remote_port = 22

    msg_con += " via jumpbox ({0}), local port {1}".format(
        test_vars["public_ip"], tunnel_local_port)

    log.debug("Opening {}".format(msg_con))
    try:
        with pub_client.forward_local(
                local_port=tunnel_local_port,
                remote_port=tunnel_remote_port,
                remote_host=test_vars["controller_ip"]):
            client = Connection("127.0.0.1",
                                user=test_vars["controller_user"],
                                port=tunnel_local_port,
                                connect_kwargs={
                                    "key_filename":
                                        test_vars["ssh_priv_key"],
                                })
            client.open()
            try:
                yield client
            finally:
                # Close while the tunnel is still up.
                client.close()
        log.debug("{} closed".format(msg_con))
    finally:
        # forward_local opened the jumpbox connection; release it.
        pub_client.close()
def test_artifacts_collect(self, averecmd_params, scp_con, test_vars):  # noqa: F811, E501
    """
    Collect test artifacts (node logs, rolling trace) from each node.
    Artifacts are stored to local directories.

    Steps:
      1. Copy the deployment logs from the controller into
         "vfxt_artifacts_<deploy_id>".
      2. Copy the SSH key pair to the controller.
      3. For every node returned by "node.list", tunnel through the public
         IP to the node's primary cluster IP and download the files and
         directories in ``to_collect``.

    A failure on one node does not stop collection from the other nodes.
    Errors are logged as they happen and the most recent one is raised
    after all nodes have been processed.

    Args:
        averecmd_params: keyword arguments forwarded to run_averecmd.
        scp_con: SCP client used for the controller transfers.
        test_vars: mapping providing "atd_obj", "ssh_priv_key",
            "ssh_pub_key", "public_ip" and "controller_user".

    Raises:
        NoValidConnectionsError: immediately, if connecting to a node fails
            with anything other than the expected port error.
        Exception: the last error recorded during collection, if any.
    """
    log = logging.getLogger("test_collect_artifacts")
    artifacts_dir = "vfxt_artifacts_" + test_vars["atd_obj"].deploy_id
    os.makedirs(artifacts_dir, exist_ok=True)

    log.debug("Copying logs from controller to {}".format(artifacts_dir))
    for lf in [
            "vfxt.log",
            "enablecloudtrace.log",
            "create_cluster_command.log",
    ]:
        scp_con.get("~/" + lf, artifacts_dir)

    log.debug("Copying SSH keys to the controller")
    scp_con.put(test_vars["ssh_priv_key"], "~/.ssh/.")
    scp_con.put(test_vars["ssh_pub_key"], "~/.ssh/.")

    nodes = run_averecmd(**averecmd_params, method="node.list")
    log.debug("Nodes found: {}".format(nodes))

    # list of files and directories to download from every node
    to_collect = [
        "/var/log/messages",
        "/var/log/xmlrpc.log",

        # assumes rolling trace was enabled during deploy
        "/support/trace/rolling",

        # TODO: 2019-0219: turned off for now
        # "/support/gsi",
        # "/support/cores",
    ]

    # Most recent error seen on ANY node. It is never reset by a later
    # success, so one node's failure cannot be masked by the next node.
    last_error = None
    for node in nodes:
        node_dir = artifacts_dir + "/" + node
        node_dir_log = node_dir + "/log"
        node_dir_trace = node_dir + "/trace"
        log.debug("node_dir_log = {}, node_dir_trace = {}".format(
            node_dir_log, node_dir_trace))

        # make local directories to store downloaded artifacts
        # NOTE(review): node_dir_trace is created but every item, including
        # the rolling trace, is downloaded into node_dir_log. Confirm
        # whether that is intended.
        os.makedirs(node_dir_trace, exist_ok=True)
        os.makedirs(node_dir_log, exist_ok=True)

        # get this node's primary cluster IP address
        node_ip = run_averecmd(**averecmd_params,
                               method="node.get",
                               args=node)[node]["primaryClusterIP"]["IP"]

        log.debug("Tunneling to node {} using IP {}".format(node, node_ip))

        # get_unused_local_port actually uses the port to know it's
        # available before making it available again and returning the
        # port number. Rarely, there is a race where the open() call
        # below fails because the port is not yet fully available
        # again. In those cases, try getting a new port.
        connect_error = None
        for port_attempt in range(1, 11):
            tunnel_local_port = get_unused_local_port()
            jumpbox_c = Connection(test_vars["public_ip"],
                                   user=test_vars["controller_user"],
                                   connect_kwargs={
                                       "key_filename":
                                           test_vars["ssh_priv_key"],
                                   })
            try:
                with jumpbox_c.forward_local(local_port=tunnel_local_port,
                                             remote_port=22,
                                             remote_host=node_ip):
                    node_c = Connection("127.0.0.1",
                                        user="******",
                                        port=tunnel_local_port,
                                        connect_kwargs={
                                            "password":
                                                os.environ["AVERE_ADMIN_PW"]
                                        })
                    try:
                        node_c.open()
                    except NoValidConnectionsError as ex:
                        exp_err = (
                            "Unable to connect to port {} on 127.0.0.1"
                            .format(tunnel_local_port))
                        if exp_err not in str(ex):
                            raise
                        connect_error = ex
                        log.warning("{0} (attempt #{1}, retrying)".format(
                            exp_err, str(port_attempt)))
                        continue  # iterate

                    try:
                        scp_client = SCPClient(node_c.transport)
                        try:
                            # Calls below catch exceptions and report them
                            # to the error log, but then continue. This is
                            # because a failure to collect artifacts on one
                            # node should not prevent collection from other
                            # nodes. After collection has completed, the
                            # last exception will be raised.
                            for tc in to_collect:
                                log.debug(
                                    "SCP'ing {} from node {} to {}".format(
                                        tc, node, node_dir_log))
                                try:
                                    scp_client.get(tc, node_dir_log,
                                                   recursive=True)
                                except Exception as ex:
                                    log.error(
                                        "({}) Exception caught: {}".format(
                                            node, ex))
                                    last_error = ex
                        finally:
                            scp_client.close()
                    finally:
                        # Close while the tunnel is still up.
                        node_c.close()
                    log.debug("Connections to node {} closed".format(node))
                    break  # no need to iterate again
            finally:
                # forward_local opened the jumpbox connection; release it.
                jumpbox_c.close()
        else:
            # No attempt succeeded for this node; remember why.
            last_error = connect_error

    if last_error is not None:
        log.error("See previous error(s) above. Raising last exception.")
        raise last_error
def test_update_reg_clients_hosts(self, test_vars):
    """
    Updates /etc/hosts on the STAF clients so they can contact the STAF
    server.

    For every IP in ``test_vars["staf_client_priv_ips"]`` this tunnels
    through the public IP to the client's port 22, runs a list of shell
    commands there and copies the SSH key pair to the client. The commands
    append a "staf" entry to /etc/hosts and generate ~/hostdb_entries.sh
    containing hostdb.py calls for the cluster, each regression client and
    the storage account.

    Opening the tunnelled connection is attempted up to 10 times per client,
    each time with a newly chosen local port.

    Args:
        test_vars: mapping providing "atd_obj", "staf_server_priv_ip",
            "cluster_mgmt_ip", "staf_client_priv_ips", "storage_account",
            "public_ip", "controller_user", "ssh_priv_key" and
            "ssh_pub_key".

    Raises:
        KeyError: if the AVERE_ADMIN_PW environment variable is not set.
        NoValidConnectionsError: immediately if a connection fails with
            anything other than the expected port error, or after the last
            attempt if every attempt for a client failed.
    """
    log = logging.getLogger("test_update_reg_clients_hosts")
    atd = test_vars["atd_obj"]

    # One shell command per line. {0} = STAF server private IP,
    # {1} = cluster management IP, {2} = admin password.
    commands = """
        cp /etc/hosts .
        echo ' ' >> hosts
        echo '# STAF server IP' >> hosts
        echo '{0} staf' >> hosts
        sudo mv hosts /etc/hosts
        echo '#!/bin/bash' > ~/hostdb_entries.sh
        chmod 755 ~/hostdb_entries.sh
        echo "cd ~/Avere-sv" >> ~/hostdb_entries.sh
        echo "source /usr/sv/env/bin/activate" >> ~/hostdb_entries.sh
        echo "export PYTHONPATH=~/Avere-sv:~/Avere-sv/averesv:$PYTHONPATH:$PATH" >> ~/hostdb_entries.sh
        echo "averesv/hostdb.py -a vfxt -m {1} -p '{2}'" >> ~/hostdb_entries.sh
        """.format(test_vars["staf_server_priv_ip"],
                   test_vars["cluster_mgmt_ip"],
                   os.environ["AVERE_ADMIN_PW"]).split("\n")

    # Add hostdb entry calls for each regression client.
    for i, staf_client_ip in enumerate(test_vars["staf_client_priv_ips"]):
        commands.append(
            "echo 'averesv/hostdb.py -L regclient{0} -m {1}' >> ~/hostdb_entries.sh"
            .format(i, staf_client_ip))

    # Get the storage account's access key and add that hostdb entry, too.
    sa_key = atd.st_client.storage_accounts.list_keys(
        atd.resource_group,
        test_vars["storage_account"]).keys[0].value
    commands.append(
        "echo 'averesv/hostdb.py -s {0}.blob.core.windows.net -m {0}.blob.core.windows.net -M az --cloudCreds \"{0}::{1}\"' >> ~/hostdb_entries.sh"
        .format(test_vars["storage_account"], sa_key))

    for staf_client_ip in test_vars["staf_client_priv_ips"]:
        # A freshly returned "unused" local port is occasionally not
        # connectable yet; in that case retry with a new port.
        connect_error = None
        for port_attempt in range(1, 11):
            tunnel_local_port = get_unused_local_port()
            jumpbox_c = Connection(test_vars["public_ip"],
                                   user=test_vars["controller_user"],
                                   connect_kwargs={
                                       "key_filename":
                                           test_vars["ssh_priv_key"],
                                   })
            try:
                with jumpbox_c.forward_local(local_port=tunnel_local_port,
                                             remote_port=22,
                                             remote_host=staf_client_ip):
                    node_c = Connection("127.0.0.1",
                                        user=test_vars["controller_user"],
                                        port=tunnel_local_port,
                                        connect_kwargs={
                                            "key_filename":
                                                test_vars["ssh_priv_key"],
                                        })
                    try:
                        node_c.open()
                    except NoValidConnectionsError as ex:
                        exp_err = (
                            "Unable to connect to port {} on 127.0.0.1"
                            .format(tunnel_local_port))
                        if exp_err not in str(ex):
                            raise
                        connect_error = ex
                        log.warning("{0} (attempt #{1}, retrying)".format(
                            exp_err, str(port_attempt)))
                        continue  # iterate

                    try:
                        run_ssh_commands(node_c.client, commands)

                        # Copy SSH keys to the client.
                        scp_cli = SCPClient(node_c.transport)
                        try:
                            scp_cli.put(test_vars["ssh_priv_key"],
                                        "~/.ssh/id_rsa")
                            scp_cli.put(test_vars["ssh_pub_key"],
                                        "~/.ssh/id_rsa.pub")
                        finally:
                            scp_cli.close()
                    finally:
                        # Close while the tunnel is still up.
                        node_c.close()
                    log.debug(
                        "Connection to {} closed".format(staf_client_ip))
                    break  # no need to iterate again
            finally:
                # forward_local opened the jumpbox connection; release it.
                jumpbox_c.close()
        else:
            # Every attempt for this client failed with the expected error.
            log.error(
                "See previous error(s) above. Raising last exception.")
            raise connect_error