def lockHost(hostname, unlock=False):
    # hostname format: ip-10-0-0-114.eu-west-1.compute.internal
    hostname = hostname.split(".")[0]
    if unlock:
        log.info("Unlocking host %s", hostname)
        command = [
            "/opt/slurm/bin/scontrol",
            "update",
            "NodeName={0}".format(hostname),
            "State=RESUME",
            'Reason="Unlocking"',
        ]
    else:
        log.info("Locking host %s", hostname)
        command = [
            "/opt/slurm/bin/scontrol",
            "update",
            "NodeName={0}".format(hostname),
            "State=DRAIN",
            'Reason="Shutting down"',
        ]
    try:
        run_command(command)
    except subprocess.CalledProcessError:
        log.error("Error %s host %s", "unlocking" if unlock else "locking", hostname)
def _reconfigure_nodes():
    log.info("Reconfiguring slurm")
    command = ["/opt/slurm/bin/scontrol", "reconfigure"]
    try:
        run_command(command, log)
    except Exception as e:
        log.error("Failed when reconfiguring slurm daemon with exception %s", e)
def addHost(hostname, cluster_user, slots, max_cluster_size):
    log.info('Adding %s with %s slots' % (hostname, slots))
    command = ("/opt/torque/bin/qmgr -c 'create node %s np=%s'" % (hostname, slots))
    run_command(command, log, raise_on_error=False)
    command = ('/opt/torque/bin/pbsnodes -c %s' % hostname)
    run_command(command, log, raise_on_error=False)

    # Connect and hostkey
    ssh = paramiko.SSHClient()
    ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    hosts_key_file = os.path.expanduser("~" + cluster_user) + '/.ssh/known_hosts'
    user_key_file = os.path.expanduser("~" + cluster_user) + '/.ssh/id_rsa'
    iter = 0
    connected = False
    while iter < 3 and not connected:
        try:
            log.info('Connecting to host: %s iter: %d' % (hostname, iter))
            ssh.connect(hostname, username=cluster_user, key_filename=user_key_file)
            connected = True
        except socket.error as e:
            log.info('Socket error: %s' % e)
            time.sleep(10 + iter)
            iter = iter + 1

    if iter == 3:
        log.info("Unable to provision host")
        return
def wakeup_scheduler():
    # Trigger a scheduling cycle. This is necessary when compute nodes are added, to speed up job allocation.
    # It is also necessary when the first compute node gets added to the scheduler; otherwise jobs are never
    # started.
    logging.info("Triggering a scheduling cycle.")
    run_command(TORQUE_BIN_DIR + 'qmgr -c "set server scheduling=true"', raise_on_error=False)
def lockHost(hostname, unlock=False):
    # hostname format: ip-10-0-0-114.eu-west-1.compute.internal
    hostname = hostname.split(".")[0]
    if unlock:
        log.info("Unlocking host %s", hostname)
        command = [
            "/opt/slurm/bin/scontrol",
            "update",
            "NodeName={0}".format(hostname),
            "State=RESUME",
            'Reason="Unlocking"',
        ]
    else:
        log.info("Locking host %s", hostname)
        command = [
            "/opt/slurm/bin/scontrol",
            "update",
            "NodeName={0}".format(hostname),
            "State=DRAIN",
            'Reason="Shutting down"',
        ]
    try:
        run_command(command, log)
    except subprocess.CalledProcessError:
        log.error("Error %s host %s", "unlocking" if unlock else "locking", hostname)
def delete_nodes(hosts):
    # Set nodes to offline before deleting, to work around an issue where pbs_mom is unable to
    # rerun the job.
    if hosts:
        run_command(
            TORQUE_BIN_DIR + "pbsnodes -o {0}".format(" ".join(hosts)),
            raise_on_error=False,
            log_error=False,
        )

    # Process at most 20 deletions at a time since the required time depends linearly
    # on the number of nodes that we try to remove.
    succeeded_hosts = set()
    chunk_size = 20
    for i in range(0, len(hosts), chunk_size):
        succeeded_hosts.update(
            _qmgr_manage_nodes(
                operation="delete",
                hosts=hosts[i : i + chunk_size],  # noqa E203: incompatible with black
                error_messages_to_ignore=[
                    "Unknown node",
                    "The server was unable to communicate with the MOM to requeue or delete the job."
                    " The node has been deleted and all jobs on the node have been purged.",
                ],
            )
        )

    return succeeded_hosts
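# A minimal sketch of the batching done by delete_nodes above; the host names, their count,
# and _example_chunks itself are illustrative assumptions, not part of the real module.
def _example_chunks():
    hosts = ["ip-10-0-0-{0}".format(i) for i in range(45)]
    chunk_size = 20
    # Same slicing as delete_nodes: 45 hosts -> batches of 20, 20 and 5 qmgr deletions.
    return [hosts[i : i + chunk_size] for i in range(0, len(hosts), chunk_size)]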
def wakeupSchedOn(hostname):
    log.info('Waking up scheduler on host %s', hostname)
    command = ("/opt/torque/bin/pbsnodes -x %s" % (hostname))

    sleep_time = 3
    times = 20
    host_state = None
    while isHostInitState(host_state) and times > 0:
        try:
            output = check_command_output(command, log)
            # Ex.1: <Data><Node><name>ip-10-0-76-39</name><state>down,offline,MOM-list-not-sent</state><power_state>Running</power_state>
            # <np>1</np><ntype>cluster</ntype><mom_service_port>15002</mom_service_port><mom_manager_port>15003</mom_manager_port></Node></Data>
            # Ex 2: <Data><Node><name>ip-10-0-76-39</name><state>free</state><power_state>Running</power_state><np>1</np><ntype>cluster</ntype>
            # <status>rectime=1527799181,macaddr=02:e4:00:b0:b1:72,cpuclock=Fixed,varattr=,jobs=,state=free,netload=210647044,gres=,loadave=0.00,
            # ncpus=1,physmem=1017208kb,availmem=753728kb,totmem=1017208kb,idletime=856,nusers=1,nsessions=1,sessions=19698,
            # uname=Linux ip-10-0-76-39 4.9.75-25.55.amzn1.x86_64 #1 SMP Fri Jan 5 23:50:27 UTC 2018 x86_64,opsys=linux</status>
            # <mom_service_port>15002</mom_service_port><mom_manager_port>15003</mom_manager_port></Node></Data>
            xmlnode = ElementTree.XML(output)
            host_state = xmlnode.findtext("./Node/state")
        except:
            log.error("Error parsing XML from %s" % output)

        if isHostInitState(host_state):
            log.debug("Host %s is still in state %s" % (hostname, host_state))
            time.sleep(sleep_time)
            times -= 1

    if host_state == "free":
        command = "/opt/torque/bin/qmgr -c \"set server scheduling=true\""
        run_command(command, log, raise_on_error=False)
    elif times == 0:
        log.error("Host %s is still in state %s" % (hostname, host_state))
    else:
        log.debug("Host %s is in state %s" % (hostname, host_state))
def removeHost(hostname, cluster_user, max_cluster_size):
    log.info('Removing %s', hostname)
    command = ('/opt/torque/bin/pbsnodes -o %s' % hostname)
    run_command(command, log, raise_on_error=False)
    command = ("/opt/torque/bin/qmgr -c 'delete node %s'" % hostname)
    run_command(command, log, raise_on_error=False)
def lockHost(hostname, unlock=False):
    # https://lists.sdsc.edu/pipermail/npaci-rocks-discussion/2007-November/027919.html
    mod = unlock and '-c' or '-o'
    command = ['/opt/torque/bin/pbsnodes', mod, hostname]
    try:
        run_command(command, log)
    except subprocess.CalledProcessError:
        log.error("Error %s host %s", "unlocking" if unlock else "locking", hostname)
def run_sge_command(command):
    """
    Execute an SGE shell command, exporting the appropriate environment.

    :param command: command to execute
    :raise: subprocess.CalledProcessError if the command fails
    """
    command = _prepend_sge_bin_dir(command)
    run_command(command, SGE_ENV)
def run_sge_command(command, log):
    """
    Execute an SGE shell command, exporting the appropriate environment.

    :param command: command to execute
    :param log: logger
    :raise: subprocess.CalledProcessError if the command fails
    """
    command = _prepend_sge_bin_dir(command)
    run_command(command, log, SGE_ENV)
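# Hedged usage sketch for run_sge_command above: "qhost" is a standard SGE binary, but this
# caller (_example_list_sge_hosts) is hypothetical and assumes the module already imports
# subprocess and provides a logger.
def _example_list_sge_hosts(log):
    try:
        run_sge_command(["qhost"], log)
    except subprocess.CalledProcessError:
        log.error("qhost failed")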
def lockHost(hostname, unlock=False):
    # hostname format: ip-10-0-0-114.eu-west-1.compute.internal
    hostname = hostname.split(".")[0]
    mod = unlock and "-c" or "-o"
    command = [TORQUE_BIN_DIR + "pbsnodes", mod, hostname]
    try:
        run_command(command)
    except subprocess.CalledProcessError:
        log.error("Error %s host %s", "unlocking" if unlock else "locking", hostname)
def _restart_master_node():
    log.info("Restarting slurm on master node")
    if os.path.isfile("/etc/systemd/system/slurmctld.service"):
        command = ["sudo", "systemctl", "restart", "slurmctld.service"]
    else:
        command = ["/etc/init.d/slurm", "restart"]
    try:
        run_command(command, log)
    except Exception as e:
        log.error("Failed when restarting slurm daemon on master node with exception %s", e)
        raise
def _update_master_np(max_nodes, node_slots):
    """Master np is set dynamically based on the number of compute nodes that join the cluster."""
    current_nodes_count = (
        len(check_command_output("cat /var/spool/torque/server_priv/nodes").strip().splitlines()) - 1
    )
    # If the cluster is at max size, set the master np to 1 since 0 is not allowed.
    master_node_np = max(1, (max_nodes - current_nodes_count) * node_slots)
    master_hostname = check_command_output("hostname")
    logging.info("Setting master np to: %d", master_node_np)
    run_command(
        TORQUE_BIN_DIR
        + 'qmgr -c "set node {hostname} np = {slots}"'.format(hostname=master_hostname, slots=master_node_np)
    )
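# Worked example of the np computation in _update_master_np above; all values are hypothetical.
def _example_master_np(max_nodes=10, current_nodes_count=3, node_slots=4):
    # (10 - 3) * 4 = 28 slots remain on the master; at max cluster size the product is 0,
    # so the max(1, ...) floor keeps np at the minimum allowed value of 1.
    return max(1, (max_nodes - current_nodes_count) * node_slots)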
def launch(benchmark_script, benchmark_script_args, with_nvprof=False):
    """
    If with_nvprof is True, the following command is launched first to get the gpu_time:
        nvprof python benchmark_script benchmark_script_args
    Then the normal testing command is launched:
        python benchmark_script benchmark_script_args
    """
    cmd = "{} {} {}".format(sys.executable, benchmark_script, " ".join(benchmark_script_args))
    if with_nvprof:
        stdout, exit_code = _nvprof(cmd)
        if exit_code == 0:
            parse_status, gpu_time = _parse_nvprof_logs(stdout.split("\n"))
        else:
            parse_status = False
        if parse_status:
            return gpu_time
        else:
            print("Running Error:\n {}".format(stdout))
    else:
        stdout, exit_code = utils.run_command(cmd)
        print(stdout)
        if exit_code != 0:
            sys.exit(exit_code)
    return 0.0
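# Hypothetical invocation of launch() above; the script name and arguments are assumptions.
def _example_launch():
    # With with_nvprof=True the parsed gpu_time is returned; without it, 0.0 on success.
    return launch("resnet50_benchmark.py", ["--batch_size", "32"], with_nvprof=True)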
def _remove_keys_from_known_hosts(hostnames, host_keys_file, user):
    for hostname in hostnames:
        command = "ssh-keygen -R " + hostname + " -f " + host_keys_file
        run_command(command, raise_on_error=False, execute_as_user=user)
        command = "ssh-keygen -R " + hostname + ". -f " + host_keys_file
        run_command(command, raise_on_error=False, execute_as_user=user)
        command = "ssh-keygen -R " + socket.gethostbyname(hostname) + " -f " + host_keys_file
        run_command(command, raise_on_error=False, execute_as_user=user)
def addHost(hostname, cluster_user, slots, max_cluster_size):
    log.info("Adding %s with %s slots" % (hostname, slots))
    command = "/opt/torque/bin/qmgr -c 'create node %s np=%s'" % (hostname, slots)
    run_command(command, raise_on_error=False)
    command = "/opt/torque/bin/pbsnodes -c %s" % hostname
    run_command(command, raise_on_error=False)

    # Connect and hostkey
    ssh = paramiko.SSHClient()
    ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    hosts_key_file = os.path.expanduser("~" + cluster_user) + "/.ssh/known_hosts"
    user_key_file = os.path.expanduser("~" + cluster_user) + "/.ssh/id_rsa"
    iter = 0
    connected = False
    while iter < 3 and not connected:
        try:
            log.info("Connecting to host: %s iter: %d" % (hostname, iter))
            ssh.connect(hostname, username=cluster_user, key_filename=user_key_file)
            connected = True
        except socket.error as e:
            log.info("Socket error: %s" % e)
            time.sleep(10 + iter)
            iter = iter + 1

    if iter == 3:
        log.info("Unable to provision host")
        return

    try:
        ssh.load_host_keys(hosts_key_file)
    except IOError:
        ssh._host_keys_filename = None

    ssh.save_host_keys(hosts_key_file)
    ssh.close()

    wakeupSchedOn(hostname)
def launch(benchmark_script, benchmark_script_args, with_nvprof=False):
    cmd = "{} {} {}".format(sys.executable, benchmark_script, " ".join(benchmark_script_args))
    if with_nvprof:
        stdout, exit_code = _nvprof(cmd)
        if exit_code == 0:
            return _parse_nvprof_logs(stdout.split("\n"))
        else:
            print("Running Error:\n {}".format(stdout))
    else:
        stdout, exit_code = utils.run_command(cmd)
        print(stdout)
        if exit_code != 0:
            sys.exit(exit_code)
    return 0.0
def update_cluster_limits(max_nodes, node_slots):
    try:
        logging.info("Updating cluster limits: max_nodes=%d, node_slots=%d", max_nodes, node_slots)
        run_command(TORQUE_BIN_DIR + 'qmgr -c "set queue batch resources_available.nodect={0}"'.format(max_nodes))
        run_command(TORQUE_BIN_DIR + 'qmgr -c "set server resources_available.nodect={0}"'.format(max_nodes))
        run_command(TORQUE_BIN_DIR + 'qmgr -c "set queue batch resources_max.ncpus={0}"'.format(node_slots))
        _update_master_np(max_nodes, node_slots)
    except Exception as e:
        logging.error("Failed when updating cluster limits with exception %s.", e)
import streamlit as st
import time
import multiprocessing
import pandas as pd

from common import utils

st.sidebar.title("Mini-DLPerf")
st.sidebar.subheader("\nControls")
threads = st.sidebar.number_input("Threads", min_value=1, value=multiprocessing.cpu_count() - 2)
batch_size = st.sidebar.number_input("Batch Size", min_value=1, value=64)
ready = st.sidebar.checkbox("Ready to run!")

_ = utils.run_command("nvidia-smi nvlink -sc 0bz")

with st.spinner("Getting GPU info..."):

    @st.cache
    def app_get_gpu_info():
        return utils.get_gpu_info()

    st.markdown("GPU info:")
    st.json(app_get_gpu_info())

threads = str(threads)
batch_size = str(batch_size)

if ready:
    progress_bar = st.progress(0)
def _nvprof(cmd):
    return utils.run_command("nvprof {}".format(cmd))