Example #1
def lockHost(hostname, unlock=False):
    # hostname format: ip-10-0-0-114.eu-west-1.compute.internal
    hostname = hostname.split(".")[0]
    if unlock:
        log.info("Unlocking host %s", hostname)
        command = [
            "/opt/slurm/bin/scontrol",
            "update",
            "NodeName={0}".format(hostname),
            "State=RESUME",
            'Reason="Unlocking"',
        ]
    else:
        log.info("Locking host %s", hostname)
        command = [
            "/opt/slurm/bin/scontrol",
            "update",
            "NodeName={0}".format(hostname),
            "State=DRAIN",
            'Reason="Shutting down"',
        ]
    try:
        run_command(command)
    except subprocess.CalledProcessError:
        log.error("Error %s host %s", "unlocking" if unlock else "locking",
                  hostname)
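The run_command helper used throughout these examples is never shown, and its signature varies from call to call (sometimes just a command, sometimes an extra log argument, sometimes keyword flags such as raise_on_error or log_error). A minimal sketch of what such a helper might look like, assuming it is a thin wrapper around subprocess; the names and defaults below are illustrative, not the original implementation:

import logging
import shlex
import subprocess


def run_command(command, log=None, env=None, raise_on_error=True, log_error=True):
    # Hypothetical helper: accept either an argument list (as in the Slurm
    # examples) or a shell string (as in the Torque examples) and run it.
    log = log or logging.getLogger(__name__)
    if isinstance(command, str):
        command = shlex.split(command)
    try:
        subprocess.check_call(command, env=env)
    except subprocess.CalledProcessError:
        if log_error:
            log.error("Command failed: %s", " ".join(command))
        if raise_on_error:
            raise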
Example #2
def _reconfigure_nodes():
    log.info("Reconfiguring slurm")
    command = ["/opt/slurm/bin/scontrol", "reconfigure"]
    try:
        run_command(command, log)
    except Exception as e:
        log.error("Failed when reconfiguring slurm daemon with exception %s", e)
Example #3
def addHost(hostname, cluster_user, slots, max_cluster_size):
    log.info('Adding %s with %s slots' % (hostname, slots))

    command = ("/opt/torque/bin/qmgr -c 'create node %s np=%s'" %
               (hostname, slots))
    run_command(command, log, raise_on_error=False)

    command = ('/opt/torque/bin/pbsnodes -c %s' % hostname)
    run_command(command, log, raise_on_error=False)

    # Connect and hostkey
    ssh = paramiko.SSHClient()
    ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    hosts_key_file = os.path.expanduser("~" +
                                        cluster_user) + '/.ssh/known_hosts'
    user_key_file = os.path.expanduser("~" + cluster_user) + '/.ssh/id_rsa'
    iter = 0
    connected = False
    while iter < 3 and not connected:
        try:
            log.info('Connecting to host: %s iter: %d' % (hostname, iter))
            ssh.connect(hostname,
                        username=cluster_user,
                        key_filename=user_key_file)
            connected = True
        except socket.error as e:
            log.info('Socket error: %s' % e)
            time.sleep(10 + iter)
            iter = iter + 1
            if iter == 3:
                log.info("Unable to provision host")
                return
Example #4
def wakeup_scheduler():
    # Trigger a scheduling cycle. This is necessary when compute nodes are added, to speed up job allocation.
    # It is also necessary when the first compute node is added to the scheduler; otherwise jobs are never
    # started.
    logging.info("Triggering a scheduling cycle.")
    run_command(TORQUE_BIN_DIR + 'qmgr -c "set server scheduling=true"',
                raise_on_error=False)
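TORQUE_BIN_DIR is a constant assumed by this and several of the following examples. Given the hard-coded paths used elsewhere in this collection, it is presumably something like the line below (the trailing slash matters because the constant is concatenated directly with the command name); this is an assumption, not taken from the original project:

TORQUE_BIN_DIR = "/opt/torque/bin/"  # assumed value, based on the paths used in other examples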
Example #5
def lockHost(hostname, unlock=False):
    # hostname format: ip-10-0-0-114.eu-west-1.compute.internal
    hostname = hostname.split(".")[0]
    if unlock:
        log.info("Unlocking host %s", hostname)
        command = [
            "/opt/slurm/bin/scontrol",
            "update",
            "NodeName={0}".format(hostname),
            "State=RESUME",
            'Reason="Unlocking"',
        ]
    else:
        log.info("Locking host %s", hostname)
        command = [
            "/opt/slurm/bin/scontrol",
            "update",
            "NodeName={0}".format(hostname),
            "State=DRAIN",
            'Reason="Shutting down"',
        ]
    try:
        run_command(command, log)
    except subprocess.CalledProcessError:
        log.error("Error %s host %s", "unlocking" if unlock else "locking", hostname)
Example #6
def delete_nodes(hosts):
    # Set nodes offline before deleting them, to work around an issue where pbs_mom is unable
    # to rerun the job.
    if hosts:
        run_command(TORQUE_BIN_DIR + "pbsnodes -o {0}".format(" ".join(hosts)),
                    raise_on_error=False,
                    log_error=False)
    # Process at most 20 deletions at a time, since the time required grows linearly with
    # the number of nodes we try to remove.
    succeeded_hosts = set()
    chunk_size = 20
    for i in range(0, len(hosts), chunk_size):
        succeeded_hosts.update(
            _qmgr_manage_nodes(
                operation="delete",
                hosts=hosts[i:i +
                            chunk_size],  # noqa E203: incompatible with black
                error_messages_to_ignore=[
                    "Unknown node",
                    "The server was unable to communicate with the MOM to requeue or delete the job."
                    " The node has been deleted and all jobs on the node have been purged.",
                ],
            ))

    return succeeded_hosts
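A hypothetical usage sketch showing that delete_nodes returns the set of hosts that were actually removed (the hostnames are made up):

hosts_to_remove = ["ip-10-0-0-114", "ip-10-0-0-115"]
succeeded = delete_nodes(hosts_to_remove)
failed = set(hosts_to_remove) - succeeded  # hosts that qmgr could not delete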
Example #7
def wakeupSchedOn(hostname):
    log.info('Waking up scheduler on host %s', hostname)
    command = ("/opt/torque/bin/pbsnodes -x %s" % (hostname))

    sleep_time = 3
    times = 20
    host_state = None
    output = None  # keep a reference for the error log below in case the command itself fails
    while isHostInitState(host_state) and times > 0:
        try:
            output = check_command_output(command, log)
            # Ex.1: <Data><Node><name>ip-10-0-76-39</name><state>down,offline,MOM-list-not-sent</state><power_state>Running</power_state>
            #        <np>1</np><ntype>cluster</ntype><mom_service_port>15002</mom_service_port><mom_manager_port>15003</mom_manager_port></Node></Data>
            # Ex 2: <Data><Node><name>ip-10-0-76-39</name><state>free</state><power_state>Running</power_state><np>1</np><ntype>cluster</ntype>
            #        <status>rectime=1527799181,macaddr=02:e4:00:b0:b1:72,cpuclock=Fixed,varattr=,jobs=,state=free,netload=210647044,gres=,loadave=0.00,
            #        ncpus=1,physmem=1017208kb,availmem=753728kb,totmem=1017208kb,idletime=856,nusers=1,nsessions=1,sessions=19698,
            #        uname=Linux ip-10-0-76-39 4.9.75-25.55.amzn1.x86_64 #1 SMP Fri Jan 5 23:50:27 UTC 2018 x86_64,opsys=linux</status>
            #        <mom_service_port>15002</mom_service_port><mom_manager_port>15003</mom_manager_port></Node></Data>
            xmlnode = ElementTree.XML(output)
            host_state = xmlnode.findtext("./Node/state")
        except Exception:
            log.error("Error parsing XML from %s" % output)

        if isHostInitState(host_state):
            log.debug("Host %s is still in state %s" % (hostname, host_state))
            time.sleep(sleep_time)
            times -= 1

    if host_state == "free":
        command = "/opt/torque/bin/qmgr -c \"set server scheduling=true\""
        run_command(command, log, raise_on_error=False)
    elif times == 0:
        log.error("Host %s is still in state %s" % (hostname, host_state))
    else:
        log.debug("Host %s is in state %s" % (hostname, host_state))
Example #8
def removeHost(hostname, cluster_user, max_cluster_size):
    log.info('Removing %s', hostname)

    command = ('/opt/torque/bin/pbsnodes -o %s' % hostname)
    run_command(command, log, raise_on_error=False)

    command = ("/opt/torque/bin/qmgr -c 'delete node %s'" % hostname)
    run_command(command, log, raise_on_error=False)
Example #9
def lockHost(hostname, unlock=False):
    # https://lists.sdsc.edu/pipermail/npaci-rocks-discussion/2007-November/027919.html
    mod = '-c' if unlock else '-o'
    command = ['/opt/torque/bin/pbsnodes', mod, hostname]
    try:
        run_command(command, log)
    except subprocess.CalledProcessError:
        log.error("Error %s host %s", "unlocking" if unlock else "locking", hostname)
Example #10
def run_sge_command(command):
    """
    Execute an SGE shell command, exporting the appropriate environment.

    :param command: command to execute
    :raise: subprocess.CalledProcessError if the command fails
    """
    command = _prepend_sge_bin_dir(command)
    run_command(command, SGE_ENV)
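Neither SGE_ENV nor _prepend_sge_bin_dir is shown in these examples. A minimal sketch, assuming they simply point at a local SGE installation; the paths are illustrative, not the original values:

import os

SGE_BIN_DIR = "/opt/sge/bin/lx-amd64/"  # assumed location of the SGE binaries
SGE_ENV = dict(os.environ, SGE_ROOT="/opt/sge")  # assumed environment for SGE commands


def _prepend_sge_bin_dir(command):
    # Accept either a shell string or an argument list, mirroring run_command.
    if isinstance(command, str):
        return SGE_BIN_DIR + command
    return [SGE_BIN_DIR + command[0]] + list(command[1:])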
Example #11
def lockHost(hostname, unlock=False):
    # https://lists.sdsc.edu/pipermail/npaci-rocks-discussion/2007-November/027919.html
    mod = '-c' if unlock else '-o'
    command = ['/opt/torque/bin/pbsnodes', mod, hostname]
    try:
        run_command(command, log)
    except subprocess.CalledProcessError:
        log.error("Error %s host %s", "unlocking" if unlock else "locking",
                  hostname)
Example #12
def run_sge_command(command, log):
    """
    Execute an SGE shell command, exporting the appropriate environment.

    :param command: command to execute
    :param log: logger
    :raise: subprocess.CalledProcessError if the command fails
    """
    command = _prepend_sge_bin_dir(command)
    run_command(command, log, SGE_ENV)
Example #13
def lockHost(hostname, unlock=False):
    # hostname format: ip-10-0-0-114.eu-west-1.compute.internal
    hostname = hostname.split(".")[0]
    mod = "-c" if unlock else "-o"
    command = [TORQUE_BIN_DIR + "pbsnodes", mod, hostname]
    try:
        run_command(command)
    except subprocess.CalledProcessError:
        log.error("Error %s host %s", "unlocking" if unlock else "locking",
                  hostname)
Example #14
def _restart_master_node():
    log.info("Restarting slurm on master node")
    if os.path.isfile("/etc/systemd/system/slurmctld.service"):
        command = ["sudo", "systemctl", "restart", "slurmctld.service"]
    else:
        command = ["/etc/init.d/slurm", "restart"]
    try:
        run_command(command, log)
    except Exception as e:
        log.error("Failed when restarting slurm daemon on master node with exception %s", e)
        raise
Example #15
def _update_master_np(max_nodes, node_slots):
    """Master np is dynamically based on the number of compute nodes that join the cluster."""
    current_nodes_count = len(
        check_command_output("cat /var/spool/torque/server_priv/nodes").strip(
        ).splitlines()) - 1
    # If cluster is at max size set the master np to 1 since 0 is not allowed.
    master_node_np = max(1, (max_nodes - current_nodes_count) * node_slots)
    master_hostname = check_command_output("hostname")
    logging.info("Setting master np to: %d", master_node_np)
    run_command(TORQUE_BIN_DIR +
                'qmgr -c "set node {hostname} np = {slots}"'.format(
                    hostname=master_hostname, slots=master_node_np))
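The arithmetic above can be illustrated with made-up numbers. Assuming max_nodes=10, node_slots=4 and a nodes file containing 4 lines:

# current_nodes_count = 4 - 1 = 3
# master_node_np      = max(1, (10 - 3) * 4) = 28
# If current_nodes_count reached max_nodes, the product would be 0, so np is clamped to 1.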
Example #16
def launch(benchmark_script, benchmark_script_args, with_nvprof=False):
    """
    If with_nvprof is True, the following command is launched first to
    get the gpu_time:
        nvprof python benchmark_script benchmark_script_args

    Then the normal testing command will be launched:
        python benchmark_script benchmark_script_args
    """
    cmd = "{} {} {}".format(sys.executable, benchmark_script,
                            " ".join(benchmark_script_args))
    if with_nvprof:
        stdout, exit_code = _nvprof(cmd)
        if exit_code == 0:
            parse_status, gpu_time = _parse_nvprof_logs(stdout.split("\n"))
        else:
            parse_status = False
        if parse_status:
            return gpu_time
        else:
            print("Runing Error:\n {}".format(stdout))
    else:
        stdout, exit_code = utils.run_command(cmd)
        print(stdout)
        if exit_code != 0:
            sys.exit(exit_code)
    return 0.0
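A hypothetical invocation of launch (the script name and arguments are made up): with with_nvprof=True it returns the GPU time parsed from the nvprof output (0.0 if parsing fails), otherwise it runs the script normally and returns 0.0.

gpu_time = launch("benchmark_op.py", ["--batch_size", "32"], with_nvprof=True)
print("GPU time: {}".format(gpu_time))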
Example #17
def _remove_keys_from_known_hosts(hostnames, host_keys_file, user):
    for hostname in hostnames:
        command = "ssh-keygen -R " + hostname + " -f " + host_keys_file
        run_command(command, raise_on_error=False, execute_as_user=user)
        command = "ssh-keygen -R " + hostname + ". -f " + host_keys_file
        run_command(command, raise_on_error=False, execute_as_user=user)
        command = "ssh-keygen -R " + socket.gethostbyname(
            hostname) + " -f " + host_keys_file
        run_command(command, raise_on_error=False, execute_as_user=user)
Example #18
def addHost(hostname, cluster_user, slots, max_cluster_size):
    log.info("Adding %s with %s slots" % (hostname, slots))

    command = "/opt/torque/bin/qmgr -c 'create node %s np=%s'" % (hostname, slots)
    run_command(command, raise_on_error=False)

    command = "/opt/torque/bin/pbsnodes -c %s" % hostname
    run_command(command, raise_on_error=False)

    # Connect and hostkey
    ssh = paramiko.SSHClient()
    ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    hosts_key_file = os.path.expanduser("~" + cluster_user) + "/.ssh/known_hosts"
    user_key_file = os.path.expanduser("~" + cluster_user) + "/.ssh/id_rsa"
    iter = 0
    connected = False
    while iter < 3 and not connected:
        try:
            log.info("Connecting to host: %s iter: %d" % (hostname, iter))
            ssh.connect(hostname, username=cluster_user, key_filename=user_key_file)
            connected = True
        except socket.error as e:
            log.info("Socket error: %s" % e)
            time.sleep(10 + iter)
            iter = iter + 1
            if iter == 3:
                log.info("Unable to provison host")
                return
    try:
        ssh.load_host_keys(hosts_key_file)
    except IOError:
        ssh._host_keys_filename = None
    ssh.save_host_keys(hosts_key_file)
    ssh.close()

    wakeupSchedOn(hostname)
Example #19
def launch(benchmark_script, benchmark_script_args, with_nvprof=False):
    cmd = "{} {} {}".format(sys.executable, benchmark_script,
                            " ".join(benchmark_script_args))
    if with_nvprof:
        stdout, exit_code = _nvprof(cmd)
        if exit_code == 0:
            return _parse_nvprof_logs(stdout.split("\n"))
        else:
            print("Runing Error:\n {}".format(stdout))
    else:
        stdout, exit_code = utils.run_command(cmd)
        print(stdout)
        if exit_code != 0:
            sys.exit(exit_code)
    return 0.0
Example #20
def update_cluster_limits(max_nodes, node_slots):
    try:
        logging.info("Updating cluster limits: max_nodes=%d, node_slots=%d",
                     max_nodes, node_slots)
        run_command(TORQUE_BIN_DIR +
                    'qmgr -c "set queue batch resources_available.nodect={0}"'.
                    format(max_nodes))
        run_command(TORQUE_BIN_DIR +
                    'qmgr -c "set server resources_available.nodect={0}"'.
                    format(max_nodes))
        run_command(TORQUE_BIN_DIR +
                    'qmgr -c "set queue batch resources_max.ncpus={0}"'.format(
                        node_slots))
        _update_master_np(max_nodes, node_slots)
    except Exception as e:
        logging.error("Failed when updating cluster limits with exception %s.",
                      e)
Example #21
File: app.py  Project: tlkh/mini-dlperf
import streamlit as st
import time
import multiprocessing
import pandas as pd
from common import utils

st.sidebar.title("Mini-DLPerf")
st.sidebar.subheader("\nControls")
threads = st.sidebar.number_input("Threads",
                                  min_value=1,
                                  value=multiprocessing.cpu_count() - 2)
batch_size = st.sidebar.number_input("Batch Size", min_value=1, value=64)
ready = st.sidebar.checkbox("Ready to run!")

_ = utils.run_command("nvidia-smi nvlink -sc 0bz")

with st.spinner("Getting GPU info..."):

    @st.cache
    def app_get_gpu_info():
        return utils.get_gpu_info()

    st.markdown("GPU info:")
    st.json(app_get_gpu_info())

threads = str(threads)
batch_size = str(batch_size)

if ready:
    progress_bar = st.progress(0)
Example #22
def _nvprof(cmd):
    return utils.run_command("nvprof {}".format(cmd))
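The benchmark examples above rely on a different helper, utils.run_command, which callers unpack as a (stdout, exit_code) pair. A minimal sketch under that assumption; this is not the project's actual implementation:

import subprocess


def run_command(cmd):
    # Hypothetical sketch: run a shell command, capture combined stdout/stderr,
    # and return the decoded output together with the exit code.
    process = subprocess.Popen(cmd, shell=True,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.STDOUT)
    stdout, _ = process.communicate()
    return stdout.decode("utf-8", errors="replace"), process.returncode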