Example no. 1
def _get_node_slots():
    hostname = check_command_output("hostname")
    # retrieves number of slots for a specific node in the cluster.
    # Output format:
    # 4
    command = "/opt/slurm/bin/sinfo -o '%c' -n {0} -h".format(hostname)
    output = check_command_output(command)
    return int(output)
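The examples in this collection all call a check_command_output helper that is not shown. Purely as a rough sketch (not the real helper, whose signature varies across examples: some pass a logger, an environment, or raise_on_error/log_error flags), it could be a thin wrapper around subprocess:

# Hypothetical sketch of the check_command_output helper used throughout
# these examples; the real implementation is not shown and may differ.
import subprocess


def check_command_output(command, env=None, raise_on_error=True):
    """Run a command (string or argument list) and return its decoded stdout."""
    try:
        return subprocess.check_output(
            command,
            shell=isinstance(command, str),
            env=env,
            stderr=subprocess.STDOUT,
            universal_newlines=True,
        )
    except subprocess.CalledProcessError:
        if raise_on_error:
            raise
        return ""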
Example no. 2
def _update_master_np(max_nodes, node_slots):
    """Master np is dynamically based on the number of compute nodes that join the cluster."""
    current_nodes_count = len(
        check_command_output("cat /var/spool/torque/server_priv/nodes").strip(
        ).splitlines()) - 1
    # If cluster is at max size set the master np to 1 since 0 is not allowed.
    master_node_np = max(1, (max_nodes - current_nodes_count) * node_slots)
    master_hostname = check_command_output("hostname")
    logging.info("Setting master np to: %d", master_node_np)
    run_command(TORQUE_BIN_DIR +
                'qmgr -c "set node {hostname} np = {slots}"'.format(
                    hostname=master_hostname, slots=master_node_np))
Example no. 3
def hasPendingJobs():
    command = "/opt/torque/bin/qstat -Q"

    # Command outputs the status of the queue in the following format
    # Queue              Max    Tot   Ena   Str   Que   Run   Hld   Wat   Trn   Ext T   Cpt
    # ----------------   ---   ----    --    --   ---   ---   ---   ---   ---   --- -   ---
    # batch                0     24   yes   yes    24     0     0     0     0     0 E     0
    # test1                0     26   yes   yes    26     0     0     0     0     0 E     0
    try:
        output = check_command_output(command, log)
        lines = list(filter(None, output.split("\n")))
        if len(lines) < 3:
            log.error("Unable to check pending jobs. The command '%s' does not return a valid output", command)
            raise CriticalError

        pending = 0
        for idx, line in enumerate(lines):
            if idx < 2:
                continue
            queue_status = line.split()
            pending += int(queue_status[5])

        has_pending = pending > 0
        error = False
    except (subprocess.CalledProcessError, CriticalError):
        error = True
        has_pending = False

    return has_pending, error
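The queued-job count above boils down to summing the sixth column ('Que') of every data row. A standalone sketch of that parsing step, run against the sample output quoted in the comment rather than a live scheduler:

# Offline check of the qstat -Q column parsing, using the sample output
# from the comment above.
SAMPLE_QSTAT_Q = """\
Queue              Max    Tot   Ena   Str   Que   Run   Hld   Wat   Trn   Ext T   Cpt
----------------   ---   ----    --    --   ---   ---   ---   ---   ---   --- -   ---
batch                0     24   yes   yes    24     0     0     0     0     0 E     0
test1                0     26   yes   yes    26     0     0     0     0     0 E     0
"""

lines = [line for line in SAMPLE_QSTAT_Q.split("\n") if line]
pending = sum(int(line.split()[5]) for line in lines[2:])
print(pending)  # 50: the 'Que' column of both queues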
Example no. 4
def _qmgr_manage_nodes(operation,
                       hosts,
                       error_messages_to_ignore,
                       additional_qmgr_args=""):
    if not hosts:
        return set()

    hostnames = ",".join(hosts)
    command = TORQUE_BIN_DIR + 'qmgr -c "{operation} node {hostnames} {additional_args}"'.format(
        operation=operation,
        hostnames=hostnames,
        additional_args=additional_qmgr_args)
    try:
        output = check_command_output(command, log_error=False)
    except subprocess.CalledProcessError as e:
        if not hasattr(e, "output") or not e.output or e.output == "":
            logging.error(
                "Failed when executing operation %s on nodes %s with error %s",
                operation, hostnames, e)
            return set()
        else:
            output = e.output
    except Exception as e:
        logging.error(
            "Failed when executing operation %s on nodes %s with error %s",
            operation, hostnames, e)
        return set()

    return _qmgr_process_command_output(operation, hosts,
                                        error_messages_to_ignore, output)
Example no. 5
def is_node_down():
    """
    Check if node is down according to scheduler.

    The node is considered as down if:
    - there is a failure contacting the scheduler
    - node is not reported in the compute nodes list
    - node is in one of the SGE_ERROR_STATES states
    """
    try:
        hostname = check_command_output("hostname").strip()
        host_fqdn = socket.getfqdn(hostname)
        nodes = get_compute_nodes_info(hostname_filter=hostname)
        if not any(host in nodes for host in [hostname, host_fqdn]):
            log.warning("Node is not attached to scheduler. Reporting as down")
            return True

        node = nodes.get(host_fqdn, nodes.get(hostname))
        log.info("Node is in state: '{0}'".format(node.state))
        if all(error_state not in node.state for error_state in SGE_ERROR_STATES):
            return False
    except Exception as e:
        log.error("Failed when checking if node is down with exception %s. Reporting node as down.", e)

    return True
Example no. 6
def get_busy_nodes(instance_properties):
    command = "/opt/torque/bin/pbsnodes -x"
    # The output of the command
    # <?xml version="1.0" encoding="UTF-8"?>
    # <Data>
    #    <Node>
    #       <name>ip-172-31-11-1</name>
    #       <state>down</state>
    #       <power_state>Running</power_state>
    #       <np>1000</np>
    #       <ntype>cluster</ntype>
    #       <jobs>"job-id"</jobs>
    #       <note>MasterServer</note>
    #       <mom_service_port>15002</mom_service_port>
    #       <mom_manager_port>15003</mom_manager_port>
    #    </Node>
    # </Data>
    _output = check_command_output(command, log)
    root = ElementTree.fromstring(_output)
    count = 0
    # See how many nodes have jobs
    for node in root.findall('Node'):
        if len(node.findall('jobs')) != 0:
            count += 1
    return count
Example no. 7
def hasPendingJobs():
    command = "/opt/torque/bin/qstat -Q"

    # Command outputs the status of the queue in the following format
    # Queue              Max    Tot   Ena   Str   Que   Run   Hld   Wat   Trn   Ext T   Cpt
    # ----------------   ---   ----    --    --   ---   ---   ---   ---   ---   --- -   ---
    # batch                0     24   yes   yes    24     0     0     0     0     0 E     0
    # test1                0     26   yes   yes    26     0     0     0     0     0 E     0
    try:
        output = check_command_output(command, log)
        lines = list(filter(None, output.split("\n")))
        if len(lines) < 3:
            log.error(
                "Unable to check pending jobs. The command '%s' does not return a valid output",
                command)
            raise CriticalError

        pending = 0
        for idx, line in enumerate(lines):
            if idx < 2:
                continue
            queue_status = line.split()
            pending += int(queue_status[5])

        has_pending = pending > 0
        error = False
    except (subprocess.CalledProcessError, CriticalError):
        error = True
        has_pending = False

    return has_pending, error
Example no. 8
def wakeupSchedOn(hostname):
    log.info('Waking up scheduler on host %s', hostname)
    command = ("/opt/torque/bin/pbsnodes -x %s" % (hostname))

    sleep_time = 3
    times = 20
    host_state = None
    while isHostInitState(host_state) and times > 0:
        output = ""
        try:
            output = check_command_output(command, log)
            # Ex.1: <Data><Node><name>ip-10-0-76-39</name><state>down,offline,MOM-list-not-sent</state><power_state>Running</power_state>
            #        <np>1</np><ntype>cluster</ntype><mom_service_port>15002</mom_service_port><mom_manager_port>15003</mom_manager_port></Node></Data>
            # Ex 2: <Data><Node><name>ip-10-0-76-39</name><state>free</state><power_state>Running</power_state><np>1</np><ntype>cluster</ntype>
            #        <status>rectime=1527799181,macaddr=02:e4:00:b0:b1:72,cpuclock=Fixed,varattr=,jobs=,state=free,netload=210647044,gres=,loadave=0.00,
            #        ncpus=1,physmem=1017208kb,availmem=753728kb,totmem=1017208kb,idletime=856,nusers=1,nsessions=1,sessions=19698,
            #        uname=Linux ip-10-0-76-39 4.9.75-25.55.amzn1.x86_64 #1 SMP Fri Jan 5 23:50:27 UTC 2018 x86_64,opsys=linux</status>
            #        <mom_service_port>15002</mom_service_port><mom_manager_port>15003</mom_manager_port></Node></Data>
            xmlnode = ElementTree.XML(output)
            host_state = xmlnode.findtext("./Node/state")
        except Exception:
            log.error("Error parsing XML from %s" % output)

        if isHostInitState(host_state):
            log.debug("Host %s is still in state %s" % (hostname, host_state))
            time.sleep(sleep_time)
            times -= 1

    if host_state == "free":
        command = "/opt/torque/bin/qmgr -c \"set server scheduling=true\""
        run_command(command, log, raise_on_error=False)
    elif times == 0:
        log.error("Host %s is still in state %s" % (hostname, host_state))
    else:
        log.debug("Host %s is in state %s" % (hostname, host_state))
Example no. 9
def get_busy_nodes():
    command = "/opt/torque/bin/pbsnodes -x"
    # The output of the command
    # <?xml version="1.0" encoding="UTF-8"?>
    # <Data>
    #    <Node>
    #       <name>ip-172-31-11-1</name>
    #       <state>down</state>
    #       <power_state>Running</power_state>
    #       <np>1000</np>
    #       <ntype>cluster</ntype>
    #       <jobs>"job-id"</jobs>
    #       <note>MasterServer</note>
    #       <mom_service_port>15002</mom_service_port>
    #       <mom_manager_port>15003</mom_manager_port>
    #    </Node>
    # </Data>
    _output = check_command_output(command)
    root = ElementTree.fromstring(_output)
    count = 0
    # See how many nodes have jobs
    for node in root.findall("Node"):
        if len(node.findall("jobs")) != 0:
            count += 1
    return count
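The XML walk above can be exercised offline against a document shaped like the one in the comment; a minimal, self-contained sketch (the XML declaration is omitted because ElementTree.fromstring rejects str input that carries an encoding declaration):

# Offline check of the pbsnodes -x parsing, using a sample document based
# on the comment above.
import xml.etree.ElementTree as ElementTree

SAMPLE_PBSNODES = """<Data>
   <Node>
      <name>ip-172-31-11-1</name>
      <state>down</state>
      <np>1000</np>
      <jobs>"job-id"</jobs>
      <note>MasterServer</note>
   </Node>
   <Node>
      <name>ip-172-31-11-2</name>
      <state>free</state>
      <np>4</np>
   </Node>
</Data>"""

root = ElementTree.fromstring(SAMPLE_PBSNODES)
busy = sum(1 for node in root.findall("Node") if node.findall("jobs"))
print(busy)  # 1: only the first node carries a <jobs> element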
Example no. 10
def get_required_nodes(instance_properties, max_size):
    command = "/opt/torque/bin/qstat -at"

    # Example output of torque
    #                                                                                   Req'd       Req'd       Elap
    # Job ID                  Username    Queue    Jobname          SessID  NDS   TSK   Memory      Time    S   Time
    # ----------------------- ----------- -------- ---------------- ------ ----- ------ --------- --------- - ---------
    # 0.ip-172-31-11-1.ec2.i  centos      batch    job.sh             5343     5     30       --   01:00:00 Q  00:04:58
    # 1.ip-172-31-11-1.ec2.i  centos      batch    job.sh             5340     3      6       --   01:00:00 R  00:08:14
    # 2.ip-172-31-11-1.ec2.i  centos      batch    job.sh             5387     2      4       --   01:00:00 R  00:08:27

    status = ["Q"]
    _output = check_command_output(command)
    output = _output.split("\n")[5:]
    slots_requested = []
    nodes_requested = []
    for line in output:
        line_arr = line.split()
        if len(line_arr) >= 10 and line_arr[9] in status:
            # if a job has been looked at to account for pending nodes, don't look at it again
            slots_requested.append(int(line_arr[6]))
            nodes_requested.append(int(line_arr[5]))

    return get_optimal_nodes(nodes_requested, slots_requested,
                             instance_properties)
Example no. 11
def check_sge_command_output(command, log):
    """
    Execute SGE shell command, by exporting the appropriate environment.

    :param command: command to execute
    :param log: logger
    :raise: subprocess.CalledProcessError if the command fails
    """
    command = _prepend_sge_bin_dir(command)
    return check_command_output(command, log, SGE_ENV)
Example no. 12
def check_sge_command_output(command, log):
    """
    Execute SGE shell command, by exporting the appropriate environment.

    :param command: command to execute
    :param log: logger
    :raise: subprocess.CalledProcessError if the command fails
    """
    command = _prepend_sge_bin_dir(command)
    return check_command_output(command, log, SGE_ENV)
Example no. 13
def hasJobs(hostname):
    # Slurm won't use FQDN
    short_name = hostname.split(".")[0]
    # Checking for running jobs on the node
    command = ["/opt/slurm/bin/squeue", "-w", short_name, "-h"]
    try:
        output = check_command_output(command)
        has_jobs = output != ""
    except subprocess.CalledProcessError:
        has_jobs = False

    return has_jobs
Example no. 14
def hasJobs(hostname):
    # Slurm won't use FQDN
    short_name = hostname.split('.')[0]
    # Checking for running jobs on the node
    command = ['/opt/slurm/bin/squeue', '-w', short_name, '-h']
    try:
        output = check_command_output(command, log)
        has_jobs = output != ""
    except subprocess.CalledProcessError:
        has_jobs = False

    return has_jobs
Example no. 15
def hasJobs(hostname):
    # Slurm won't use FQDN
    short_name = hostname.split('.')[0]
    # Checking for running jobs on the node
    command = ['/opt/slurm/bin/squeue', '-w', short_name, '-h']
    try:
        output = check_command_output(command, log)
        has_jobs = output != ""
    except subprocess.CalledProcessError:
        has_jobs = False

    return has_jobs
Example no. 16
def check_sge_command_output(command, raise_on_error=True):
    """
    Execute SGE shell command, by exporting the appropriate environment.

    :param command: command to execute
    :param raise_on_error: if True the method raises subprocess.CalledProcessError on errors
    :raise subprocess.CalledProcessError if the command fails
    :return the stdout and stderr of the executed command.
    """
    command = _prepend_sge_bin_dir(command)
    return check_command_output(command,
                                SGE_ENV,
                                raise_on_error=raise_on_error)
Example no. 17
def get_jobs_info(job_state_filter=None):
    """
    Retrieve the list of submitted jobs.

    :param job_state_filter: filter jobs by the given state
    :return: a list of SlurmJob objects representing the submitted jobs.
    """
    command = "/opt/slurm/bin/squeue -r -o '%i|%t|%D|%C|%c|%r'"
    if job_state_filter:
        command += " --states {0}".format(job_state_filter)

    output = check_command_output(command)
    return SlurmJob.from_table(output)
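SlurmJob.from_table is not shown in these examples. Purely as an illustration of the "%i|%t|%D|%C|%c|%r" format (six pipe-separated fields: job id, state, node count, total CPUs, minimum CPUs per node, pending reason), a hypothetical parser could look like this:

from collections import namedtuple

# Hypothetical stand-in for the real SlurmJob class, which is not shown here.
SlurmJobSketch = namedtuple(
    "SlurmJobSketch",
    ["id", "state", "nodes", "cpus", "cpus_min_per_node", "pending_reason"],
)


def slurm_jobs_from_table(output):
    jobs = []
    for line in output.splitlines():
        if not line.strip():
            continue
        job_id, state, nodes, cpus, cpus_min, reason = line.split("|", 5)
        jobs.append(SlurmJobSketch(job_id, state, int(nodes), int(cpus), cpus_min, reason))
    return jobs


print(slurm_jobs_from_table("2|PD|1|24|1|Resources\n4|R|1|24|1|None"))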
Example no. 18
def has_jobs(hostname):
    # Slurm won't use FQDN
    short_name = hostname.split(".")[0]
    # Checking for running jobs on the node
    command = ["/opt/slurm/bin/squeue", "-w", short_name, "-h"]
    try:
        output = check_command_output(command)
        logging.info("Found the following running jobs:\n%s", output.rstrip())
        has_jobs = output != ""
    except subprocess.CalledProcessError:
        has_jobs = False

    return has_jobs
Example no. 19
def get_busy_nodes(instance_properties):
    command = "/opt/slurm/bin/sinfo -r -h -o '%D %t'"
    # Sample output:
    # 2 mix
    # 4 alloc
    # 10 idle
    output = check_command_output(command, log)
    nodes = 0
    output = output.split("\n")
    for line in output:
        line_arr = line.split()
        if len(line_arr) == 2 and (line_arr[1] in ['mix', 'alloc', 'drain', 'drain*']):
            nodes += int(line_arr[0])
    return nodes
Example no. 20
def get_busy_nodes(instance_properties):
    command = "/opt/slurm/bin/sinfo -r -h -o '%D %t'"
    # Sample output:
    # 2 mix
    # 4 alloc
    # 10 idle
    output = check_command_output(command, log)
    nodes = 0
    output = output.split("\n")
    for line in output:
        line_arr = line.split()
        if len(line_arr) == 2 and (line_arr[1]
                                   in ['mix', 'alloc', 'drain', 'drain*']):
            nodes += int(line_arr[0])
    return nodes
Example no. 21
def get_busy_nodes():
    command = "/opt/slurm/bin/sinfo -h -o '%D %t'"
    # Sample output:
    # 2 mix
    # 4 alloc
    # 10 idle
    # 1 down*
    output = check_command_output(command)
    logging.info("Found the following compute nodes:\n%s", output.rstrip())
    nodes = 0
    output = output.split("\n")
    for line in output:
        line_arr = line.split()
        if len(line_arr) == 2 and (line_arr[1] in ["mix", "alloc", "down", "down*"]):
            nodes += int(line_arr[0])
    return nodes
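The same counting logic can be checked in isolation against the sample sinfo output from the comment:

# Offline check of the sinfo "%D %t" parsing, using the sample output
# from the comment above.
SAMPLE_SINFO = "2 mix\n4 alloc\n10 idle\n1 down*"

busy = 0
for line in SAMPLE_SINFO.split("\n"):
    fields = line.split()
    if len(fields) == 2 and fields[1] in ["mix", "alloc", "down", "down*"]:
        busy += int(fields[0])
print(busy)  # 7: 2 mix + 4 alloc + 1 down*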
Example no. 22
def hasPendingJobs():
    command = "/opt/slurm/bin/squeue -t PD --noheader -o '%r'"

    # Command outputs the pending jobs in the queue in the following format
    #  Resources
    #  Priority
    #  PartitionNodeLimit
    try:
        output = check_command_output(command, log)
        has_pending = any(reason in PENDING_RESOURCES_REASONS for reason in output.split("\n"))
        error = False
    except subprocess.CalledProcessError:
        error = True
        has_pending = False

    return has_pending, error
Example no. 23
def is_node_down():
    """Check if node is down according to scheduler"""
    try:
        # retrieves the state of a specific node
        # https://slurm.schedmd.com/sinfo.html#lbAG
        # Output format:
        # down*
        command = "/bin/bash -c \"/opt/slurm/bin/sinfo --noheader -o '%T' -n $(hostname)\""
        output = check_command_output(command).strip()
        log.info("Node is in state: '{0}'".format(output))
        if output and all(state not in output for state in ["down", "drained", "fail"]):
            return False
    except Exception as e:
        log.error("Failed when checking if node is down with exception %s. Reporting node as down.", e)

    return True
Example no. 24
def get_busy_nodes():
    command = "/opt/slurm/bin/sinfo -h -o '%D %t'"
    # Sample output:
    # 2 mix
    # 4 alloc
    # 10 idle
    # 1 down*
    output = check_command_output(command)
    nodes = 0
    output = output.split("\n")
    for line in output:
        line_arr = line.split()
        if len(line_arr) == 2 and (line_arr[1] in [
                "mix", "alloc", "drain", "drain*", "down", "down*"
        ]):
            nodes += int(line_arr[0])
    return nodes
Example no. 25
def is_node_down():
    """Check if node is down according to scheduler"""
    try:
        hostname = check_command_output("hostname").strip()
        node = get_compute_nodes_info(hostname_filter=[hostname]).get(hostname)
        if node:
            log.info("Node is in state: '{0}'".format(node.state))
            if all(error_state not in node.state
                   for error_state in TORQUE_NODE_ERROR_STATES):
                return False
        else:
            log.warning("Node is not attached to scheduler. Reporting as down")
    except Exception as e:
        log.error(
            "Failed when checking if node is down with exception %s. Reporting node as down.",
            e)

    return True
Example no. 26
def hasPendingJobs():
    command = "/opt/slurm/bin/squeue -t PD --noheader -o '%r'"

    # Command outputs the pending jobs in the queue in the following format
    #  Resources
    #  Priority
    #  PartitionNodeLimit
    try:
        output = check_command_output(command, log)
        has_pending = any(reason in PENDING_RESOURCES_REASONS
                          for reason in output.split("\n"))
        error = False
    except subprocess.CalledProcessError:
        error = True
        has_pending = False

    return has_pending, error
Example no. 27
def get_compute_nodes_info(hostname_filter=None):
    command = TORQUE_BIN_DIR + "pbsnodes -x"
    if hostname_filter:
        command += " {0}".format(" ".join(hostname_filter))

    output = check_command_output(command, raise_on_error=False)
    if output.startswith("<Data>"):
        root = ElementTree.fromstring(output)
        nodes = root.findall("./Node")
        nodes_list = [
            TorqueHost.from_xml(ElementTree.tostring(node)) for node in nodes
        ]
        return dict((node.name, node) for node in nodes_list
                    if node.note != "MasterServer")
    else:
        if output != "":
            logging.warning("Failed when running command %s with error %s",
                            command, output)
        return dict()
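TorqueHost.from_xml is also not shown. As a hypothetical sketch only, a wrapper for a single <Node> element of the pbsnodes -x document could pull out the handful of fields these examples rely on (name, state, np, jobs, note):

import xml.etree.ElementTree as ElementTree
from collections import namedtuple

# Hypothetical stand-in for the real TorqueHost class, which is not shown here.
TorqueHostSketch = namedtuple("TorqueHostSketch", ["name", "state", "np", "jobs", "note"])


def torque_host_from_xml(node_xml):
    node = ElementTree.fromstring(node_xml)
    return TorqueHostSketch(
        name=node.findtext("name"),
        state=node.findtext("state", default=""),
        np=int(node.findtext("np", default="0")),
        jobs=node.findtext("jobs", default=""),
        note=node.findtext("note", default=""),
    )


print(torque_host_from_xml("<Node><name>ip-172-31-11-1</name><state>free</state><np>4</np></Node>"))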
Example no. 28
def get_jobs_info(filter_by_states=None, filter_by_exec_hosts=None):
    command = TORQUE_BIN_DIR + "qstat -t -x"
    output = check_command_output(command)
    if not output:
        return []

    root = ElementTree.fromstring(output)
    jobs = root.findall("./Job")
    jobs_list = []
    for job in jobs:
        parsed_job = TorqueJob.from_xml(ElementTree.tostring(job))
        if filter_by_states and parsed_job.state not in filter_by_states:
            continue
        if filter_by_exec_hosts:
            if any(host in parsed_job.exec_hosts
                   for host in filter_by_exec_hosts):
                jobs_list.append(parsed_job)
        else:
            jobs_list.append(parsed_job)

    return jobs_list
Example no. 29
def get_required_nodes(instance_properties):
    log.info("Computing number of required nodes for submitted jobs")
    command = "/opt/slurm/bin/squeue -r -h -o '%i-%t-%D-%C-%r'"
    # Example output of squeue
    # 1-PD-1-24-Nodes required for job are DOWN, DRAINED or reserved for jobs in higher priority partitions
    # 2-PD-1-24-Licenses
    # 3-PD-1-24-PartitionNodeLimit
    # 4-R-1-24-
    output = check_command_output(command, log)
    slots_requested = []
    nodes_requested = []
    output = output.split("\n")
    for line in output:
        line_arr = line.split("-")
        if len(line_arr) == 5 and line_arr[1] == 'PD':
            if line_arr[4] in PENDING_RESOURCES_REASONS:
                slots_requested.append(int(line_arr[3]))
                nodes_requested.append(int(line_arr[2]))
            else:
                log.info("Skipping pending job %s due to pending reason: %s", line_arr[0], line_arr[4])

    return get_optimal_nodes(nodes_requested, slots_requested, instance_properties)
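The per-line parsing above can be exercised on squeue-style lines in the same "%i-%t-%D-%C-%r" format. The sample lines and the PENDING_RESOURCES_REASONS subset below are illustrative only, not taken from a live cluster:

# Offline check of the squeue line parsing; only jobs pending for a
# resource-related reason contribute to the requested nodes/slots.
SAMPLE_SQUEUE = "1-PD-2-48-Resources\n2-PD-1-24-Licenses\n4-R-1-24-"
PENDING_RESOURCES_REASONS = ["Resources"]  # illustrative subset of the real constant

slots_requested, nodes_requested = [], []
for line in SAMPLE_SQUEUE.split("\n"):
    fields = line.split("-")
    if len(fields) == 5 and fields[1] == "PD" and fields[4] in PENDING_RESOURCES_REASONS:
        slots_requested.append(int(fields[3]))
        nodes_requested.append(int(fields[2]))
print(nodes_requested, slots_requested)  # [2] [48]: only the job pending on Resources counts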
Example no. 30
def get_required_nodes(instance_properties):
    command = "/opt/torque/bin/qstat -at"

    # Example output of torque
    #                                                                                   Req'd       Req'd       Elap
    # Job ID                  Username    Queue    Jobname          SessID  NDS   TSK   Memory      Time    S   Time
    # ----------------------- ----------- -------- ---------------- ------ ----- ------ --------- --------- - ---------
    # 0.ip-172-31-11-1.ec2.i  centos      batch    job.sh             5343     5     30       --   01:00:00 Q  00:04:58
    # 1.ip-172-31-11-1.ec2.i  centos      batch    job.sh             5340     3      6       --   01:00:00 R  00:08:14
    # 2.ip-172-31-11-1.ec2.i  centos      batch    job.sh             5387     2      4       --   01:00:00 R  00:08:27

    status = ['Q']
    _output = check_command_output(command, log)
    output = _output.split("\n")[5:]
    slots_requested = []
    nodes_requested = []
    for line in output:
        line_arr = line.split()
        if len(line_arr) >= 10 and line_arr[9] in status:
            # if a job has been looked at to account for pending nodes, don't look at it again
            slots_requested.append(int(line_arr[6]))
            nodes_requested.append(int(line_arr[5]))

    return get_optimal_nodes(nodes_requested, slots_requested, instance_properties)
Example no. 31
def get_required_nodes(instance_properties):
    log.info("Computing number of required nodes for submitted jobs")
    command = "/opt/slurm/bin/squeue -r -h -o '%i-%t-%D-%C-%r'"
    # Example output of squeue
    # 1-PD-1-24-Nodes required for job are DOWN, DRAINED or reserved for jobs in higher priority partitions
    # 2-PD-1-24-Licenses
    # 3-PD-1-24-PartitionNodeLimit
    # 4-R-1-24-
    output = check_command_output(command, log)
    slots_requested = []
    nodes_requested = []
    output = output.split("\n")
    for line in output:
        line_arr = line.split("-")
        if len(line_arr) == 5 and line_arr[1] == 'PD':
            if line_arr[4] in PENDING_RESOURCES_REASONS:
                slots_requested.append(int(line_arr[3]))
                nodes_requested.append(int(line_arr[2]))
            else:
                log.info("Skipping pending job %s due to pending reason: %s",
                         line_arr[0], line_arr[4])

    return get_optimal_nodes(nodes_requested, slots_requested,
                             instance_properties)
Example no. 32
def _qmgr_manage_nodes(operation,
                       hosts,
                       error_messages_to_ignore,
                       additional_qmgr_args=""):
    if not hosts:
        return set()

    hostnames = ",".join(hosts)
    command = TORQUE_BIN_DIR + 'qmgr -c "{operation} node {hostnames} {additional_args}"'.format(
        operation=operation,
        hostnames=hostnames,
        additional_args=additional_qmgr_args)
    try:
        output = check_command_output(command, log_error=False)
    except subprocess.CalledProcessError as e:
        if not hasattr(e, "output") or not e.output or e.output == "":
            logging.error(
                "Failed when executing operation %s on nodes %s with error %s",
                operation, hostnames, e)
            return set()
        else:
            output = e.output
    except Exception as e:
        logging.error(
            "Failed when executing operation %s on nodes %s with error %s",
            operation, hostnames, e)
        return set()

    succeeded_hosts = set(hosts)
    # analyze command output to understand if failure can be ignored (e.g. already existing node)
    for error_message in output.splitlines():
        match = re.match(r"qmgr obj=(?P<host>.*) svr=default: (?P<error>.*)",
                         error_message)
        if not match:
            # assume unexpected error and mark all as failed
            logging.error(
                "Failed when executing operation %s on nodes %s with error %s",
                operation, hostnames, output)
            return set()

        host, error = match.groups()
        if any(error.strip() == message_to_ignore
               for message_to_ignore in error_messages_to_ignore):
            logging.warning(
                "Marking %s operation on node %s as succeeded because of ignored error message %s",
                operation,
                host,
                error,
            )
            continue

        try:
            logging.error(
                "Failed when executing operation %s on node %s with error %s",
                operation, host, error_message)
            succeeded_hosts.remove(host)
        except Exception as e:
            logging.error(
                "Failed to extract host from error message while adding nodes. Mark all as failed. Output was %s.\n"
                "Exception was %s",
                output,
                e,
            )

    return succeeded_hosts