def _get_node_slots():
    hostname = check_command_output("hostname")
    # Retrieves the number of slots for a specific node in the cluster.
    # Output format:
    # 4
    command = "/opt/slurm/bin/sinfo -o '%c' -n {0} -h".format(hostname)
    output = check_command_output(command)
    return int(output)

def _update_master_np(max_nodes, node_slots):
    """Master np is dynamically set based on the number of compute nodes that join the cluster."""
    # The master node is also listed in the Torque nodes file, so subtract 1 to count compute nodes only.
    current_nodes_count = (
        len(check_command_output("cat /var/spool/torque/server_priv/nodes").strip().splitlines()) - 1
    )
    # If cluster is at max size set the master np to 1 since 0 is not allowed.
    # e.g. max_nodes=10, 4 compute nodes joined, 8 slots per node -> np = (10 - 4) * 8 = 48;
    # at max size the product would be 0, so np is clamped to 1.
    master_node_np = max(1, (max_nodes - current_nodes_count) * node_slots)
    master_hostname = check_command_output("hostname")
    logging.info("Setting master np to: %d", master_node_np)
    run_command(
        TORQUE_BIN_DIR
        + 'qmgr -c "set node {hostname} np = {slots}"'.format(hostname=master_hostname, slots=master_node_np)
    )

def hasPendingJobs():
    command = "/opt/torque/bin/qstat -Q"
    # Command outputs the status of the queues in the following format
    # Queue              Max   Tot   Ena   Str   Que   Run   Hld   Wat   Trn   Ext T   Cpt
    # ----------------   ---   ----  ---   ---   ---   ---   ---   ---   ---   --- -   ---
    # batch                0     24  yes   yes    24     0     0     0     0     0 E     0
    # test1                0     26  yes   yes    26     0     0     0     0     0 E     0
    try:
        output = check_command_output(command, log)
        lines = list(filter(None, output.split("\n")))
        if len(lines) < 3:
            log.error("Unable to check pending jobs. The command '%s' does not return a valid output", command)
            raise CriticalError

        pending = 0
        for idx, line in enumerate(lines):
            # Skip the two header lines, then sum the Que column (index 5) across all queues.
            if idx < 2:
                continue
            queue_status = line.split()
            pending += int(queue_status[5])

        has_pending = pending > 0
        error = False
    except (subprocess.CalledProcessError, CriticalError):
        error = True
        has_pending = False

    return has_pending, error

def _qmgr_manage_nodes(operation, hosts, error_messages_to_ignore, additional_qmgr_args=""):
    if not hosts:
        return set()

    hostnames = ",".join(hosts)
    command = TORQUE_BIN_DIR + 'qmgr -c "{operation} node {hostnames} {additional_args}"'.format(
        operation=operation, hostnames=hostnames, additional_args=additional_qmgr_args
    )
    try:
        output = check_command_output(command, log_error=False)
    except subprocess.CalledProcessError as e:
        if not hasattr(e, "output") or not e.output or e.output == "":
            logging.error("Failed when executing operation %s on nodes %s with error %s", operation, hostnames, e)
            return set()
        else:
            output = e.output
    except Exception as e:
        logging.error("Failed when executing operation %s on nodes %s with error %s", operation, hostnames, e)
        return set()

    return _qmgr_process_command_output(operation, hosts, error_messages_to_ignore, output)

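# NOTE: _qmgr_process_command_output, called above, is not included in this collection.
# The following is a minimal sketch of what it could look like, inferred from the inline
# error handling of the longer _qmgr_manage_nodes variant further down; treat the exact
# behavior as an assumption rather than the actual implementation.
import logging
import re


def _qmgr_process_command_output(operation, hosts, error_messages_to_ignore, output):
    succeeded_hosts = set(hosts)
    for error_message in output.splitlines():
        # qmgr reports per-host failures as: "qmgr obj=<host> svr=default: <error>"
        match = re.match(r"qmgr obj=(?P<host>.*) svr=default: (?P<error>.*)", error_message)
        if not match:
            # Unexpected output: conservatively mark the whole batch as failed.
            logging.error("Failed when executing operation %s on nodes %s with error %s", operation, hosts, output)
            return set()
        host, error = match.groups()
        if error.strip() in error_messages_to_ignore:
            # Benign error (e.g. node already exists): keep the host in the succeeded set.
            continue
        logging.error("Failed when executing operation %s on node %s with error %s", operation, host, error_message)
        succeeded_hosts.discard(host)
    return succeeded_hosts
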
def is_node_down():
    """
    Check if the node is down according to the scheduler.

    The node is considered down if:
    - there is a failure contacting the scheduler
    - the node is not reported in the compute nodes list
    - the node is in one of the SGE_ERROR_STATES states
    """
    try:
        hostname = check_command_output("hostname").strip()
        host_fqdn = socket.getfqdn(hostname)
        nodes = get_compute_nodes_info(hostname_filter=hostname)
        if not any(host in nodes for host in [hostname, host_fqdn]):
            log.warning("Node is not attached to scheduler. Reporting as down")
            return True

        node = nodes.get(host_fqdn, nodes.get(hostname))
        log.info("Node is in state: '{0}'".format(node.state))
        if all(error_state not in node.state for error_state in SGE_ERROR_STATES):
            return False
    except Exception as e:
        log.error("Failed when checking if node is down with exception %s. Reporting node as down.", e)

    return True

def get_busy_nodes(instance_properties):
    command = "/opt/torque/bin/pbsnodes -x"
    # The output of the command:
    # <?xml version="1.0" encoding="UTF-8"?>
    # <Data>
    #    <Node>
    #       <name>ip-172-31-11-1</name>
    #       <state>down</state>
    #       <power_state>Running</power_state>
    #       <np>1000</np>
    #       <ntype>cluster</ntype>
    #       <jobs>"job-id"</jobs>
    #       <note>MasterServer</note>
    #       <mom_service_port>15002</mom_service_port>
    #       <mom_manager_port>15003</mom_manager_port>
    #    </Node>
    # </Data>
    _output = check_command_output(command, log)
    root = ElementTree.fromstring(_output)
    count = 0
    # See how many nodes have jobs
    for node in root.findall('Node'):
        if len(node.findall('jobs')) != 0:
            count += 1
    return count

def hasPendingJobs():
    command = "/opt/torque/bin/qstat -Q"
    # Command outputs the status of the queues in the following format
    # Queue              Max   Tot   Ena   Str   Que   Run   Hld   Wat   Trn   Ext T   Cpt
    # ----------------   ---   ----  ---   ---   ---   ---   ---   ---   ---   --- -   ---
    # batch                0     24  yes   yes    24     0     0     0     0     0 E     0
    # test1                0     26  yes   yes    26     0     0     0     0     0 E     0
    try:
        output = check_command_output(command, log)
        lines = list(filter(None, output.split("\n")))
        if len(lines) < 3:
            log.error("Unable to check pending jobs. The command '%s' does not return a valid output", command)
            raise CriticalError

        pending = 0
        for idx, line in enumerate(lines):
            # Skip the two header lines, then sum the Que column (index 5) across all queues.
            if idx < 2:
                continue
            queue_status = line.split()
            pending += int(queue_status[5])

        has_pending = pending > 0
        error = False
    except (subprocess.CalledProcessError, CriticalError):
        error = True
        has_pending = False

    return has_pending, error

def wakeupSchedOn(hostname):
    log.info('Waking up scheduler on host %s', hostname)
    command = ("/opt/torque/bin/pbsnodes -x %s" % (hostname))

    sleep_time = 3
    times = 20
    host_state = None
    while isHostInitState(host_state) and times > 0:
        try:
            output = check_command_output(command, log)
            # Ex.1: <Data><Node><name>ip-10-0-76-39</name><state>down,offline,MOM-list-not-sent</state><power_state>Running</power_state>
            #       <np>1</np><ntype>cluster</ntype><mom_service_port>15002</mom_service_port><mom_manager_port>15003</mom_manager_port></Node></Data>
            # Ex.2: <Data><Node><name>ip-10-0-76-39</name><state>free</state><power_state>Running</power_state><np>1</np><ntype>cluster</ntype>
            #       <status>rectime=1527799181,macaddr=02:e4:00:b0:b1:72,cpuclock=Fixed,varattr=,jobs=,state=free,netload=210647044,gres=,loadave=0.00,
            #       ncpus=1,physmem=1017208kb,availmem=753728kb,totmem=1017208kb,idletime=856,nusers=1,nsessions=1,sessions=19698,
            #       uname=Linux ip-10-0-76-39 4.9.75-25.55.amzn1.x86_64 #1 SMP Fri Jan 5 23:50:27 UTC 2018 x86_64,opsys=linux</status>
            #       <mom_service_port>15002</mom_service_port><mom_manager_port>15003</mom_manager_port></Node></Data>
            xmlnode = ElementTree.XML(output)
            host_state = xmlnode.findtext("./Node/state")
        except Exception:
            log.error("Error parsing XML from %s" % output)

        if isHostInitState(host_state):
            log.debug("Host %s is still in state %s" % (hostname, host_state))
            time.sleep(sleep_time)
            times -= 1

    if host_state == "free":
        command = "/opt/torque/bin/qmgr -c \"set server scheduling=true\""
        run_command(command, log, raise_on_error=False)
    elif times == 0:
        log.error("Host %s is still in state %s" % (hostname, host_state))
    else:
        log.debug("Host %s is in state %s" % (hostname, host_state))

def get_busy_nodes():
    command = "/opt/torque/bin/pbsnodes -x"
    # The output of the command:
    # <?xml version="1.0" encoding="UTF-8"?>
    # <Data>
    #    <Node>
    #       <name>ip-172-31-11-1</name>
    #       <state>down</state>
    #       <power_state>Running</power_state>
    #       <np>1000</np>
    #       <ntype>cluster</ntype>
    #       <jobs>"job-id"</jobs>
    #       <note>MasterServer</note>
    #       <mom_service_port>15002</mom_service_port>
    #       <mom_manager_port>15003</mom_manager_port>
    #    </Node>
    # </Data>
    _output = check_command_output(command)
    root = ElementTree.fromstring(_output)
    count = 0
    # See how many nodes have jobs
    for node in root.findall("Node"):
        if len(node.findall("jobs")) != 0:
            count += 1
    return count

def get_required_nodes(instance_properties, max_size):
    command = "/opt/torque/bin/qstat -at"
    # Example output of torque
    #                                                                                  Req'd     Req'd      Elap
    # Job ID                  Username    Queue    Jobname          SessID  NDS   TSK  Memory    Time     S Time
    # ----------------------- ----------- -------- ---------------- ------ ----- ----- --------- -------- - ---------
    # 0.ip-172-31-11-1.ec2.i  centos      batch    job.sh             5343     5    30    --     01:00:00 Q 00:04:58
    # 1.ip-172-31-11-1.ec2.i  centos      batch    job.sh             5340     3     6    --     01:00:00 R 00:08:14
    # 2.ip-172-31-11-1.ec2.i  centos      batch    job.sh             5387     2     4    --     01:00:00 R 00:08:27
    status = ["Q"]
    _output = check_command_output(command)
    output = _output.split("\n")[5:]

    slots_requested = []
    nodes_requested = []
    for line in output:
        line_arr = line.split()
        if len(line_arr) >= 10 and line_arr[9] in status:
            # if a job has been looked at to account for pending nodes, don't look at it again
            slots_requested.append(int(line_arr[6]))
            nodes_requested.append(int(line_arr[5]))

    return get_optimal_nodes(nodes_requested, slots_requested, instance_properties)

def check_sge_command_output(command, log):
    """
    Execute SGE shell command, by exporting the appropriate environment.

    :param command: command to execute
    :param log: logger
    :raise: subprocess.CalledProcessError if the command fails
    """
    command = _prepend_sge_bin_dir(command)
    return check_command_output(command, log, SGE_ENV)

def hasJobs(hostname):
    # Slurm won't use FQDN
    short_name = hostname.split(".")[0]
    # Checking for running jobs on the node
    command = ["/opt/slurm/bin/squeue", "-w", short_name, "-h"]
    try:
        output = check_command_output(command)
        has_jobs = output != ""
    except subprocess.CalledProcessError:
        has_jobs = False

    return has_jobs

def hasJobs(hostname):
    # Slurm won't use FQDN
    short_name = hostname.split('.')[0]
    # Checking for running jobs on the node
    command = ['/opt/slurm/bin/squeue', '-w', short_name, '-h']
    try:
        output = check_command_output(command, log)
        has_jobs = output != ""
    except subprocess.CalledProcessError:
        has_jobs = False

    return has_jobs

def check_sge_command_output(command, raise_on_error=True):
    """
    Execute SGE shell command, by exporting the appropriate environment.

    :param command: command to execute
    :param raise_on_error: if True the method raises subprocess.CalledProcessError on errors
    :raise subprocess.CalledProcessError if the command fails
    :return the stdout and stderr of the executed command.
    """
    command = _prepend_sge_bin_dir(command)
    return check_command_output(command, SGE_ENV, raise_on_error=raise_on_error)

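# NOTE: _prepend_sge_bin_dir, used by both check_sge_command_output variants above, is not part
# of this collection. Below is a minimal sketch under the assumption that an SGE_BIN_DIR constant
# holds the SGE binaries path and that commands are passed either as strings or argument lists;
# the real helper may differ.
def _prepend_sge_bin_dir(command):
    if isinstance(command, str):
        # String command: prefix the whole command line with the SGE bin directory.
        return SGE_BIN_DIR + command
    # Argument-list command: only the executable (first element) needs the prefix.
    return [SGE_BIN_DIR + command[0]] + list(command[1:])
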
def get_jobs_info(job_state_filter=None):
    """
    Retrieve the list of submitted jobs.

    :param job_state_filter: filter jobs by the given state
    :return: a list of SlurmJob objects representing the submitted jobs.
    """
    command = "/opt/slurm/bin/squeue -r -o '%i|%t|%D|%C|%c|%r'"
    if job_state_filter:
        command += " --states {0}".format(job_state_filter)

    output = check_command_output(command)
    return SlurmJob.from_table(output)

def has_jobs(hostname):
    # Slurm won't use FQDN
    short_name = hostname.split(".")[0]
    # Checking for running jobs on the node
    command = ["/opt/slurm/bin/squeue", "-w", short_name, "-h"]
    try:
        output = check_command_output(command)
        logging.info("Found the following running jobs:\n%s", output.rstrip())
        has_jobs = output != ""
    except subprocess.CalledProcessError:
        has_jobs = False

    return has_jobs

def get_busy_nodes(instance_properties):
    command = "/opt/slurm/bin/sinfo -r -h -o '%D %t'"
    # Sample output:
    # 2 mix
    # 4 alloc
    # 10 idle
    output = check_command_output(command, log)
    nodes = 0
    output = output.split("\n")
    for line in output:
        line_arr = line.split()
        if len(line_arr) == 2 and (line_arr[1] in ['mix', 'alloc', 'drain', 'drain*']):
            nodes += int(line_arr[0])
    return nodes

def get_busy_nodes():
    command = "/opt/slurm/bin/sinfo -h -o '%D %t'"
    # Sample output:
    # 2 mix
    # 4 alloc
    # 10 idle
    # 1 down*
    output = check_command_output(command)
    logging.info("Found the following compute nodes:\n%s", output.rstrip())
    nodes = 0
    output = output.split("\n")
    for line in output:
        line_arr = line.split()
        if len(line_arr) == 2 and (line_arr[1] in ["mix", "alloc", "down", "down*"]):
            nodes += int(line_arr[0])
    return nodes

def hasPendingJobs():
    command = "/opt/slurm/bin/squeue -t PD --noheader -o '%r'"
    # Command outputs the pending jobs in the queue in the following format
    # Resources
    # Priority
    # PartitionNodeLimit
    try:
        output = check_command_output(command, log)
        has_pending = len(list(filter(lambda reason: reason in PENDING_RESOURCES_REASONS, output.split("\n")))) > 0
        error = False
    except subprocess.CalledProcessError:
        error = True
        has_pending = False

    return has_pending, error

def is_node_down():
    """Check if node is down according to scheduler."""
    try:
        # Retrieves the state of a specific node
        # https://slurm.schedmd.com/sinfo.html#lbAG
        # Output format:
        # down*
        command = "/bin/bash -c \"/opt/slurm/bin/sinfo --noheader -o '%T' -n $(hostname)\""
        output = check_command_output(command).strip()
        log.info("Node is in state: '{0}'".format(output))
        if output and all(state not in output for state in ["down", "drained", "fail"]):
            return False
    except Exception as e:
        log.error("Failed when checking if node is down with exception %s. Reporting node as down.", e)

    return True

def get_busy_nodes():
    command = "/opt/slurm/bin/sinfo -h -o '%D %t'"
    # Sample output:
    # 2 mix
    # 4 alloc
    # 10 idle
    # 1 down*
    output = check_command_output(command)
    nodes = 0
    output = output.split("\n")
    for line in output:
        line_arr = line.split()
        if len(line_arr) == 2 and (line_arr[1] in ["mix", "alloc", "drain", "drain*", "down", "down*"]):
            nodes += int(line_arr[0])
    return nodes

def is_node_down():
    """Check if node is down according to scheduler."""
    try:
        hostname = check_command_output("hostname").strip()
        node = get_compute_nodes_info(hostname_filter=[hostname]).get(hostname)
        if node:
            log.info("Node is in state: '{0}'".format(node.state))
            if all(error_state not in node.state for error_state in TORQUE_NODE_ERROR_STATES):
                return False
        else:
            log.warning("Node is not attached to scheduler. Reporting as down")
    except Exception as e:
        log.error("Failed when checking if node is down with exception %s. Reporting node as down.", e)

    return True

def hasPendingJobs():
    command = "/opt/slurm/bin/squeue -t PD --noheader -o '%r'"
    # Command outputs the pending jobs in the queue in the following format
    # Resources
    # Priority
    # PartitionNodeLimit
    try:
        output = check_command_output(command, log)
        has_pending = len(list(filter(lambda reason: reason in PENDING_RESOURCES_REASONS, output.split("\n")))) > 0
        error = False
    except subprocess.CalledProcessError:
        error = True
        has_pending = False

    return has_pending, error

def get_compute_nodes_info(hostname_filter=None):
    command = TORQUE_BIN_DIR + "pbsnodes -x"
    if hostname_filter:
        command += " {0}".format(" ".join(hostname_filter))

    output = check_command_output(command, raise_on_error=False)
    if output.startswith("<Data>"):
        root = ElementTree.fromstring(output)
        nodes = root.findall("./Node")
        nodes_list = [TorqueHost.from_xml(ElementTree.tostring(node)) for node in nodes]
        return dict((node.name, node) for node in nodes_list if node.note != "MasterServer")
    else:
        if output != "":
            logging.warning("Failed when running command %s with error %s", command, output)
        return dict()

def get_jobs_info(filter_by_states=None, filter_by_exec_hosts=None):
    command = TORQUE_BIN_DIR + "qstat -t -x"
    output = check_command_output(command)
    if not output:
        return []

    root = ElementTree.fromstring(output)
    jobs = root.findall("./Job")
    jobs_list = []
    for job in jobs:
        parsed_job = TorqueJob.from_xml(ElementTree.tostring(job))
        if filter_by_states and parsed_job.state not in filter_by_states:
            continue
        if filter_by_exec_hosts:
            if any(host in parsed_job.exec_hosts for host in filter_by_exec_hosts):
                jobs_list.append(parsed_job)
        else:
            jobs_list.append(parsed_job)

    return jobs_list

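# Hypothetical usage sketch (not part of the original code): check whether any job is running on a
# given host, relying only on the parameters and attributes referenced above. "R" is the standard
# Torque state code for running jobs; the helper name and surrounding context are assumptions.
def _host_has_running_jobs(hostname):
    return len(get_jobs_info(filter_by_states=["R"], filter_by_exec_hosts=[hostname])) > 0
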
def get_required_nodes(instance_properties):
    log.info("Computing number of required nodes for submitted jobs")
    command = "/opt/slurm/bin/squeue -r -h -o '%i-%t-%D-%C-%r'"
    # Example output of squeue
    # 1-PD-1-24-Nodes required for job are DOWN, DRAINED or reserved for jobs in higher priority partitions
    # 2-PD-1-24-Licenses
    # 3-PD-1-24-PartitionNodeLimit
    # 4-R-1-24-
    output = check_command_output(command, log)
    slots_requested = []
    nodes_requested = []
    output = output.split("\n")
    for line in output:
        line_arr = line.split("-")
        if len(line_arr) == 5 and line_arr[1] == 'PD':
            if line_arr[4] in PENDING_RESOURCES_REASONS:
                slots_requested.append(int(line_arr[3]))
                nodes_requested.append(int(line_arr[2]))
            else:
                log.info("Skipping pending job %s due to pending reason: %s", line_arr[0], line_arr[4])

    return get_optimal_nodes(nodes_requested, slots_requested, instance_properties)

def get_required_nodes(instance_properties):
    command = "/opt/torque/bin/qstat -at"
    # Example output of torque
    #                                                                                  Req'd     Req'd      Elap
    # Job ID                  Username    Queue    Jobname          SessID  NDS   TSK  Memory    Time     S Time
    # ----------------------- ----------- -------- ---------------- ------ ----- ----- --------- -------- - ---------
    # 0.ip-172-31-11-1.ec2.i  centos      batch    job.sh             5343     5    30    --     01:00:00 Q 00:04:58
    # 1.ip-172-31-11-1.ec2.i  centos      batch    job.sh             5340     3     6    --     01:00:00 R 00:08:14
    # 2.ip-172-31-11-1.ec2.i  centos      batch    job.sh             5387     2     4    --     01:00:00 R 00:08:27
    status = ['Q']
    _output = check_command_output(command, log)
    output = _output.split("\n")[5:]

    slots_requested = []
    nodes_requested = []
    for line in output:
        line_arr = line.split()
        if len(line_arr) >= 10 and line_arr[9] in status:
            # if a job has been looked at to account for pending nodes, don't look at it again
            slots_requested.append(int(line_arr[6]))
            nodes_requested.append(int(line_arr[5]))

    return get_optimal_nodes(nodes_requested, slots_requested, instance_properties)

def _qmgr_manage_nodes(operation, hosts, error_messages_to_ignore, additional_qmgr_args=""):
    if not hosts:
        return set()

    hostnames = ",".join(hosts)
    command = TORQUE_BIN_DIR + 'qmgr -c "{operation} node {hostnames} {additional_args}"'.format(
        operation=operation, hostnames=hostnames, additional_args=additional_qmgr_args
    )
    try:
        output = check_command_output(command, log_error=False)
    except subprocess.CalledProcessError as e:
        if not hasattr(e, "output") or not e.output or e.output == "":
            logging.error("Failed when executing operation %s on nodes %s with error %s", operation, hostnames, e)
            return set()
        else:
            output = e.output
    except Exception as e:
        logging.error("Failed when executing operation %s on nodes %s with error %s", operation, hostnames, e)
        return set()

    succeeded_hosts = set(hosts)
    # analyze command output to understand if failure can be ignored (e.g. already existing node)
    for error_message in output.splitlines():
        match = re.match(r"qmgr obj=(?P<host>.*) svr=default: (?P<error>.*)", error_message)
        if not match:
            # assume unexpected error and mark all as failed
            logging.error("Failed when executing operation %s on nodes %s with error %s", operation, hostnames, output)
            return set()

        host, error = match.groups()
        if any(error.strip() == message_to_ignore for message_to_ignore in error_messages_to_ignore):
            logging.warning(
                "Marking %s operation on node %s as succeeded because of ignored error message %s",
                operation,
                host,
                error,
            )
            continue

        try:
            logging.error("Failed when executing operation %s on node %s with error %s", operation, host, error_message)
            succeeded_hosts.remove(host)
        except Exception as e:
            logging.error(
                "Failed to extract host from error message while adding nodes. Mark all as failed. Output was %s.\n"
                "Exception was %s",
                output,
                e,
            )

    return succeeded_hosts

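# Hypothetical usage sketch (not part of the original code): thin wrappers that call
# _qmgr_manage_nodes to register or remove compute nodes. The wrapper names, the ignored
# error messages, and the np argument are illustrative assumptions only; "create node" and
# "delete node" are standard qmgr operations.
def add_nodes(hosts, slots):
    return _qmgr_manage_nodes(
        operation="create",
        hosts=hosts,
        error_messages_to_ignore=["Node name already exists"],  # assumed benign qmgr message
        additional_qmgr_args="np={0}".format(slots),
    )


def remove_nodes(hosts):
    return _qmgr_manage_nodes(
        operation="delete",
        hosts=hosts,
        error_messages_to_ignore=["Unknown node"],  # assumed benign qmgr message
    )
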