def parse_slurm_conf(slurm_conf_file, part=None):
    """Collect node names from a Slurm configuration file.

    Lines starting with ``#`` are skipped. When ``part`` is empty, every
    ``NodeName=`` entry is expanded and added to the result. When ``part`` is
    given, only the ``Nodes=`` value of ``PartitionName=`` lines whose
    partition name is in ``part`` is expanded; scanning stops after the first
    match if ``part`` holds exactly one entry.

    :param slurm_conf_file: path of the Slurm configuration file.
    :param part: optional collection of partition names to restrict to.
    :return: list of expanded node names; empty when the file cannot be
        opened (an error line is printed in that case).
    """
    cluster = []
    # Raw strings keep ``\S`` from being treated as a string escape.
    r_part = re.compile(r'PartitionName=(\S+).*?Nodes=(\S+)')
    r_node = re.compile(r"(?<=NodeName=)\S+")
    try:
        # The context manager closes the file even when we break out early.
        with open(slurm_conf_file, 'r') as conf:
            for line in conf:
                if line[0] == '#':
                    continue
                m = r_node.search(line)
                if m:
                    if not part:
                        cluster.extend(hostlist.expand_hostlist(m.group(0)))
                elif part:
                    m = r_part.search(line)
                    if m and m.group(1) in part:
                        cluster.extend(hostlist.expand_hostlist(m.group(2)))
                        if len(part) == 1:
                            # Only one partition requested and it was found.
                            break
    except IOError:
        # Single-argument print call: valid on both Python 2 and Python 3.
        print("Error: cannot open Slurm conf file at %s" % (slurm_conf_file,))
    return cluster
def parse_slurm_conf(slurm_conf_file, part, node_dict):
    """Collect node names from a Slurm configuration file.

    Lines starting with ``#`` are skipped. Every ``NodeName=`` line is
    expanded and handed to ``parse_slurm_conf_cores`` together with
    ``node_dict``; its nodes are added to the result only when ``part`` is
    empty. When ``part`` is given, the ``Nodes=`` value of ``PartitionName=``
    lines whose partition name is in ``part`` is expanded instead, and
    scanning stops after the first match if ``part`` holds exactly one entry.

    :param slurm_conf_file: path of the Slurm configuration file.
    :param part: collection of partition names to restrict to (may be empty).
    :param node_dict: object passed through to ``parse_slurm_conf_cores``.
    :return: list of expanded node names; empty when the file cannot be
        opened (an error line is printed in that case).
    """
    cluster = []
    # Raw strings keep ``\S`` from being treated as a string escape.
    r_part = re.compile(r'PartitionName=(\S+).*?Nodes=(\S+)')
    r_node = re.compile(r"(?<=NodeName=)\S+")
    try:
        # The context manager closes the file even when we break out early.
        with open(slurm_conf_file, 'r') as conf:
            for line in conf:
                if line[0] == '#':
                    continue
                m = r_node.search(line)
                if m:
                    exp_nodelist = hostlist.expand_hostlist(m.group(0))
                    parse_slurm_conf_cores(line, exp_nodelist, node_dict)
                    if not part:
                        cluster.extend(exp_nodelist)
                elif part:
                    m = r_part.search(line)
                    if m and m.group(1) in part:
                        cluster.extend(hostlist.expand_hostlist(m.group(2)))
                        if len(part) == 1:
                            break
    except IOError:
        # Single-argument print call: valid on both Python 2 and Python 3.
        print("Error: cannot open Slurm conf file at %s" % (slurm_conf_file,))
    return cluster
def create_cluster(self, name, names, ips, management):
    """Create and save a cluster together with its server objects.

    :param name: name of the cluster
    :param names: hostlist expression for the server names, e.g.
        'i[001-003]' yields i001, i002, i003
    :param ips: hostlist expression for the server addresses, paired
        positionally with ``names``
    :param management: hostlist expression naming the management nodes;
        all other servers are tagged as compute nodes
    """
    hostnames = expand_hostlist(names)
    addresses = expand_hostlist(ips)
    managers = expand_hostlist(management)

    members = []
    for hostname, address in zip(hostnames, addresses):
        node = FabricServer(name=hostname, cluster=name, ip=address)
        node.tags = ["manage"] if hostname in managers else ["compute"]
        self.stamp()
        node.save(cascade=True)
        members.append(node)

    fabric = FabricCluster(name=name, cluster=name, definition=names)
    fabric.servers = members
    self.stamp()
    fabric.save(cascade=True)
def expand_names(name_spec):
    """Expand one hostlist expression, or a sequence of them, into names.

    :param name_spec: a hostlist string, or a non-string sequence of
        hostlist strings.
    :return: list of expanded names; for a sequence, the expansions are
        concatenated in order.
    """
    # ``collections.Sequence`` was removed in Python 3.10; the ABC lives in
    # ``collections.abc`` (fall back for interpreters that predate it).
    try:
        from collections.abc import Sequence
    except ImportError:
        from collections import Sequence

    if isinstance(name_spec, str) or not isinstance(name_spec, Sequence):
        return hostlist.expand_hostlist(name_spec)

    names = []
    for name in name_spec:
        names.extend(hostlist.expand_hostlist(name))
    return names
def validate(self):
    """Check that the requested node specification lies inside the cluster.

    Expands the selected cluster's ``definition`` and this form's
    ``nodespec`` value and reports whether the latter is a non-empty subset
    of the former. The outcome is also printed.

    :return: True when at least one node was requested and every requested
        node belongs to the cluster definition, otherwise False.
    """
    cluster = inventory.get("cluster", self.cluster.data)
    possibilities = expand_hostlist(cluster.definition)
    choice = expand_hostlist(self.nodespec.data)
    # An empty selection is never valid.
    ok = bool(choice) and set(choice).issubset(possibilities)
    # Single-argument print call: valid on both Python 2 and Python 3.
    print("Validate %s %s" % (ok, choice))
    return ok
def returnjobinfo(jobid):
    """Build a dictionary describing one Slurm job.

    Runs ``scontrol show -d --oneliner job <jobid>`` and, when that prints
    anything, parses its whitespace-separated ``key=value`` tokens. When it
    prints nothing, falls back to ``sacct`` (searching from yesterday onward)
    and maps the first data row onto the header row.

    :param jobid: job identifier; converted with ``str()`` and interpolated
        into the command lines.
    :return: dict of job fields, always including ``GANGLIA``; the
        ``expanded_nodelist`` key is added for non-pending and sacct results.
    """
    global GANGLIA, GANGLIA_PROC
    t0 = time.time()
    # print jobid
    # NOTE(review): jobid is concatenated into a shell command line; confirm
    # callers only pass trusted identifiers.
    s = hide_usernames(
        os.popen("scontrol show -d --oneliner job " + str(jobid)).read()).split()
    print("slurm response took ", time.time() - t0)
    if s:
        # Turn every "key=value" token into a dict entry; tokens without
        # "=" are ignored.
        j = dict()
        for x in s:
            y = x.split('=', 1)
            if len(y) == 2:
                j[y[0]] = y[1]
        # Raises KeyError when the scontrol output has no JobState token.
        if not j['JobState'] == 'PENDING':
            cpu_mapping = list()
            h = ['Nodes', 'CPU_IDs', 'Mem']
            nodelist = ""
            for i, n in enumerate(s):
                # Relies on "Nodes=" being directly followed by the CPU_IDs
                # and Mem tokens in the scontrol output.
                if n.startswith("Nodes="):
                    cpu_mapping.append([s[i].replace('Nodes=', ''),
                                        s[i + 1].replace('CPU_IDs=', ''),
                                        s[i + 2].replace('Mem=', '')])
                if n.startswith("NodeList="):
                    nodelist = n.replace("NodeList=", "")
            j['cpu_mapping'] = {'headers': h, 'nodes': cpu_mapping}
            j['expanded_nodelist'] = list(
                map(str.strip, expand_hostlist(nodelist)))
            if 'NodeList' in j:
                j['nodeinfo'] = returnnodeinfo(j['NodeList'])['nodeinfo']
            elif 'Nodes' in j:
                j['nodeinfo'] = returnnodeinfo(j['Nodes'])['nodeinfo']
            # Per-process details are fetched only when both flags equal 1.
            if GANGLIA == 1 and GANGLIA_PROC == 1:
                t_procs0 = time.time()
                j['procs'] = get_procs(j['expanded_nodelist'])
                print("get procs took", time.time() - t_procs0)
    else:
        # Not an active job: fetch the finished-job statistics instead.
        # These statistics have a different format; it is left to the
        # client side to deal with that.
        yesterday = (dt.datetime.today() - dt.timedelta(1)).strftime("%Y-%m-%d")
        sacct = "sacct -X --format=jobid,jobname,user,account,state,elapsed,submit,start,end,nnodes,ncpus,reqnodes,reqcpus,nodelist --parsable2 -S %s --job %s"
        s = hide_usernames(run_slurmcommand(sacct % (yesterday, jobid)))
        t = io.StringIO(s)
        reader = csv.reader(t, delimiter='|')
        # First row is the header, second row the job itself; next() raises
        # StopIteration when sacct returned fewer rows.
        headers = list(map(convert, next(reader)))
        jobinfo = list(map(convert, next(reader)))
        j = dict(list(zip(headers, jobinfo)))
        j['expanded_nodelist'] = list(
            map(str.strip, expand_hostlist(j["NodeList"])))
    j['GANGLIA'] = GANGLIA
    # print j
    print("jobinfo", time.time() - t0)
    return j
def main(argv):
    """Query the load of every cluster node and print a report.

    Options (parsed with ``getopt``):
      -n LIST   hostlist expression of nodes to query
      -s FILE   Slurm configuration file used when -n is not given
      -l LOAD   load threshold to report (must not be negative)
      -c STR    community string
      -p N      number of parallel workers
      -g        print top user on each overused system
      -L        invert load comparison
      -h        show usage and exit

    :param argv: command-line arguments without the program name.
    """
    global cstring
    # ``reduce`` is a builtin only on Python 2; functools has it on both.
    from functools import reduce

    # Initialize empty cluster list
    cluster_list = []
    # Default slurm configuration
    slurm_conf = "/etc/slurm/slurm.conf"
    # Default load threshold to report
    load_thr = 8.0
    guilty = False
    reverse = False
    # Default amount of parallelism
    workers = 8

    try:
        opts, _ = getopt.getopt(argv, "n:l:c:s:p:Lgh")
    except getopt.GetoptError:
        usage()
        sys.exit()

    # Compare with ``==``: ``opt in ("-h")`` is a substring test on a plain
    # string, not a membership test on a tuple.
    for opt, arg in opts:
        if opt == "-h":
            usage()
            sys.exit()
        elif opt == "-n":
            # override default cluster file
            cluster_list = hostlist.expand_hostlist(arg)
        elif opt == "-s":
            # override default Slurm config file name
            slurm_conf = arg
        elif opt == "-l":
            # override default load level
            load_thr = float(arg)
            if load_thr < 0:
                print("Error: invalid load level - " + arg + "!")
                sys.exit()
        elif opt == "-c":
            # override default community string
            cstring = arg
        elif opt == "-g":
            # print top user on each overused system
            guilty = True
        elif opt == "-L":
            # invert load comparison
            reverse = True
        elif opt == "-p":
            # override default number of parallel workers
            workers = int(arg)

    if not cluster_list:
        cluster_list = parse_slurm_conf(slurm_conf)
    if not cluster_list:
        print("Error: no cluster nodes specified!")
        sys.exit()

    pool = Pool(workers)
    results = pool.map(query_node, cluster_list)
    pool.close()

    # sort results by load value and pass to report printer
    print_report(sorted(results, key=itemgetter(1), reverse=True),
                 reduce(max_name, results, 0), load_thr, guilty, reverse)
def listPBSnodes(hostExpression):
    """Return the names of the nodes that ``pbsnodes -x`` reports.

    The host expression is expanded, the resulting names are passed to
    ``pbsnodes -x`` and the name of every ``Node`` entry in the parsed output
    is returned. No filtering on node state, jobs or messages is applied.

    Example:

        listPBSnodes('node01[05-11],node02[01-80]')

    :param hostExpression: hostlist expression of the nodes to query.
    :return: list of node names found in the pbsnodes output.
    """
    nodeNames = expand_hostlist(hostExpression)
    # Pass an argument vector instead of a shell string so that host names
    # are never interpreted by a shell.
    pbsOut = check_output(["pbsnodes", "-x"] + list(nodeNames))
    nodes = xml2obj(pbsOut)
    return [node["name"] for node in nodes["Node"]]
def setup_servers(self, protocol):
    """Start the distributed server for multi-node runs and record targets.

    With more than one node, the host list comes from ``SLURM_NODELIST`` and
    this task's index from ``SLURM_PROCID``; a server is created on port
    2222 of every host. Tasks other than task 0 call ``utils.join_tasks`` and
    then exit. With a single node, a local setup with an empty session target
    is used. In all surviving cases the parsed arguments are printed and
    ``session_target``, ``task_index`` and ``hostlist`` are stored on
    ``self``.

    :param protocol: optional protocol suffix; 'grpc+<protocol>' is used when
        given, plain 'grpc' otherwise.
    """
    if self.num_nodes > 1:
        from hostlist import expand_hostlist
        task_index = int(os.environ['SLURM_PROCID'])
        # Named ``hosts`` so the local does not shadow the hostlist module.
        hosts = expand_hostlist(os.environ['SLURM_NODELIST'])
        hosts_w_port = ["%s:2222" % host for host in hosts]
        cluster = tf.train.ClusterSpec(
            {"localhost": hosts_w_port}).as_cluster_def()
        protocol = f'grpc+{protocol}' if protocol else 'grpc'
        server = tf.distribute.Server(cluster, job_name="localhost",
                                      task_index=task_index,
                                      protocol=protocol)
        session_target = server.target
        if task_index != 0:
            utils.join_tasks(task_index, hosts)
            # ``quit`` is an interactive helper installed by ``site``;
            # raising SystemExit ends the process without relying on it.
            raise SystemExit
    else:
        task_index = 0
        hosts = ['localhost']
        session_target = ''

    # Plain loop: a comprehension built only for its side effects would
    # allocate a throwaway list of None.
    for arg, val in vars(self.args).items():
        print(f'{arg} : {val}')

    self.session_target = session_target
    self.task_index = task_index
    self.hostlist = hosts
def check_nodes_computesets(clusterid, computenodeids):
    """Report whether the given nodes are free or allocated in a cluster.

    Collects the names of all compute nodes that belong to a computeset of
    ``clusterid`` whose state is in ``Cluster.ACTIVE_COMPUTESETS`` and
    compares them with the expanded ``computenodeids``.

    :param clusterid: cluster whose computesets are inspected.
    :param computenodeids: hostlist expression of the nodes to check.
    :return: ``[nodes_free, nodes_allocated]`` where ``nodes_allocated`` is
        True when every requested node is in an active computeset, and
        ``nodes_free`` is True when not all are allocated and none of them
        is in an active computeset.
    """
    hosts_param_set = set(hostlist.expand_hostlist(computenodeids))

    # Names of all nodes held by active computesets of this cluster.
    allhosts = set()
    for computeset in Comet.get_computeset():
        if (computeset["cluster"] == clusterid
                and computeset["state"] in Cluster.ACTIVE_COMPUTESETS):
            for compute in computeset["computes"]:
                allhosts.add(compute["name"])

    nodes_allocated = hosts_param_set <= allhosts
    # An empty request counts as allocated (empty set is a subset), hence
    # the explicit "not nodes_allocated" guard.
    nodes_free = (not nodes_allocated
                  and hosts_param_set.isdisjoint(allhosts))
    return [nodes_free, nodes_allocated]
def generate(self, nodes, prefix, topology=None, test_name=None):
    """Write one job script and node list per node under ``prefix``.

    Only nodes that appear both in ``bnc.config['nodes']`` and in ``nodes``
    get a test directory ``<prefix>/tests/<node>`` containing a rendered
    ``<node>.job`` script and a ``node_list`` file.

    :param nodes: candidate node names (error/excluded nodes already removed
        by the caller).
    :param prefix: directory below which the ``tests`` tree is created.
    :param topology: ignored; a log line is emitted when it is given.
    :param test_name: not used by this method.
    """
    if topology:
        self.logger.info('node: ignoring topology (not used)')

    # Don't include error/excluded nodes
    node_set = set(hostlist.expand_hostlist(bnc.config['nodes']))
    node_set &= set(nodes)

    for node in node_set:
        test_dir = os.path.join(prefix, "tests", node)
        bench.util.mkdir_p(test_dir)

        script_file = os.path.join(test_dir, '{0}.job'.format(node))
        with open(script_file, 'w') as fp:
            fp.write(self.TEMPLATE.render(
                job_name='bench-node-{0}'.format(node),
                modules=" ".join(bnc.config['modules']),
                node_name=node,
                linpack_path=bnc.config['linpack_path'],
                stream_path=bnc.config['stream_path'],
            ))

        node_list_file = os.path.join(test_dir, 'node_list')
        bench.util.write_node_list(node_list_file, [node])

    # NOTE(review): this logs the size of the input, not of the filtered
    # ``node_set`` that was actually written - confirm which is intended.
    self.logger.info('node: add: {0}'.format(len(nodes)))
def get_partition_nodes(partition):
    """Yield the node names listed for a Slurm partition.

    Runs ``scontrol show partition <partition>``, applies ``NODES_P`` to
    every output line and yields each name from the expanded first capture
    group. The process is waited for once all output has been consumed.

    :param partition: partition name inserted into the scontrol command.
    """
    proc = scontrol('show partition {0}'.format(partition))
    for text in proc.stdout:
        for found in NODES_P.finditer(text):
            expanded = hostlist.expand_hostlist(found.group(1))
            for name in expanded:
                yield name
    proc.wait()
def main(argv):
    """Query the load of every cluster node and print a report.

    Options (parsed with ``getopt``):
      -n LIST   hostlist expression of nodes to query
      -s FILE   Slurm configuration file used when -n is not given
      -l LOAD   load threshold to report (must not be negative)
      -c STR    community string
      -p N      number of parallel workers
      -g        print top user on each overused system
      -L        invert load comparison
      -h        show usage and exit

    :param argv: command-line arguments without the program name.
    """
    global cstring
    # ``reduce`` is a builtin only on Python 2; functools has it on both.
    from functools import reduce

    # Initialize empty cluster list
    cluster_list = []
    # Default slurm configuration
    slurm_conf = "/etc/slurm/slurm.conf"
    # Default load threshold to report
    load_thr = 8.0
    guilty = False
    reverse = False
    # Default amount of parallelism
    workers = 8

    try:
        opts, _ = getopt.getopt(argv, "n:l:c:s:p:Lgh")
    except getopt.GetoptError:
        usage()
        sys.exit()

    # Compare with ``==``: ``opt in ("-h")`` is a substring test on a plain
    # string, not a membership test on a tuple.
    for opt, arg in opts:
        if opt == "-h":
            usage()
            sys.exit()
        elif opt == "-n":
            # override default cluster file
            cluster_list = hostlist.expand_hostlist(arg)
        elif opt == "-s":
            # override default Slurm config file name
            slurm_conf = arg
        elif opt == "-l":
            # override default load level
            load_thr = float(arg)
            if load_thr < 0:
                print("Error: invalid load level - " + arg + "!")
                sys.exit()
        elif opt == "-c":
            # override default community string
            cstring = arg
        elif opt == "-g":
            # print top user on each overused system
            guilty = True
        elif opt == "-L":
            # invert load comparison
            reverse = True
        elif opt == "-p":
            # override default number of parallel workers
            workers = int(arg)

    if not cluster_list:
        cluster_list = parse_slurm_conf(slurm_conf)
    if not cluster_list:
        print("Error: no cluster nodes specified!")
        sys.exit()

    pool = Pool(workers)
    results = pool.map(query_node, cluster_list)
    pool.close()

    # sort results by load value and pass to report printer
    print_report(sorted(results, key=itemgetter(1), reverse=True),
                 reduce(max_name, results, 0), load_thr, guilty, reverse)
def add(self, **kwargs):
    """Add or update inventory entries for the hosts named in ``host``.

    ``host`` is a hostlist expression; ``ip`` (optional) is expanded with
    ``Parameter.expand`` and paired positionally with the hosts. Each host's
    entry (a copy of ``self.entry`` when new) receives all keyword
    arguments, then its own ``ip`` and ``host`` values.

    Exits the process with status 1 when ``host`` is missing.

    :param kwargs: attributes to store; must contain ``host``.
    """
    if "host" not in kwargs:
        Console.error("no id specified")
        sys.exit(1)

    hosts = hostlist.expand_hostlist(kwargs['host'])
    ips = Parameter.expand(kwargs['ip']) if 'ip' in kwargs else None
    if ips is None:
        # No addresses given: pair every host with None.
        ips = [None] * len(hosts)

    for host, ip in zip(hosts, ips):
        if host not in self.data:
            self.data[host] = dict(self.entry)
        entry = self.data[host]
        entry.update(kwargs)
        # The per-host values override the raw expressions from kwargs.
        entry['ip'] = ip
        entry['host'] = host
def set(self, spec, attribute):
    """Apply ``attribute`` to every host named by a hostlist expression.

    For example, ``i[1-20]`` applies the attribute to the hosts i1 to i20.

    :param spec: hostlist expression selecting the hosts.
    :param attribute: value handed to ``_set`` for each host.
    """
    for target in expand_hostlist(spec):
        self._set(target, attribute)
def acct_reader(self, filename):
    """Read a ``|``-delimited accounting file into a list of job records.

    Rows are skipped when ``self.jobids`` is non-empty and does not contain
    the row's ``JobID``, or when ``NodeList`` is "None assigned".

    :param filename: path of the accounting file (first row is the header).
    :return: list of dicts, one per accepted job.
    """
    ftr = [3600, 60, 1]  # seconds per hour / minute / second field
    records = []
    with open(filename) as handle:
        for row in csv.DictReader(handle, delimiter='|'):
            if self.jobids and row['JobID'] not in self.jobids:
                continue
            if row['NodeList'] == "None assigned":
                continue

            record = {
                'id': row['JobID'],
                'user': row['User'],
                'project': row['Account'],
                'start_time': int(parse(row['Start']).strftime('%s')),
                'end_time': int(parse(row['End']).strftime('%s')),
                'queue_time': int(parse(row['Submit']).strftime('%s')),
                'queue': row['Partition'],
                'name': row['JobName'],
                'status': row['State'].split()[0],
                'nodes': int(row['NNodes']),
                'cores': int(row['ReqCPUS']),
                'host_list': hostlist.expand_hostlist(row['NodeList']),
            }

            # Timelimit is either "D-HH:MM:SS" or "HH:MM:SS".
            limit = row['Timelimit']
            if '-' in limit:
                days, clock = limit.split('-')
            else:
                clock = limit
                days = 0
            clock_seconds = sum(
                weight * int(field)
                for weight, field in zip(ftr, clock.split(":")))
            record['requested_time'] = (int(days) * 86400 + clock_seconds) / 60

            records.append(record)
    return records
def test_alltoall_switch_tests(self, _):
    """One all-to-all test per switch is generated, covering all nodes."""
    self.nodes = NODES
    node_list_path = os.path.join(self.directory, 'node_list')
    bench.util.write_node_list(node_list_path, self.nodes)
    bench.add.execute(self.directory, TOPOLOGY_FILE,
                      alltoall_switch_tests=True)

    tests_root = os.path.join(self.directory, 'alltoall-switch', 'tests')
    expected_switches = set((
        'hpcf-ib-rack1-u43',
        'hpcf-ib-rack1-u45',
        'hpcf-ib-rack1-u42',
        'hpcf-ib-rack1-u44',
        'hpcf-ib-rack1-u46',
    ))
    self.assertEqual(set(os.listdir(tests_root)), expected_switches)

    # Union of the node lists named in every generated job script.
    covered = set()
    for switch_name in expected_switches:
        job_path = os.path.join(tests_root, switch_name,
                                '{0}.job'.format(switch_name))
        with open(job_path) as job_file:
            found = NODELIST_P.search(job_file.read())
        covered |= set(hostlist.expand_hostlist(found.group(3)))
    self.assertEqual(covered, self.nodes)
def _detect_nodes(self): # see if we have a PBS_NODEFILE pbs_nodefile = os.environ.get('PBS_NODEFILE') slurm_nodelist = os.environ.get('SLURM_NODELIST') if pbs_nodefile is not None: # parse PBS the nodefile self._raw_nodes = [line.strip() for line in open(pbs_nodefile)] if self.log is not None: self.log.info( message="Found PBS_NODEFILE %s: %s" % (pbs_nodefile, self._raw_nodes), suffix=LOG_SUFFIX) elif slurm_nodelist is not None: # parse SLURM nodefile self._raw_nodes = hostlist.expand_hostlist(slurm_nodelist) if self.log is not None: self.log.info( message="Found SLURM_NODELIST %s. Expanded to: %s" % (slurm_nodelist, self._raw_nodes), suffix=LOG_SUFFIX) else: self._raw_nodes = ['localhost'] if self.log is not None: self.log.info( message="No PBS_NODEFILE or SLURM_NODELIST found. Using hosts: %s" % (self._raw_nodes), suffix=LOG_SUFFIX)
def test_exclude_file(self, _):
    """Nodes listed in an exclude file get no node test generated."""
    excluded = ['tnode0101', 'tnode0102', 'tnode0208', 'nonode0101']
    fail_nodes = os.path.join(self.directory, 'fail_nodes')
    with open(fail_nodes, 'w') as out:
        for name in excluded:
            out.write('{0}\n'.format(name))

    bench.add.execute(
        self.directory,
        topology_file=TOPOLOGY_FILE,
        node_tests=True,
        exclude_files=[fail_nodes],
    )

    tests_dir = os.path.join(self.directory, 'node', 'tests')
    expected_tests = set(self.nodes) - set(excluded)
    self.assertEqual(set(os.listdir(tests_dir)), expected_tests)

    # Every remaining node has a non-empty script that targets only itself.
    for name in expected_tests:
        job_path = os.path.join(tests_dir, name, '{0}.job'.format(name))
        self.assertNotEqual(os.stat(job_path).st_size, 0)
        with open(job_path) as job_file:
            found = NODELIST_P.search(job_file.read())
        self.assertEqual(
            set(hostlist.expand_hostlist(found.group(3))),
            set((name, )),
        )
def test_exclude_file(self, _):
    """Nodes listed in an exclude file get no node test generated."""
    excluded = ['tnode0101', 'tnode0102', 'tnode0208', 'nonode0101']
    fail_nodes = os.path.join(self.directory, 'fail_nodes')
    with open(fail_nodes, 'w') as out:
        for name in excluded:
            out.write('{0}\n'.format(name))

    bench.add.execute(
        self.directory,
        topology_file=TOPOLOGY_FILE,
        node_tests=True,
        exclude_files=[fail_nodes],
    )

    tests_dir = os.path.join(self.directory, 'node', 'tests')
    expected_tests = set(self.nodes) - set(excluded)
    self.assertEqual(set(os.listdir(tests_dir)), expected_tests)

    # Every remaining node has a non-empty script that targets only itself.
    for name in expected_tests:
        job_path = os.path.join(tests_dir, name, '{0}.job'.format(name))
        self.assertNotEqual(os.stat(job_path).st_size, 0)
        with open(job_path) as job_file:
            found = NODELIST_P.search(job_file.read())
        self.assertEqual(
            set(hostlist.expand_hostlist(found.group(3))),
            set((name, )),
        )
def read_data_from_yaml(self):
    """Read MAC address and BMC configuration from ``self.yaml_file``.

    For every cluster under the top-level ``inventory`` key, the cluster's
    common BMC settings are copied into each host of ``macaddr`` that falls
    inside the common ``range``; settings in a host's own ``bmc`` section
    override the common ones.

    :return: dict mapping cluster name to its (updated) ``macaddr`` data, or
        None when the YAML file yields no data.
    """
    data = read_yaml_config(self.yaml_file)
    result = None
    if data:
        result = {}
        data = data["inventory"]
        for cluster in data:
            cluster_data = data[cluster]
            if "bmc" in cluster_data and "common" in cluster_data["bmc"]:
                # process the common bmc data in cluster
                common_bmc_data = cluster_data["bmc"]["common"]
                # "range" is removed from the common data so it is not
                # copied into the per-host settings below.
                host_range = common_bmc_data.pop("range", None)
                # NOTE(review): host_range is None when "range" is absent -
                # confirm expand_hostlist accepts that.
                hosts = expand_hostlist(host_range)
            # NOTE(review): the original nesting is not recoverable from the
            # collapsed source; as laid out here ``hosts`` and
            # ``common_bmc_data`` carry over from a previous cluster (or are
            # unbound) when a cluster has no common bmc section - confirm.
            mac_data = cluster_data["macaddr"]
            for host in mac_data:
                if host in hosts:
                    temp_common_bmc_data = deepcopy(common_bmc_data)
                    if "bmc" in mac_data[host]:
                        # bmc config of an individual host has a higher
                        # priority than the common config
                        temp_common_bmc_data.update(mac_data[host]["bmc"])
                    mac_data[host]["bmc"] = temp_common_bmc_data
            result[cluster] = mac_data
    return result
def dict_key_list_table_printer(d, indexed=False):
    """Build a PrettyTable with one column per dict key.

    Accepts a dict of the form ``{key1: [list1], key2: [list2], ...}`` and
    produces a left-aligned table whose column ``key`` holds ``list``; short
    lists are padded with " " to the length of the longest one.

    :param d: dict mapping column titles to lists of cell values.
    :param indexed: when True, prepend an "index" column numbered from 1.
    :return: the populated PrettyTable.
    """
    x = PrettyTable()
    # Length of the longest column; 0 for an empty dict.
    longest = max([len(column) for column in d.values()] or [0])

    if indexed:
        if longest == 0:
            index_list = []
        else:
            index_list = hostlist.expand_hostlist(
                "[1-{0}]".format(str(longest)))
        x.add_column("index", index_list)

    # ``items()`` exists on both Python 2 and 3; ``iteritems()`` only on 2.
    for k, v in d.items():
        x.add_column(k, v + [" "] * (longest - len(v)))
    x.align = "l"
    return x
def display_provision_form():
    """Show the provisioning form and queue provisioning on submit.

    On a valid submission, a provisioning task is queued for every host of
    the expanded node specification and the client is redirected to the
    task overview. Otherwise the inventory is refreshed and the form is
    rendered (after flashing "Wrong submission").
    """
    clusters = cm_config_server().get("cloudmesh.server.provisioner.clusters")
    # clusters = ['india','bravo','sierra']
    # servers = n_inventory.hostlist(cluster)
    # server = n_inventory.host(name,auth=False)
    form = ProvisionForm(csrf=False)

    if form.validate_on_submit():
        flash("Success")
        # Single-argument print calls: valid on both Python 2 and Python 3.
        print("FORM")
        pprint(form.__dict__)
        print("CLUSTER %s" % (form.cluster.data,))
        print("Service %s" % (form.service.data,))
        hosts = expand_hostlist(form.nodespec.data)
        print("Nodespec %s" % (hosts,))
        for host in hosts:
            print("PROVISION HOST %s" % (host,))
            provision.delay(host, form.service.data)
        return redirect("provision/tasks/{0}/{1}/{2}".format(
            form.cluster.data, form.nodespec.data, form.service.data))
        # return redirect("/provision/summary/")
    else:
        flash("Wrong submission")

    inventory.refresh()
    return render_template("mesh/provision/provision.html",
                           clusters=clusters,
                           form=form)
def init_process(backend="nccl"):
    """Initialise the distributed process group for this rank.

    ``MASTER_PORT`` is 12345 plus the lowest GPU id in ``SLURM_STEP_GPUS``
    (12345 when the variable is unset or empty); ``MASTER_ADDR`` is the first
    host of ``SLURM_JOB_NODELIST`` (127.0.0.1 when unset). After the group
    is initialised, all ranks synchronise on a barrier and printing is
    silenced through ``silence_print``.

    :param backend: backend name passed to ``dist.init_process_group``.
    """
    print(f"Starting process with rank {ptu.dist_rank}...", flush=True)

    # The guard must test the same variable that is read: the original
    # checked "SLURM_STEPS_GPUS" but read "SLURM_STEP_GPUS".
    # NOTE(review): every rank must end up with the same MASTER_PORT;
    # confirm SLURM_STEP_GPUS is identical across the nodes of a job.
    step_gpus = os.environ.get("SLURM_STEP_GPUS")
    if step_gpus:
        # Compare numerically; min() over strings would rank "10" below "2".
        lowest_gpu = min(int(gpu_id) for gpu_id in step_gpus.split(","))
        os.environ["MASTER_PORT"] = str(12345 + lowest_gpu)
    else:
        os.environ["MASTER_PORT"] = str(12345)

    if "SLURM_JOB_NODELIST" in os.environ:
        hostnames = hostlist.expand_hostlist(os.environ["SLURM_JOB_NODELIST"])
        os.environ["MASTER_ADDR"] = hostnames[0]
    else:
        os.environ["MASTER_ADDR"] = "127.0.0.1"

    dist.init_process_group(
        backend,
        rank=ptu.dist_rank,
        world_size=ptu.world_size,
    )
    print(f"Process {ptu.dist_rank} is connected.", flush=True)
    dist.barrier()

    silence_print(ptu.dist_rank == 0)
    if ptu.dist_rank == 0:
        print("All processes are connected.", flush=True)
def GET(self, id, metric):
    """Return Graphite data for one metric across all nodes of a job.

    Looks up the job's node list and start/end times, requests the metric
    for those nodes from the Graphite render endpoint and returns a JSON
    object mapping node name to ``[timestamp * 1000, value]`` pairs.

    :param id: job id used in the database lookup.
    :param metric: metric name appended to the Graphite target.
    :return: JSON string.
    """
    rows = db.select(settings["SlurmClusterName"]+"_job_table",
                     what="nodelist, time_start, time_end",
                     where="id_job = $id_job",
                     vars={"id_job": id})
    job = rows[0]

    url = "%s/render" % (settings["GraphiteURL"])
    node_glob = ",".join(hostlist.expand_hostlist(job["nodelist"]))
    target = "%s{%s}.%s" % (settings["GraphitePrefix"], node_glob, metric)

    # Counter-style metrics are turned into per-second rates, except for
    # anything infiniband related.
    is_counter = (
        'cpu-' in metric
        or 'swap_io' in metric
        or metric.startswith('llite-')
        or metric.endswith(('tx', 'rx', 'read', 'write'))
    )
    if 'infiniband' not in metric and is_counter:
        target = "scaleToSeconds(derivative(%s),1)" % target

    time_format = "%H:%M_%Y%m%d"
    started = datetime.datetime.fromtimestamp(job["time_start"])
    ended = datetime.datetime.fromtimestamp(job["time_end"])
    data = urllib.urlencode({
        "target": target,
        "from": started.strftime(time_format),
        "until": ended.strftime(time_format),
        "format": "json",
        "maxDataPoints": 200})

    req = urllib2.Request(url, data)
    returned = urllib2.urlopen(req).read()
    graphiteData = json.loads(returned)

    ret = {}
    for series in graphiteData:
        # The node name is the path component right after the prefix.
        position = len(settings["GraphitePrefix"].split(".")) - 1
        node = series["target"].split(".")[position]
        ret[node] = [[point[1] * 1000, point[0]]
                     for point in series["datapoints"]]
    return json.dumps(ret)
def create(self, kind, subkind, nameregex):
    """Create and save one fabric object per expanded name.

    :param kind: "server", "service" or "cluster"; any other value logs an
        error and stops at the first name.
    :param subkind: stored on every created object.
    :param nameregex: hostlist expression, e.g.
        "india[9-11].futuregrid.org,india[01-02].futuregrid.org"
    """
    for name in expand_hostlist(nameregex):
        if kind not in ("server", "service", "cluster"):
            log.error(
                "kind is not defined, creation of objects failed, kind, nameregex")
            return

        if kind == "server":
            fabric_object = FabricServer(
                name=name,
                kind=kind,
                subkind=subkind,
            )
        else:
            factory = FabricService if kind == "service" else FabricCluster
            fabric_object = factory(name=name, kind=kind, subkind=subkind)
            log.info("creating {0} {1} {2}".format(name, kind, subkind))

        fabric_object.save(cascade=True)
def expand(cls, parameter, allow_duplicates=False, sort=False):
    """Expand a hostlist-style parameter into a list of values.

    :param parameter: hostlist expression, or None.
    :param allow_duplicates: forwarded to ``expand_hostlist``.
    :param sort: forwarded to ``expand_hostlist``.
    :return: None when ``parameter`` is None, otherwise the expanded list.
    """
    if parameter is None:
        return parameter
    # Forward the caller's options instead of hard-coded False values.
    return expand_hostlist(parameter,
                           allow_duplicates=allow_duplicates,
                           sort=sort)
def display_provision_form():
    """Show the provisioning form and queue provisioning on submit.

    On a valid submission, a provisioning task is queued for every host of
    the expanded node specification and the client is redirected to the
    task overview. Otherwise the inventory is refreshed and the form is
    rendered (after flashing "Wrong submission").
    """
    clusters = cm_config_server().get("cloudmesh.server.provisioner.clusters")
    # clusters = ['india','bravo','sierra']
    # servers = n_inventory.hostlist(cluster)
    # server = n_inventory.host(name,auth=False)
    form = ProvisionForm(csrf=False)

    if form.validate_on_submit():
        flash("Success")
        # Single-argument print calls: valid on both Python 2 and Python 3.
        print("FORM")
        pprint(form.__dict__)
        print("CLUSTER %s" % (form.cluster.data,))
        print("Service %s" % (form.service.data,))
        hosts = expand_hostlist(form.nodespec.data)
        print("Nodespec %s" % (hosts,))
        for host in hosts:
            print("PROVISION HOST %s" % (host,))
            provision.delay(host, form.service.data)
        return redirect("provision/tasks/{0}/{1}/{2}"
                        .format(form.cluster.data,
                                form.nodespec.data,
                                form.service.data))
        # return redirect("/provision/summary/")
    else:
        flash("Wrong submission")

    inventory.refresh()
    return render_template("mesh/provision/provision.html",
                           clusters=clusters,
                           form=form)
def acct_reader(self, filename):
    """Read a ``|``-delimited accounting file into a list of job records.

    Rows are skipped when ``self.jobids`` is non-empty and does not contain
    the row's ``JobID``, when ``NodeList`` is "None assigned", or when the
    row does not have exactly 13 fields (a message is printed for those).

    :param filename: path of the accounting file (first row is the header).
    :return: list of dicts, one per accepted job.
    """
    ftr = [3600, 60, 1]  # seconds per hour / minute / second field
    records = []
    with open(filename) as handle:
        for row in csv.DictReader(handle, delimiter='|'):
            if self.jobids and row['JobID'] not in self.jobids:
                continue
            if row['NodeList'] == "None assigned":
                continue
            if len(row) != 13:
                print(row['JobID'] + " is not parsed correctly")
                continue

            record = {
                'id': row['JobID'],
                'user': row['User'],
                'project': row['Account'],
                'start_time': int(parse(row['Start']).strftime('%s')),
                'end_time': int(parse(row['End']).strftime('%s')),
                'queue_time': int(parse(row['Submit']).strftime('%s')),
                'queue': row['Partition'],
                'name': row['JobName'],
                'status': row['State'].split()[0],
                'nodes': int(row['NNodes']),
                'cores': int(row['ReqCPUS']),
                'host_list': hostlist.expand_hostlist(row['NodeList']),
            }

            # Timelimit is either "D-HH:MM:SS" or "HH:MM:SS".
            limit = row['Timelimit']
            if '-' in limit:
                days, clock = limit.split('-')
            else:
                clock = limit
                days = 0
            clock_seconds = sum(
                weight * int(field)
                for weight, field in zip(ftr, clock.split(":")))
            record['requested_time'] = (int(days) * 86400 + clock_seconds) / 60

            records.append(record)
    return records
def main(argv):
    """Copy a file to cluster nodes with ``scp``.

    With two arguments (SOURCE DEST) the node list comes from the default
    Slurm configuration file; with three (NODES SOURCE DEST) it comes from
    the hostlist expression in the first argument. Any other argument count
    prints the usage text.

    :param argv: command-line arguments without the program name.
    """
    # Default slurm configuration
    slurm_conf = "/etc/slurm/slurm.conf"

    argc = len(argv)
    if argc < 2 or argc > 3:
        usage()
        return

    if argc == 2:
        nodes = parse_slurm_conf(slurm_conf)
    else:
        nodes = hostlist.expand_hostlist(argv[0])

    if not nodes:
        print("Error: empty or invalid list of cluster nodes!")
        return

    # Single-argument print calls: valid on both Python 2 and Python 3.
    for node in nodes:
        cmdline = "scp " + argv[argc - 2] + " " + node + ":" + argv[argc - 1]
        print("Running %s" % (cmdline,))
        results = commands.getstatusoutput(cmdline)
        if results[0] == 0:
            print("OK %s" % (results[1],))
        else:
            print("Error: %s %s" % (node + ")", results[1]))
def main():
    """Program entry point: parse arguments, load config, run the command.

    Installs the SIGINT handler, configures logging ("debug" when verbose,
    "info" otherwise), reads the configuration file into ``G`` and finally
    calls the function selected by the parsed arguments. Exits with status 1
    when the configuration cannot be loaded.
    """
    global logger, ARGS
    signal.signal(signal.SIGINT, sig_handler)

    ARGS = parse_args()
    setup_logging("debug" if ARGS.verbose else "info")
    logger.debug(ARGS)

    G.config = ConfigParser.SafeConfigParser()
    try:
        G.config.read(ARGS.cfgfile)
        G.hosts = hostlist.expand_hostlist(G.config.get("global", "pub_hosts"))
        G.url = G.config.get("DB", "url")
        G.port = G.config.get("global", "pub_port")
    except Exception as err:
        # ``Exception`` rather than a bare except, so that SystemExit and
        # KeyboardInterrupt are not swallowed; keep the cause for debugging.
        logger.error("Can't read configuration file")
        logger.debug("Configuration error: %s", err)
        sys.exit(1)

    ARGS.func()
def dict_key_list_table_printer(d, indexed=False):
    """Build a PrettyTable with one column per dict key.

    Accepts a dict of the form ``{key1: [list1], key2: [list2], ...}`` and
    produces a left-aligned table whose column ``key`` holds ``list``; short
    lists are padded with " " to the length of the longest one.

    :param d: dict mapping column titles to lists of cell values.
    :param indexed: when True, prepend an "index" column numbered from 1.
    :return: the populated PrettyTable.
    """
    x = PrettyTable()
    # Length of the longest column; 0 for an empty dict.
    longest = max([len(column) for column in d.values()] or [0])

    if indexed:
        if longest == 0:
            index_list = []
        else:
            index_list = hostlist.expand_hostlist(
                "[1-{0}]".format(str(longest)))
        x.add_column("index", index_list)

    # ``items()`` exists on both Python 2 and 3; ``iteritems()`` only on 2.
    for k, v in d.items():
        x.add_column(k, v + [" "] * (longest - len(v)))
    x.align = "l"
    return x
def create(self, kind, namespec):
    """Create and save fabric objects of one kind for a name specification.

    :param kind: "server", "service", "cluster" or "image"
        (see FABRIC_TYPES); any other value logs an error.
    :param namespec: hostlist expression for the names, e.g. 'i[001-003]'
        creates objects named i001, i002, i003
    :return: list of the created objects, or None when ``kind`` is unknown.
    """
    elements = []
    names = expand_hostlist(namespec)
    for name in names:
        if kind == "server":
            element = FabricServer(name=name, kind=kind)
        elif kind == "service":
            element = FabricService(name=name, kind=kind)
            log.info("creating {0} {1}".format(name, kind))
        elif kind == "cluster":
            element = FabricCluster(name=name, kind=kind)
            log.info("creating {0} {1}".format(name, kind))
        elif kind == "image":
            element = FabricImage(name=name, kind=kind)
            log.info("creating {0} {1}".format(name, kind))
        else:
            log.error(
                "kind is not defined, creation of objects failed, kind, nameregex"
            )
            return
        self.stamp()
        element.save(cascade=True)
        # Collect the object that was just saved (the original appended the
        # builtin ``object`` type by mistake).
        elements.append(element)
    return elements
def test_submit_jobs_3(self, arg1):
    """Submit with --nodelist only submits jobs for the listed nodes."""
    node_test = bench.tests.node_test.NodeTest("node")
    node_test.Submit.execute(self.directory, nodelist='tnode01[01-06]')
    self.assertTrue(bench.slurm.sbatch.called)

    directories, expected_directories = set(), set()
    job_scripts, expected_job_scripts = set(), set()
    for position, recorded in enumerate(arg1.call_args_list):
        node_name = self.nodes[position]
        script_dir = os.path.join(self.node_test_dir, node_name)
        # A call object unpacks into (positional tuple, keyword dict).
        args, kwargs = recorded

        directories.add(script_dir)
        expected_directories.add(kwargs['chdir'])
        job_scripts.add(args[0])
        expected_job_scripts.add(script_dir + '/' + node_name + '.job')

        # Nodes outside --nodelist must not have been submitted.
        for unlisted in hostlist.expand_hostlist('tnode01[07-10]'):
            self.assertNotIn(unlisted, kwargs['chdir'])

    self.assertEqual(directories, expected_directories)
    self.assertEqual(job_scripts, expected_job_scripts)
def get_resource() -> Tuple[str, List[str], Optional[List[int]]]:
    """Read the SLURM allocation of the current process.

    Returns
    -------
    Tuple[str, List[str], Optional[List[int]]]
        name of this node, expanded list of all job nodes, and the value of
        ``local.get_gpus()``

    Raises
    ------
    KeyError
        if ``SLURM_JOB_NODELIST`` or ``SLURMD_NODENAME`` is not set
    RuntimeError
        if ``SLURM_JOB_NUM_NODES`` is unset or empty
    ValueError
        if the expanded node list does not have that many entries
    ValueError
        if this node's name is missing from the node list
    """
    nodelist = hostlist.expand_hostlist(os.environ["SLURM_JOB_NODELIST"])
    nodename = os.environ["SLURMD_NODENAME"]

    num_nodes_env = os.getenv("SLURM_JOB_NUM_NODES")
    if not num_nodes_env:
        raise RuntimeError("Could not get SLURM number of nodes")
    num_nodes = int(num_nodes_env)

    found = len(nodelist)
    if found != num_nodes:
        raise ValueError(
            f"Number of slurm nodes {found} not equal to {num_nodes}")
    if nodename not in nodelist:
        raise ValueError(
            f"Nodename({nodename}) not in nodelist({nodelist}). This should not happen!"
        )

    return nodename, nodelist, local.get_gpus()
def create(self, kind, namespec):
    """Create and save fabric objects of one kind for a name specification.

    :param kind: "server", "service", "cluster" or "image"
        (see FABRIC_TYPES); any other value logs an error.
    :param namespec: hostlist expression for the names, e.g. 'i[001-003]'
        creates objects named i001, i002, i003
    :return: list of the created objects, or None when ``kind`` is unknown.
    """
    elements = []
    names = expand_hostlist(namespec)
    for name in names:
        if kind == "server":
            element = FabricServer(name=name, kind=kind)
        elif kind == "service":
            element = FabricService(name=name, kind=kind)
            log.info("creating {0} {1}".format(name, kind))
        elif kind == "cluster":
            element = FabricCluster(name=name, kind=kind)
            log.info("creating {0} {1}".format(name, kind))
        elif kind == "image":
            element = FabricImage(name=name, kind=kind)
            log.info("creating {0} {1}".format(name, kind))
        else:
            log.error(
                "kind is not defined, creation of objects failed, kind, nameregex")
            return
        self.stamp()
        element.save(cascade=True)
        # Collect the object that was just saved (the original appended the
        # builtin ``object`` type by mistake).
        elements.append(element)
    return elements
def filter_node_list(
        nodes,
        include_nodes=None,
        exclude_nodes=None,
        include_reservations=None,
        exclude_reservations=None,
        include_states=None,
        exclude_states=None,
        include_files=None,
        exclude_files=None,
):
    """Narrow a collection of nodes by state, name, reservation and file.

    State filters are applied first (intersection with ``get_nodes``), then
    the include filters (intersection with the union of all include
    sources), then the exclude filters (difference). A filter group that is
    entirely empty is not applied.

    :param nodes: iterable of candidate node names.
    :param include_nodes: hostlist expressions of nodes to keep.
    :param exclude_nodes: hostlist expressions of nodes to drop.
    :param include_reservations: reservations whose nodes are kept.
    :param exclude_reservations: reservations whose nodes are dropped.
    :param include_states: states passed to ``get_nodes``.
    :param exclude_states: states passed to ``get_nodes``.
    :param include_files: node-list files whose nodes are kept.
    :param exclude_files: node-list files whose nodes are dropped.
    :return: set of the remaining node names.
    """
    def _gather(expressions, reservations, files):
        # Union of the nodes named by all three kinds of sources.
        gathered = set()
        for expression in expressions or ():
            gathered |= set(hostlist.expand_hostlist(expression))
        for reservation in reservations or ():
            gathered |= get_reserved_nodes(reservation)
        for path in files or ():
            gathered |= set(read_node_list(path))
        return gathered

    remaining = set(nodes)

    if include_states or exclude_states:
        remaining &= set(get_nodes(
            include_states=include_states,
            exclude_states=exclude_states,
        ))

    if include_nodes or include_reservations or include_files:
        remaining &= _gather(include_nodes, include_reservations,
                             include_files)

    if exclude_nodes or exclude_reservations or exclude_files:
        remaining -= _gather(exclude_nodes, exclude_reservations,
                             exclude_files)

    return remaining
def get_status_short(self, raw_hosts=None):
    """Get the short status of baremetal computers.

    Backs **rain status --short [HOSTS]**.

    :param string raw_hosts: one or more hosts in hostlist format; when
        empty or None, None is passed on instead of a host list
    :return: whatever ``self.status.get_status_short`` returns, documented
        upstream as a dict such as
        {"host1": "deployed", "host2": "deploying", "host3": "failed"}
    """
    if raw_hosts:
        hosts = expand_hostlist(raw_hosts)
    else:
        hosts = None
    return self.status.get_status_short(hosts)
def get_worker_host_list(base_port, workers_per_host):
    """List the ``host:port`` address of every worker in the allocation.

    Hosts come from the expanded ``SLURM_NODELIST``; each host gets
    ``workers_per_host`` consecutive ports starting at ``base_port``.

    :param base_port: first port used on every host.
    :param workers_per_host: number of workers (ports) per host.
    :return: list of 'host:port' strings, grouped by host.
    """
    hosts = expand_hostlist(os.environ['SLURM_NODELIST'])
    ports = [base_port + offset for offset in range(workers_per_host)]
    return ['{}:{}'.format(host, port) for host in hosts for port in ports]
def get_slurm_allocated_nodes():
    """Return the distinct nodes of ``SLURM_NODELIST``, newline-terminated.

    :return: list of unique node names, each with a trailing "\\n"; the
        order is that of set iteration.
    """
    nodelist = os.environ.get("SLURM_NODELIST")
    unique_lines = {name + "\n" for name in hostlist.expand_hostlist(nodelist)}
    return list(unique_lines)
def ip_name_pair(self, nameregex, format_string, start=1):
    """Pair generated names with the addresses of a hostlist expression.

    :param nameregex: hostlist expression that expands to the addresses.
    :param format_string: format string given the running number, counting
        up from ``start``, to produce each name.
    :param start: number used for the first name.
    :return: ``zip`` of (name, address) pairs.
    """
    ips = expand_hostlist(nameregex)
    names = [format_string.format(start + offset)
             for offset, _ in enumerate(ips)]
    return zip(names, ips)
def get_worker_host_list(base_port, workers_per_host):
    """List the ``host:port`` address of every worker in the allocation.

    Hosts come from the expanded ``SLURM_NODELIST``; each host gets
    ``workers_per_host`` consecutive ports starting at ``base_port``.

    :param base_port: first port used on every host.
    :param workers_per_host: number of workers (ports) per host.
    :return: list of 'host:port' strings, grouped by host.
    """
    hosts = expand_hostlist(os.environ['SLURM_NODELIST'])
    ports = [base_port + offset for offset in range(workers_per_host)]
    return ['{}:{}'.format(host, port) for host in hosts for port in ports]
def get_status_summary(self, raw_hosts=None):
    """Get the status summary of baremetal computers.

    Backs **rain status --summary [HOSTS]**.

    :param string raw_hosts: one or more hosts in hostlist format; when
        empty or None, None is passed on instead of a host list
    :return: whatever ``self.status.get_status_summary`` returns, documented
        upstream as a dict such as
        {"deployed": 1, "deploying":2, "failed":2, "total": 5}
    """
    if raw_hosts:
        hosts = expand_hostlist(raw_hosts)
    else:
        hosts = None
    return self.status.get_status_summary(hosts)
def baremetal_computer_host_off(self, raw_hosts):
    """Disable (switch OFF) computers for baremetal provisioning.

    Backs **rain admin off HOSTS**.

    :param string raw_hosts: one or more hosts in hostlist format; when
        empty or None, None is passed on instead of a host list
    :return: result of ``self.baremetal.disable_baremetal_computers``,
        documented upstream as True on success and False otherwise
    """
    if raw_hosts:
        hosts = expand_hostlist(raw_hosts)
    else:
        hosts = None
    return self.baremetal.disable_baremetal_computers(hosts)
def filtered_hosts_based_policy(user, projects, hosts):
    """Keep only the hosts allowed by the policy of a user and projects.

    :param user: user whose policy is looked up.
    :param projects: the user's projects, passed to the policy lookup.
    :param hosts: candidate hosts.
    :return: the hosts found in the expanded policy, in input order; an
        empty list when there is no policy or it expands to nothing.
    """
    wrapper = RainCobblerWrapper()
    policy = wrapper.get_policy_based_user_or_its_projects(user, projects)
    if not policy:
        return []
    allowed = hostlist.expand_hostlist(policy)
    if not allowed:
        return []
    return [candidate for candidate in hosts if candidate in allowed]
def filtered_hosts_based_baremetal(raw_hosts):
    """Keep only the requested hosts that are known baremetal computers.

    :param raw_hosts: hostlist expression naming the requested hosts
    :return: the expanded hosts that also appear in the list reported by
        ``RainCobblerWrapper.baremetal_computer_host_list``, in expansion
        order; an empty list when that list is empty
    """
    cobbler = RainCobblerWrapper()
    requested = hostlist.expand_hostlist(raw_hosts)
    known = cobbler.baremetal_computer_host_list()
    if not known:
        return []
    return [host for host in requested if host in known]
def set_compute_nodelist(self, nodelist):
    """Replace the inventory's compute-node hosts with an expanded hostlist.

    The mapping at ``all -> children -> compute_nodes -> hosts`` of
    ``self._inventory_content`` is rebuilt from scratch: every expanded host
    name becomes a key whose value is ``None``.

    :param nodelist: hostlist expression naming the compute nodes
    """
    expanded = hostlist.expand_hostlist(nodelist)
    compute_nodes = self._inventory_content['all']['children']['compute_nodes']
    # dict.fromkeys maps every host to None, matching the per-host assignment.
    compute_nodes['hosts'] = dict.fromkeys(expanded)
def generate(self):
    """Populate the inventory from the ``cloudmesh.inventory`` configuration.

    After generating the bootspec and the globals, one network element is
    inserted per server and per network of every configured cluster.  Then
    the rack status, the baremetal base document structure and the baremetal
    MAC data are initialised.
    """
    self.generate_bootspec()
    self._generate_globals()

    clusters = self.config.get("cloudmesh.inventory")
    for cluster_name in clusters:
        spec = clusters[cluster_name]
        # Only the number of servers in the cluster drives the inner loop.
        server_count = len(expand_hostlist(spec["id"]))

        # Networks are numbered from 0 within each cluster.
        for net_id, network in enumerate(spec["network"]):
            ids = expand_hostlist(network["id"])
            labels = expand_hostlist(network["label"])
            addresses = expand_hostlist(network["range"])
            network_name = network["name"]

            for position in range(server_count):
                server_id = ids[position]
                # Start from a copy of the network definition, minus the
                # address range, which is replaced by one concrete address.
                element = dict(network)
                del element['range']
                fields = {'cm_type': "inventory",
                          'cm_key': 'server',
                          'cm_kind': 'server',
                          'cm_id': server_id,
                          'cm_cluster': cluster_name,
                          'id': server_id,
                          'label': labels[position],
                          'network_name': network_name,
                          'cm_network_id': net_id,
                          'ipaddr': addresses[position],
                          'cm_attribute': 'network'}
                element.update(fields)
                self.insert(element)

    # added by HC
    # init rack status
    self.generate_rack_status()

    # init baremetal computer management, maybe will be deprecated later
    baremetal_db = BaremetalDB()
    baremetal_db.init_base_document_structure()

    # insert necessary mac info of baremetal computers to inventory
    baremetal_computer = BaremetalComputer()
    baremetal_computer.insert_mac_data_to_inventory()
def _configure(self): slurm_nodelist = os.environ.get("SLURM_NODELIST") if slurm_nodelist is None: msg = "$SLURM_NODELIST not set!" self._log.error(msg) raise RuntimeError(msg) # Parse SLURM nodefile environment variable slurm_nodes = hostlist.expand_hostlist(slurm_nodelist) self._log.info("Found SLURM_NODELIST %s. Expanded to: %s", slurm_nodelist, slurm_nodes) # $SLURM_NPROCS = Total number of cores allocated for the current job slurm_nprocs_str = os.environ.get("SLURM_NPROCS") if slurm_nprocs_str is None: msg = "$SLURM_NPROCS not set!" self._log.error(msg) raise RuntimeError(msg) else: slurm_nprocs = int(slurm_nprocs_str) # $SLURM_NNODES = Total number of (partial) nodes in the job's resource allocation slurm_nnodes_str = os.environ.get("SLURM_NNODES") if slurm_nnodes_str is None: msg = "$SLURM_NNODES not set!" self._log.error(msg) raise RuntimeError(msg) else: slurm_nnodes = int(slurm_nnodes_str) # $SLURM_CPUS_ON_NODE = Number of cores per node (physically) slurm_cpus_on_node_str = os.environ.get("SLURM_CPUS_ON_NODE") if slurm_cpus_on_node_str is None: msg = "$SLURM_CPUS_ON_NODE not set!" self._log.error(msg) raise RuntimeError(msg) else: slurm_cpus_on_node = int(slurm_cpus_on_node_str) # Verify that $SLURM_NPROCS <= $SLURM_NNODES * $SLURM_CPUS_ON_NODE if not slurm_nprocs <= slurm_nnodes * slurm_cpus_on_node: self._log.warning( "$SLURM_NPROCS(%d) <= $SLURM_NNODES(%d) * $SLURM_CPUS_ON_NODE(%d)", slurm_nprocs, slurm_nnodes, slurm_cpus_on_node, ) # Verify that $SLURM_NNODES == len($SLURM_NODELIST) if slurm_nnodes != len(slurm_nodes): self._log.error("$SLURM_NNODES(%d) != len($SLURM_NODELIST)(%d)", slurm_nnodes, len(slurm_nodes)) # Report the physical number of cores or the total number of cores # in case of a single partial node allocation. self.cores_per_node = min(slurm_cpus_on_node, slurm_nprocs) self.node_list = slurm_nodes
def parse_slurm_env(env_var_name):
    """Expand the hostlist expression stored in an environment variable.

    :param env_var_name: name of the environment variable to read
    :return: list of expanded host names; an empty list, after printing an
        error message, when the variable is not set
    """
    env = os.getenv(env_var_name)
    if env is None:
        # A single parenthesised argument prints identically under the
        # Python 2 print statement and the Python 3 print function, so the
        # module no longer fails to compile on Python 3 because of this line.
        print("Error: cannot get " + env_var_name + " from environment")
        return []
    return list(hostlist.expand_hostlist(env))
def lookup(list):
    """Expand a compact node-list expression into a comma-separated string.

    ``*<digits>`` suffixes are dropped and ``:`` separators become ``,``
    before the expression is handed to ``hostlist.expand_hostlist``.

    :param list: node-list expression (the name shadows the builtin but is
        kept so existing keyword callers continue to work)
    :return: the expanded host names joined with ``,``
    """
    debug("Forward Lookup:")
    debug(list)
    if "*" in list or ":" in list:
        # MOAB events on Cray XE6 use node list format [1-7]*16:[10-99]*7
        # Raw string: "\*" and "\d" are invalid escapes in a plain literal.
        list = re.sub(r"\*\d+", "", list)  # omit *nprocs
        list = list.replace(":", ",")  # change : to ,
    new_list = hostlist.expand_hostlist(list)
    # this takes the list we get back and turns it into a string for splunk
    return ",".join(str(n) for n in new_list)
def main(argv):
    """Copy a file to every cluster node in parallel using ``scp``.

    Options parsed from ``argv``:

    * ``-h``            print usage and exit
    * ``-p PARTITION``  take the nodes of this partition from the Slurm conf
    * ``-n``            print the node list and exit
    * ``-N WORKERS``    number of parallel worker processes (default 8)

    Positional arguments are ``[HOSTLIST] SOURCE DEST``; with three
    arguments the first one is expanded as the node list.

    :param argv: command-line arguments without the program name
    """
    # Default slurm configuration
    slurm_conf = "/etc/slurm/slurm.conf"
    num_workers = 8
    try:
        # "N:" was missing from the option string, so -N was rejected as an
        # unknown option and the branch handling it below could never run.
        opts, args = getopt.getopt(argv, "p:nhN:")
    except getopt.GetoptError:
        usage()
        sys.exit()

    nodes = []
    for opt, arg in opts:
        # Compare with ==: `opt in ("-h")` is a substring test on a string,
        # not membership in a one-element tuple.
        if opt == "-h":
            usage()
            sys.exit()
        elif opt == "-p":
            # partition
            nodes = parse_slurm_conf(slurm_conf, arg)
            if not nodes:
                print("Error: invalid partition - " + arg)
                sys.exit()
        elif opt == "-n":
            # dump nodelist
            if not nodes:
                # NOTE(review): called without a partition; confirm that the
                # parse_slurm_conf in use accepts a single argument.
                nodes = parse_slurm_conf(slurm_conf)
            for node in nodes:
                print(node)
            sys.exit()
        elif opt == "-N":
            # set workers
            num_workers = int(arg)

    argc = len(args)
    if argc < 2 or argc > 3:
        usage()
        return

    if argc == 3:
        nodes = hostlist.expand_hostlist(args[0])
    elif not nodes:
        nodes = parse_slurm_conf(slurm_conf)

    if not nodes:
        print("Error: empty or invalid list of cluster nodes!")
        return

    source, destination = args[argc - 2], args[argc - 1]
    # NOTE(review): the command is assembled as a plain string; if
    # copy_to_node runs it through a shell, SOURCE/DEST need quoting.
    cmdline = [
        [node, "scp " + source + " " + node + ":" + destination]
        for node in nodes
    ]
    pool = Pool(num_workers)
    results = pool.map(copy_to_node, cmdline)
    pool.close()
    for result in results:
        if result != "OK":
            print(result)
def get_ps_host_list(base_port, num_ps):
    """Assign ``num_ps`` parameter-server endpoints round-robin over the nodes.

    Nodes come from expanding ``SLURM_NODELIST``.  The first pass over the
    nodes uses ``base_port``; each further pass uses the next port number.

    :param base_port: port used for the first pass over the nodes
    :param num_ps: number of endpoints to produce (must be below 10000000)
    :return: list of ``"host:port"`` strings of length ``num_ps``; empty when
        ``num_ps`` is zero or negative
    :raises KeyError: if ``SLURM_NODELIST`` is not set
    :raises ValueError: if endpoints are requested but the node list is empty
    """
    assert(num_ps < 10000000)
    hosts = expand_hostlist(os.environ['SLURM_NODELIST'])

    if num_ps <= 0:
        return []
    if not hosts:
        # Without this guard the loop below never adds an endpoint and
        # therefore never reaches its return statement.
        raise ValueError(
            "SLURM_NODELIST expands to no hosts; cannot place {} "
            "parameter server(s)".format(num_ps))

    endpoints = []
    port = base_port
    while True:
        for host in hosts:
            if len(endpoints) >= num_ps:
                return endpoints
            endpoints.append('{}:{}'.format(host, port))
        # Every node has an endpoint on this port; continue on the next one.
        port += 1
def stateChangeLogic(record, nodes, start_state, end_state):
    """Apply a state transition to the nodes named in an event record.

    ``nodes`` is expanded into individual node names (with one repair attempt
    for malformed hostlists).  ``nodeStateChange`` is then called for every
    node whose current state in ``node_states`` is missing, ``"Unknown"`` or
    equal to ``start_state``, or for every node when ``start_state`` is
    ``"*"``.

    :param record: event record; ``_time`` and ``_raw`` are read, and
        ``_raw`` is rewritten when the node list is longer than 500 characters
    :param nodes: node-list string, either comma separated or in hostlist
        range notation
    :param start_state: state a node must currently be in, or ``"*"`` for any
    :param end_state: state passed on to ``nodeStateChange``
    """
    global output_results, options, known_states, node_states

    if "-" in nodes or "[" in nodes:
        try:
            # MOAB JOBSTART events on Cray XE6 use node list format [1-7]*16:[10-999]*7
            nodes = re.sub(r"\*\d+", "", nodes)  # omit *nprocs
            nodes = re.sub(":", ",", nodes)  # change : to ,
            node_list = hostlist.expand_hostlist(nodes)
        except Exception:
            # try to deal with BadHostlist exceptions
            # (not a bare except, so KeyboardInterrupt/SystemExit propagate)
            debug2("---- Bad hostlist: " + str(record['_time']) + " nodes=" + nodes)
            # guess it is missing a closing bracket, and truncate at last comma
            m = re.match(r"(^.*)\[(.*),(.*)", nodes)
            if m is not None:
                nodes = m.group(1) + "[" + m.group(2) + "]"
                debug2("---- Changing to: nodes=" + nodes)
            # do or die: a second failure propagates to the caller
            node_list = hostlist.expand_hostlist(nodes)
    else:
        node_list = nodes.split(",")

    if len(nodes) > 500:
        # omit long node lists
        record['_raw'] = record['_raw'].replace(nodes, '(LONG_NODE_LIST)')

    # the below is a hack for cielo, because RSV lines don't include some service nodes.
    # if RSV lines list at least 95% of all known hosts, apply it to all hosts.
    raw = str(record['_raw'])
    if raw.find("RSVSTART") != -1 or raw.find("RSVEND") != -1:
        these = len(node_list)
        known_total = len(node_states)
        # the hardcoded .95 may cause problems later and should be improved.
        # eg it could be an argument and set via local/savedsearches.conf for each machine,
        # but really it should be a percentage of the hosts in each hpc_system index...
        if known_total > these and these > 0.95 * known_total:
            debug2("Applying RSV to all known hosts (" + str(these) + ">0.95*" + str(known_total) + "): " + raw)
            node_list = node_states.keys()

    # debug2("---- in stateChangeLogic:" + str(record['_time']) + " start="+ start_state + " end="+ end_state + " nodes="+nodes)
    for node in node_list:
        current_state = node_states.get(node)
        if (current_state is None or current_state == "Unknown"
                or start_state == "*" or current_state == start_state):
            nodeStateChange(record, node, current_state, end_state)