def gen_slurm(g, out):
    """Write a Slurm topology.conf for the switches found in an IB graph.

    g: A networkx graph representing the IB network
    out: Output file-like object

    For every node whose ``type`` attribute is ``'Switch'`` one
    ``SwitchName=`` line is written, listing the labels of its neighbouring
    switches and of its other neighbours in compressed hostlist notation.
    Re-raises ImportError (after printing a hint) when python-hostlist is
    not installed.
    """
    try:
        import hostlist
    except ImportError:
        print("""To generate a slurm topology.conf, you need to install python-hostlist, https://pypi.python.org/pypi/python-hostlist""")
        raise

    out.write('# topology.conf generated by ibtopo2dot.py\n')
    for name, neighbours in g.adjacency():
        if g.nodes[name]['type'] != 'Switch':
            continue

        # Split the neighbours into peer switches and everything else.
        peer_switches = sorted(
            g.nodes[peer]['label'] for peer in neighbours
            if g.nodes[peer]['type'] == 'Switch')
        end_nodes = sorted(
            g.nodes[peer]['label'] for peer in neighbours
            if g.nodes[peer]['type'] != 'Switch')

        line = 'SwitchName=%s' % (g.nodes[name]['label'],)
        if peer_switches:
            line += " Switches=" + hostlist.collect_hostlist(peer_switches)
        if end_nodes:
            line += " Nodes=" + hostlist.collect_hostlist(end_nodes)
        out.write(line + '\n')
def gen_slurm(g, out):
    """Write a Slurm topology.conf for the switches found in an IB graph.

    g: A networkx graph representing the IB network
    out: Output file-like object

    For every node whose ``type`` attribute is ``'Switch'`` one
    ``SwitchName=`` line is written, listing the labels of its neighbouring
    switches and of its other neighbours in compressed hostlist notation.
    Re-raises ImportError (after printing a hint) when python-hostlist is
    not installed.
    """
    try:
        import hostlist
    except ImportError:
        print("""To generate a slurm topology.conf, you need to install python-hostlist, https://pypi.python.org/pypi/python-hostlist""")
        raise

    out.write('# topology.conf generated by ibtopo2dot.py\n')
    # Node attributes are read through ``g.nodes``: the older ``g.node``
    # alias no longer exists in current networkx releases, while ``g.nodes``
    # works on every release that provides ``adjacency()``.
    node_attrs = g.nodes
    for n, nbrs in g.adjacency():
        if node_attrs[n]['type'] != 'Switch':
            continue
        switches = []
        nodes = []
        for nbr in nbrs:
            label = node_attrs[nbr]['label']
            if node_attrs[nbr]['type'] == 'Switch':
                switches.append(label)
            else:
                nodes.append(label)
        switchstring = ""
        if switches:
            switches.sort()
            switchstring = " Switches=" + hostlist.collect_hostlist(switches)
        nodestr = ''
        if nodes:
            nodes.sort()
            nodestr = " Nodes=" + hostlist.collect_hostlist(nodes)
        out.write('SwitchName=%s%s%s\n' % (node_attrs[n]['label'],
                                           switchstring, nodestr))
def rlookup(list):
    """Collapse a bracketed, comma-separated string of names into a hostlist.

    ``list`` is a string (despite the name); square brackets are removed, the
    remainder is split on commas and the pieces are handed to
    ``hostlist.collect_hostlist``.
    """
    debug("Reverse Lookup:")
    debug(list)
    # hostlist does not like square brackets, so strip them before the
    # string is split into its individual entries.
    unbracketed = list.replace("[", "").replace("]", "")
    entries = []
    for piece in unbracketed.split(","):
        entries.append(str(piece))
    return hostlist.collect_hostlist(entries)
async def ipmi_sensors(hosts_list, username, password, record_ids=None):
    """Run ipmi-sensors against a set of hosts and return its raw output.

    :param hosts_list: List of hosts to be queried
    :param username: user name to query data
    :param password: password to query data
    :param record_ids: ids of the records that are queried
    :return: tuple of (timestamp taken just before the subprocess is
        started, decoded stdout of the ipmi-sensors call)
    """
    arguments = [
        '-h', hostlist.collect_hostlist(hosts_list),
        '-u', username,
        '-p', password,
        # fanout is capped at 1024 parallel connections
        '-F', str(min(1024, len(hosts_list))),
    ]
    if record_ids:
        arguments += ['-r', ','.join(record_ids)]

    query_timestamp = metricq.Timestamp.now()
    process = await asyncio.create_subprocess_exec(
        *CMD_IPMI_SENSORE_BASE,
        *arguments,
        stdout=asyncio.subprocess.PIPE,
    )
    # stderr is not piped, so only stdout carries data here.
    raw_stdout, _ = await process.communicate()
    return query_timestamp, raw_stdout.decode()
def execute(directory, include_states=None, exclude_states=None, **kwargs):
    """Write the ``node_list`` and ``error_nodes`` files into ``directory``.

    All nodes are fetched via ``bench.util.get_nodes()`` and filtered with
    ``bench.util.filter_node_list``; nodes that pass go to ``node_list``, the
    remainder to ``error_nodes``. When neither ``include_states`` nor
    ``exclude_states`` is given, nodes in state down/draining/drained are
    excluded. IOErrors while writing are logged, not raised.
    """

    def write_sorted(filename, nodes):
        # Best effort: a failed write is reported but does not abort the run.
        try:
            bench.util.write_node_list(filename, sorted(nodes))
        except IOError as ex:
            logger.error('unable to write {0}'.format(filename))
            logger.debug(ex, exc_info=True)

    if not include_states and not exclude_states:
        exclude_states = ['down', 'draining', 'drained']

    node_list_filename = os.path.join(directory, 'node_list')
    logger.info('creating {0}'.format(node_list_filename))
    all_nodes = bench.util.get_nodes()
    node_list = bench.util.filter_node_list(
        all_nodes,
        include_states=include_states,
        exclude_states=exclude_states,
        **kwargs)
    logger.info('nodes to test: {0}'.format(len(node_list)))
    write_sorted(node_list_filename, node_list)

    error_nodes_filename = os.path.join(directory, 'error_nodes')
    # NOTE(review): relies on both values supporting set difference.
    error_nodes = all_nodes - node_list
    if error_nodes:
        summary = hostlist.collect_hostlist(error_nodes)
        logger.warning('error nodes: {0} ({1} nodes)'.format(
            summary, len(error_nodes)))
    write_sorted(error_nodes_filename, error_nodes)
def print_progress(label, hosts, data):
    """Print a status section for the hosts that share one ``label``.

    A header with the label and host count is always printed. Empty host
    lists get a placeholder line; for the labels ``failed`` and ``unknown``
    the hosts are shown as one collected hostlist; for every other label each
    host is listed with ``data[host]["progress"]`` as a percentage.
    """
    count = len(hosts)
    print("Status of host --{0}-- ({1}): ".format(label, count))
    if count == 0:
        print("\tEmpty records.")
        return
    if label in ["failed", "unknown"]:
        # No progress value is printed for these states.
        print("\thost list: ", hostlist.collect_hostlist(hosts))
        return
    for host in hosts:
        print("\t{0}\t{1}%".format(host, data[host]["progress"]))
async def log_loop(configs, log_interval):
    """Log, forever, how many hosts of each config are currently active.

    For every entry of ``configs`` the number of hosts whose ``status`` is
    ``Status.ACTIVE`` is logged together with the total and the collected
    host names; the coroutine then sleeps ``log_interval`` seconds and starts
    over. It never returns on its own.
    """
    while True:
        for conf in configs:
            host_table = conf['hosts']
            active_count = len([
                entry for entry in host_table.values()
                if entry['status'] == Status.ACTIVE
            ])
            message = '{0} of {1} are active in {2}'.format(
                active_count,
                len(host_table),
                hostlist.collect_hostlist(host_table.keys()),
            )
            logger.info(message)
        await asyncio.sleep(log_interval)
def get_policy_based_user_or_its_projects(self, user, projects):
    """Merge the policies of a user and of the user's projects.

    :param string user: only one valid username
    :param list projects: a list of project
    :return: a hostlist string covering every host named in the merged
        policy, or None if neither the user nor the projects have a policy
    """
    user_policy = self.get_policy_based_user(user)
    group_policy = self.get_policy_based_group(collect_hostlist(projects))
    if not (user_policy or group_policy):
        return None

    if user_policy:
        # The group policy is merged into the user policy object in place.
        valid_policy = user_policy
        if group_policy:
            valid_policy.update(group_policy)
    else:
        valid_policy = group_policy

    expanded = [
        host
        for name in valid_policy
        for host in expand_hostlist(valid_policy[name])
    ]
    # De-duplicate before compressing back into hostlist notation.
    return collect_hostlist(list(set(expanded)))
def bits_from_string_hpctools(self, mask):
    """Return the positions of the set bits of ``mask`` as a hostlist string.

    :param mask: integer written as text; it is parsed with ``int(mask, 0)``,
        so prefixes such as ``0x`` or ``0b`` select the base
    :return: result of ``hostlist.collect_hostlist`` over the decimal bit
        indices (least significant bit is index 0)
    :raises ValueError: if ``mask`` is not a valid integer literal or is
        negative
    """
    mask_int = int(mask, 0)
    if mask_int < 0:
        # Right-shifting a negative int converges to -1, never to 0, so a
        # shift-until-zero scan would not terminate.
        raise ValueError("mask must not be negative: %r" % (mask,))
    set_bits = [
        str(index)
        for index in range(mask_int.bit_length())
        if (mask_int >> index) & 1
    ]
    return hostlist.collect_hostlist(set_bits)
def merge_policy_based_ug(self, policys, flag_user=True, name=None):
    """Merge policies per user/group and group names that share hosts.

    :param dict policys: all the possible policys for users/groups
    :param boolean flag_user: True means user, False means group
    :param string name: the name of one or more user/group, None means all
        user/groups
    :return: a dict of user/group and policy with the formation
        {"name1": "hostlist", "name2": "hostlist2"}, or None when there is
        nothing to report
    """
    if not policys:
        return None

    wanted = expand_hostlist(name) if name else None

    # Step 1: gather every granted host per individual user/group name.
    hosts_by_member = {}
    for pattern in policys:
        members = expand_hostlist(pattern)
        if wanted:
            members = [member for member in members if member in wanted]
        granted = []
        for entry in policys[pattern]:
            granted.extend(expand_hostlist(entry["policy"]))
        for member in members:
            hosts_by_member.setdefault(member, []).extend(granted)

    # Step 2: flip the mapping so members with identical hosts end up together.
    members_by_hosts = {}
    for member, hosts in hosts_by_member.items():
        members_by_hosts.setdefault(collect_hostlist(hosts), []).append(member)

    # Step 3: compress each member group into a single hostlist-style key.
    merged = {}
    for hosts, members in members_by_hosts.items():
        merged[collect_hostlist(members)] = hosts
    return merged or None
def merge_policy_based_ug(self, policys, flag_user=True, name=None):
    """Combine policies by user/group name, merging names with equal hosts.

    :param dict policys: all the possible policys for users/groups
    :param boolean flag_user: True means user, False means group
    :param string name: the name of one or more user/group, None means all
        user/groups
    :return: a dict of user/group and policy with the formation
        {"name1": "hostlist", "name2": "hostlist2"}; None if ``policys`` is
        empty or nothing matched
    """
    if not policys:
        return None

    selected = expand_hostlist(name) if name else None
    per_name = {}
    for key in policys:
        candidates = expand_hostlist(key)
        targets = candidates
        if selected:
            targets = [who for who in candidates if who in selected]
        hosts = []
        for rule in policys[key]:
            hosts += expand_hostlist(rule["policy"])
        for who in targets:
            if who not in per_name:
                per_name[who] = []
            per_name[who] += hosts

    # Replace each host list by its compressed hostlist string.
    compressed = {}
    for who in per_name:
        compressed[who] = collect_hostlist(per_name[who])

    # Invert the mapping: identical hostlist strings collect their names.
    names_for = {}
    for who in compressed:
        names_for.setdefault(compressed[who], []).append(who)

    result = {}
    for host_string in names_for:
        result[collect_hostlist(names_for[host_string])] = host_string
    if not result:
        return None
    return result
def nodeStateList(record):
    """Append a per-state summary of the global ``node_states`` to the results.

    The nodes of the module-level ``node_states`` mapping (node -> state) are
    grouped by state. A new record carrying the ``_time`` of ``record`` and
    one ``StateName_<state>`` field per state (a collected hostlist) is
    appended to the module-level ``output_results``.
    """
    global node_states

    # Group node names by the state they are currently in.
    nodes_by_state = {}
    for node, state in node_states.items():
        nodes_by_state.setdefault(state, []).append(node)

    # Build the summary record.
    summary = {'_time': record.get('_time')}
    for state, members in nodes_by_state.items():
        summary["StateName_" + state] = hostlist.collect_hostlist(members)
    output_results.append(summary)
def nodeStateList(record):
    """Emit one record listing, per state, the nodes currently in that state.

    Reads the module-level ``node_states`` mapping (node -> state) and
    appends to the module-level ``output_results`` a dict holding the
    ``_time`` of ``record`` plus a ``StateName_<state>`` entry for every
    state, whose value is the collected hostlist of its nodes.
    """
    global node_states

    # Bucket the nodes by state.
    buckets = {}
    for node in node_states:
        state = node_states[node]
        if state not in buckets:
            buckets[state] = []
        buckets[state].append(node)

    # Assemble the new record.
    entry = dict(_time=record.get('_time'))
    for state in buckets:
        field = "StateName_" + state
        entry[field] = hostlist.collect_hostlist(buckets[state])
    output_results.append(entry)
def wlm_get_nodes_in_reservation(location, task_id, task_node_type):
    """List the nodes of the reservation ``scops-<task_id>``.

    Runs ``rsvnodelist scops-<task_id>`` through ``remote_execute`` and
    parses its output, which is read as one ``<node types> <nodelist>`` pair
    per line (node types comma-separated, nodelist in hostlist notation).

    :param location: passed through to ``remote_execute``
    :param task_id: identifier used to build the reservation name
    :param task_node_type: only lines listing this node type are counted
    :return: tuple ``(success, hostlist string, node count)``;
        ``(success, "", 0)`` when the output reports no nodelist or no
        reservation, ``(False, None, 0)`` when the command check fails
    :raises ValueError: if a non-blank output line does not consist of
        exactly two space-separated fields
    """
    command = f"rsvnodelist scops-{task_id}"
    output = remote_execute(location, command)
    success = check_command_with_output(command, output)
    if not success:
        return False, None, 0

    if output.startswith(('No nodelist', 'No reservation')):
        return success, "", 0

    resources = []
    for line in output.split('\n'):
        if not line.strip():
            # A trailing newline or an empty line carries no reservation
            # data; unpacking it below would raise ValueError.
            continue
        node_type, nodelist = line.split(' ')
        if task_node_type in node_type.split(','):  # should always be true
            resources += expand_hostlist(nodelist)
    return success, collect_hostlist(resources), len(resources)
def run(self, params, args):
    """Push the 411 files, restart slurm and reconfigure the compute nodes.

    The 411 make target is forced, slurm is restarted after a 10 second
    pause, and every node whose membership name contains "Compute" is then
    asked via pdsh to run ``service slurm reconfig``. Nothing is sent when
    the query returns no nodes.
    """
    # Previously also regenerated here (kept disabled):
    #   rocks report slurmnodes > /etc/slurm/nodenames.conf
    #   rocks report slurmpartitions > /etc/slurm/partitions.conf
    for command in (
            "make -C /var/411 force >/dev/null",
            "sleep 10",
            "service slurm restart >/dev/null",
    ):
        os.system(command)

    self.db.execute(
        'select nodes.name from nodes, memberships where nodes.membership = memberships.id and memberships.name like "%Compute%" order by rack,rank'
    )
    # Each fetched row is formatted with "%s" to obtain the node name.
    compute_hosts = ["%s" % row for row in self.db.fetchall()]
    if compute_hosts:
        targets = hostlist.collect_hostlist(compute_hosts)
        os.system("/opt/pdsh/bin/pdsh -w %s service slurm reconfig" % targets)
def display_results(self):
    """Log and print the summary of passing, failing and erroring tests.

    Failure entries with an empty result list are treated as unparsable:
    they are removed from the failures and their comma-separated key parts
    are added to ``results['error']['not_parsable']``. The remaining
    failures and all non-empty error groups are passed to
    ``self.log_results`` and printed as tables.
    """
    results = self.results

    fail_table = []
    # Iterate over a snapshot because entries are popped while looping.
    for key, result in list(results['fail'].items()):
        if result != []:
            fail_table.append([key] + result)
            continue
        results['fail'].pop(key, None)
        results['f_tests'].remove(key)
        for part in key.split(','):
            results['error']['not_parsable'].add(part)

    error_table = [
        [hostlist.collect_hostlist(result), key]
        for key, result in list(results['error'].items())
        if result
    ]

    self.log_results(fail_table, error_table)

    # Summary
    total = len(self.node_list)
    print("### Summary ###")
    print('passing nodes: {passed} / {total}'.format(
        passed=len(results['p_nodes']), total=total))
    print('failing nodes: {passed} / {total}'.format(
        passed=len(results['f_nodes']), total=total))
    print('error nodes: {passed} / {total}'.format(
        passed=len(results['e_nodes']), total=total))

    if results['p_tests']:
        self.results_logger.info("\n### Passing Tests ###")
        self.results_logger.info(sorted(list(results['p_tests'])))

    if fail_table:
        print("\n### Failing Tests ###")
        fail_headers = ['Hardware', 'Test', 'Result', 'Expected', 'Res/Exp']
        print(tabulate.tabulate(fail_table, headers=fail_headers,
                                floatfmt=".2f"))

    if error_table:
        print("\n### Missing/Error Tests ###")
        print(tabulate.tabulate(error_table, headers=['Hardware', 'Reason']))
action='store', dest='file', help="Name of the file to parse. If not specified, STDIN is used.", default="-") args = parser.parse_args() cables = defaultdict(dict) leaves = defaultdict(set) spines = defaultdict(set) for line in fileinput.input(args.file): line = line.strip() _, port1, type1, name1, _, port2, type2, name2 = line.split(';') if type1 == 'FI': name1 = name1.split(" ")[0] if type2 == 'SW': cables[name2].update({port2: (name1, port1)}) leaves[name2].add(name1) else: cables[name1].update({port1: (name2, port2)}) spines[name1].add(name2) for switch, nodes in leaves.items(): nodenames = filter(lambda x: args.node_filter in x, nodes) nodenames = collect_hostlist(nodenames) print "SwitchName=%s Nodes=%s" % (switch, nodenames) for switch, nodes in spines.items(): print "SwitchName=%s Switches=%s" % (switch, ",".join(nodes))
## JSON
if json_stdout:
    print(json.dumps(bad_disks_dict))

if json_file:
    # An existing file is appended to, otherwise a new one is created.
    json_mode = 'a' if os.path.isfile(json_file) else 'w'
    with open(json_file, json_mode) as outfile:
        json.dump(bad_disks_dict, outfile)
        outfile.write("\n")
## No more JSON

# Compress the disk names into hostlist-style ranges, e.g.
#   [ '1I:1:1' ,'1I:1:2' ] -> 1I:1:[1-2]
#   [ 'Physical Drive (4 TB SAS HDD) 1I:1:32', ..., '... 1I:1:64' ]
#       -> Physical Drive (4 TB SAS HDD) 1I:1:[32-64]
collected_bad_disks = hostlist.collect_hostlist(bad_disks)

# Pick the status line and exit code, then report and exit in one place.
if no_diff_cnt > 0 and diff_cnt < 1:
    if verbosely or debug:
        exit_message = "no differences in the counters between any disks in the reports"
    else:
        exit_message = "OK: No increases of %s on any disks. (%s vs %s)" % (
            track_this_error_counter, timegenerated1, timegenerated2)
    exit_status = OK
elif no_diff_cnt == 0 and diff_cnt == 0:
    exit_message = "UNKNOWN: Found nothing, does '%s' exist?" % track_this_error_counter
    exit_status = UNKNOWN
else:
    exit_message = "CRITICAL: %s increased on these disks: %s (%s vs %s)" % (
        track_this_error_counter, collected_bad_disks,
        timegenerated1, timegenerated2)
    exit_status = CRITICAL

print(exit_message)
sys.exit(exit_status)
def main():
    """Launch a test executable on groups of nodes via srun and report results.

    Candidate nodes come from ``get_all_nodes`` (optionally restricted by
    ``--state``), intersected with ``--partition`` and ``--nodelist`` when
    given. They are split into groups of ``--nodes`` nodes and one srun job
    is started per group. The jobs are polled until all have finished; each
    group's nodes are classified as pass, fail or unknown by the job's return
    code and the three sets are printed as collected hostlists.
    """
    parser = argparse.ArgumentParser(
        description='Launch groups of test scripts with srun.')
    parser.add_argument('--debug', action='store_true',
                        help='Enable debug loggibng (default: info)')
    parser.add_argument(
        '--partition',
        help='Specify a partition for node selection and job submission')
    parser.add_argument('--mpi', help='Enable MPI support for srun')
    parser.add_argument(
        '--ntasks', help='Specify the number of tasks to run for each job')
    parser.add_argument(
        '--ntasks-per-node',
        help='Specify the number of tasks to run on each node for each job')
    parser.add_argument('--state', action='append',
                        help='Specify valid node states for node selection')
    parser.add_argument('--account',
                        help='Specify the account to use during srun')
    parser.add_argument('--chdir', help='Specify a directory to use for srun')
    parser.add_argument('--time',
                        help='Specify a job runtime to use for srun')
    parser.add_argument('--nodelist',
                        help='Specify a node list to use for node selection')
    parser.add_argument('--nodes', type=int, default=1,
                        help='Specify the number of nodes to use for srun')
    parser.add_argument(
        '--bcast', nargs='?', const=True,
        help='Copy executable file to compute nodes during srun')
    parser.add_argument('--exclusive', action='store_true',
                        help='Exclusive use of compute nodes during srun')
    parser.add_argument('--timeout', type=int,
                        help='Terminate jobs after a timeout (seconds)')
    parser.add_argument('executable', help='Executable to execute')
    parser.add_argument('executable_arguments', nargs='*',
                        help='Arguments for the test executable')
    args = parser.parse_args()

    log_level = logging.DEBUG if args.debug else logging.INFO
    logging.basicConfig(level=log_level)

    # Node selection: start from all nodes and narrow down.
    all_nodes = set(get_all_nodes(states=args.state))
    if args.partition:
        all_nodes &= set(get_partition_nodes(args.partition))
    if args.nodelist:
        all_nodes &= set(hostlist.expand_hostlist(args.nodelist))
    all_nodes = sorted(all_nodes)

    # One srun job per group of nodes.
    jobs = [
        (nodelist, srun(args.executable,
                        args.executable_arguments,
                        partition=args.partition,
                        nodelist=','.join(nodelist),
                        ntasks=args.ntasks,
                        account=args.account,
                        chdir=args.chdir,
                        time=args.time,
                        bcast=args.bcast,
                        exclusive=args.exclusive,
                        nodes=args.nodes,
                        ntasks_per_node=args.ntasks_per_node,
                        mpi=args.mpi))
        for nodelist in split_nodes(all_nodes, args.nodes)
    ]
    all_jobs = set(job for _, job in jobs)

    start_time = time.time()
    completed_jobs = set()
    pass_ = set()
    fail = set()
    unknown = set()

    while True:
        new_completed_jobs = []
        for nodelist, job in jobs:
            job.poll()
            if job.returncode is not None:
                if job in completed_jobs:
                    # Already reported; in particular never treat a finished
                    # job as timed out.
                    continue
                new_completed_jobs.append((nodelist, job))
                if job.returncode == PASS:
                    pass_ |= set(nodelist)
                elif job.returncode == FAIL:
                    fail |= set(nodelist)
                else:
                    unknown |= set(nodelist)
            elif (args.timeout is not None
                  and time.time() - start_time > args.timeout):
                # Still running past the deadline: ask it to stop. Its return
                # code is picked up by a later poll.
                logger.warning('{0}: {1}'.format(','.join(nodelist),
                                                 'timeout'))
                job.terminate()

        for nodelist, job in new_completed_jobs:
            prefix = ','.join(nodelist)
            for line in job.stderr:
                logger.debug('{0}: {1}'.format(prefix, line.rstrip()))
            for line in job.stdout:
                # Only the first line of stdout is logged.
                logger.info('{0}: {1}'.format(prefix, line.rstrip()))
                break
            completed_jobs.add(job)

        if completed_jobs == all_jobs:
            break
        time.sleep(POLLING_INTERVAL)

    if pass_:
        print('pass:', hostlist.collect_hostlist(pass_))
    if fail:
        print('fail:', hostlist.collect_hostlist(fail))
    if unknown:
        print('unknown:', hostlist.collect_hostlist(unknown))
def rain_command(arguments):
    """
    Usage:
        rain -h | --help
        rain --version
        rain admin add [LABEL] --file=FILE
        rain admin baremetals
        rain admin on HOSTS
        rain admin off HOSTS
        rain admin [-i] delete HOSTS
        rain admin [-i] rm HOSTS
        rain admin list users [--merge]
        rain admin list projects [--merge]
        rain admin list roles
        rain admin list hosts [--user=USERS|--project=PROJECTS|--role=ROLE]
                              [--start=TIME_START]
                              [--end=TIME_END]
                              [--format=FORMAT]
        rain admin policy [--user=USERS|--project=PROJECTS|--role=ROLE]
                          (-l HOSTS|-n COUNT)
                          [--start=TIME_START]
                          [--end=TIME_END]
        rain user list [--project=PROJECTS] [HOSTS]
        rain user list hosts [--start=TIME_START]
                             [--end=TIME_END]
                             [--format=FORMAT]
        rain status [--short|--summary][--kind=KIND] [HOSTS]
        rain provision --profile=PROFILE HOSTS
        rain provision list [--type=TYPE] (--distro=DISTRO|--kickstart=KICKSTART)
        rain provision --distro=DITRO --kickstart=KICKSTART HOSTS
        rain provision add (--distro=URL|--kickstart=KICk_CONTENT) NAME
        rain provision power [--off] HOSTS
        rain provision monitor HOSTS

    Arguments:
        HOSTS     the list of hosts passed
        LABEL     the label of a host
        COUNT     the count of the bare metal provisioned hosts
        KIND      the kind
        TYPE      the type of profile or server

    Options:
        -n COUNT     count of teh bare metal hosts to be provisined
        -p PROJECTS  --projects=PROJECTS
        -u USERS     --user=USERS        Specify users
        -f FILE, --file=FILE  file to be specified
        -i           interactive mode adds a yes/no
                     question for each host specified
        --role=ROLE            Specify predefined role
        --start=TIME_START     Start time of the reservation, in
                               YYYY/MM/DD HH:MM:SS format. [default: current_time]
        --end=TIME_END         End time of the reservation, in
                               YYYY/MM/DD HH:MM:SS format. In addition a duration
                               can be specified if the + sign is the first sign.
                               The duration will than be added to
                               the start time. [default: +1d]
        --kind=KIND            Format of the output -png, jpg, pdf. [default:png]
        --format=FORMAT        Format of the output json, cfg. [default:json]
        --type=TYPE            Format of the output profile, server. [default:server]
    """
    # HOSTS, USERS and PROJECTS are deliberately kept as raw hostlist strings
    # (not expanded up front) because the policy calls need the raw form.
    # Branches that need individual names expand them locally.

    wrapper = RainCobblerWrapper()

    def accessible_hosts(raw_hosts):
        """Return the hosts the user may access, reporting the others."""
        all_hosts = filtered_access_hosts(raw_hosts)
        unaccess_hosts = all_hosts["unaccess"]
        if unaccess_hosts:
            print(
                "You can NOT access these hosts: {0}, please contact your admin."
                .format(hostlist.collect_hostlist(unaccess_hosts)))
        return all_hosts["access"]

    if arguments["admin"]:
        if arguments["add"]:
            print("add")
            if arguments["LABEL"] is not None:
                # rain admin add LABEL --file=FILE
                print((arguments["LABEL"]))
                print((arguments["--file"]))
                not_implemented()
            else:
                # rain admin add --file=FILE
                print((arguments["--file"]))
                not_implemented()
        elif arguments["baremetals"]:
            # rain admin baremetals
            print("list all baremetals")
            result = wrapper.baremetal_computer_host_list()
            print(result if result else "No Baremetals")
        elif arguments["on"]:
            # rain admin on HOSTS
            print("switch on")
            print((arguments["HOSTS"]))
            result = wrapper.baremetal_computer_host_on(arguments["HOSTS"])
            print("success" if result else "failed")
        elif arguments["off"]:
            # rain admin off HOSTS
            print("switch off")
            print((arguments["HOSTS"]))
            result = wrapper.baremetal_computer_host_off(arguments["HOSTS"])
            print("success" if result else "failed")
        elif arguments["delete"] or arguments["rm"]:
            # rain admin [-i] delete HOSTS
            # rain admin [-i] rm HOSTS
            interactive = arguments["-i"]
            print("delete", interactive)
            try:
                ask = raw_input  # Python 2
            except NameError:
                ask = input  # Python 3
            # HOSTS is a raw hostlist string; expand it so the question is
            # asked once per host rather than once per character.
            for host in hostlist.expand_hostlist(arguments["HOSTS"]):
                if interactive:
                    answer = ask(
                        "Do you want to delete the host %s? (y)es/(n)o: " % host)
                    if answer in ["yes", "y", "Y", "YES"]:
                        print("delete %s" % host)
                    else:
                        print("keeping %s" % host)
            not_implemented()
        elif arguments["list"]:
            if arguments["users"]:
                print("list users")
                flag_merge = arguments["--merge"]
                policys = wrapper.list_all_user_group_hosts(True, flag_merge)
                print_policys(policys, flag_merge)
            elif arguments["projects"]:
                print("list projects")
                flag_merge = arguments["--merge"]
                policys = wrapper.list_all_user_group_hosts(False, flag_merge)
                print_policys(policys, flag_merge)
            elif arguments["roles"]:
                print("list roles")
                not_implemented()
            elif arguments["hosts"]:
                print("list hosts")
                # --start/--end are accepted but not evaluated here.
                if arguments["--user"] is not None:
                    policys = wrapper.list_user_hosts(arguments["--user"])
                    print_policys(policys)
                elif arguments["--project"] is not None:
                    policys = wrapper.list_project_hosts(
                        arguments["--project"])
                    print_policys(policys)
                elif arguments["--role"] is not None:
                    not_implemented()
                else:
                    print("all users, projects, roles")
                    not_implemented()
        elif arguments["policy"]:
            print("policy")
            # --start/--end are accepted but not evaluated here.
            if arguments["--user"] is not None:
                policy_id = wrapper.add_user_policy(arguments["--user"],
                                                    arguments["HOSTS"])
                print("success" if policy_id else "failed")
            elif arguments["--project"] is not None:
                policy_id = wrapper.add_project_policy(arguments["--project"],
                                                       arguments["HOSTS"])
                print("success" if policy_id else "failed")
            elif arguments["--role"] is not None:
                not_implemented()
            else:
                print("all users, projects, roles")
                not_implemented()
        elif arguments["list"]:
            # NOTE: unreachable, the earlier ``list`` branch always wins.
            print("list")
            not_implemented()
    elif arguments["status"]:
        print("status")
        if arguments["--short"]:
            status_dict = wrapper.get_status_short(arguments["HOSTS"])
            if status_dict:
                for host in sorted(status_dict.keys()):
                    print("{0:16}\t{1}".format(host, status_dict[host]))
            else:
                print("Empty")
        if arguments["--summary"]:
            status_dict = wrapper.get_status_summary(arguments["HOSTS"])
            if status_dict:
                for deploy_status in ["deployed", "deploying", "failed",
                                      "total"]:
                    print("{0:16}\t{1}".format(deploy_status,
                                               status_dict[deploy_status]))
    elif arguments["user"]:
        if arguments["list"]:
            print("user list")
            (time_start, time_end) = parse_time_interval(arguments["--start"],
                                                         arguments["--end"])
            print("From:", time_start)
            print("To :", time_end)
            not_implemented()
    ###
    # provisioning
    ###
    elif arguments["provision"]:
        if arguments["list"]:
            if arguments["--type"] == "profile":
                print(
                    "this will list profiles based on distro or kickstart info"
                )
                profiles = wrapper.list_profile_based_distro_kickstart(
                    arguments["--distro"], arguments["--kickstart"])
                print("matched profiles: {0}".format(profiles))
            else:
                print(
                    "this will list servers based on distro or kickstart info")
                servers = wrapper.list_system_based_distro_kickstart(
                    arguments["--distro"], arguments["--kickstart"])
                print("matched servers: {0}".format(servers))
        elif arguments["add"]:
            print("add a new distro or kickstart")
            not_implemented()
        elif arguments["power"]:
            print("power ON/OFF a host...")
            # Only act on hosts the user can access; without HOSTS there is
            # nothing to do.
            if arguments["HOSTS"]:
                access_hosts = accessible_hosts(arguments["HOSTS"])
                result = wrapper.power_host(access_hosts,
                                            not arguments["--off"])
                power_hosts = [h for h in sorted(result.keys()) if result[h]]
                unknown_hosts = [
                    h for h in sorted(result.keys()) if not result[h]
                ]
                if unknown_hosts:
                    # Collect exactly once: collecting the already collected
                    # string would treat each of its characters as a host.
                    print("unknow hosts, must deploy first: ",
                          hostlist.collect_hostlist(unknown_hosts))
                if power_hosts:
                    print(
                        "call [rain provision monitor {0}] to monitor power progress."
                        .format(hostlist.collect_hostlist(power_hosts)))
        elif arguments["monitor"]:
            print("monitor progress of a host...")
            if arguments["HOSTS"]:
                access_hosts = accessible_hosts(arguments["HOSTS"])
                result = wrapper.monitor_host(access_hosts)
                # Report the hosts grouped by status, in a fixed order.
                for status in ("deploy", "poweron", "poweroff", "failed",
                               "unknown"):
                    status_hosts = [
                        h for h in sorted(result.keys())
                        if result[h]["status"] == status
                    ]
                    print_progress(status, status_hosts, result)
        else:
            if arguments["HOSTS"]:
                access_hosts = accessible_hosts(arguments["HOSTS"])
                if arguments["--profile"]:
                    if access_hosts:
                        wrapper.provision_host_with_profile(
                            arguments["--profile"], access_hosts)
                        print(
                            "call [rain provision monitor {0}] to monitor depoy progress."
                            .format(hostlist.collect_hostlist(access_hosts)))
                elif arguments["--distro"] and arguments["--kickstart"]:
                    if access_hosts:
                        wrapper.provision_host_with_distro_kickstart(
                            arguments["--distro"], arguments["--kickstart"],
                            access_hosts)
                        print(
                            "call [rain provision monitor {0}] to monitor deploy progress."
                            .format(hostlist.collect_hostlist(access_hosts)))
def simple_list(id=None, format="table"):
    """List one cluster or all clusters known to Comet.

    :param id: name of a single cluster; None lists all clusters
    :param format: "rest" returns the data as received; any other value is
        passed to ``dict_printer`` as output format
    :return: the printed table (or the raw data for "rest"); an empty string
        when nothing was found
    :raises ValueError: if the response carries an ``error`` entry
    """
    result = ""
    if id is None:
        clusters = Comet.get(Comet.url("cluster/"))
    else:
        single = Comet.get(Comet.url("cluster/" + id + "/"))
        if single is None:
            Console.error("Could not find cluster `{}`".format(id))
            return result
        # Normalise to a list so both cases are handled alike below.
        clusters = [single]

    if clusters is None:
        return result

    # An error may be reported by the response itself or by its first entry.
    failure = None
    if 'error' in clusters:
        failure = clusters
    elif 'error' in clusters[0]:
        failure = clusters[0]
    if failure is not None:
        Console.error("An error occurred: {error}".format(**failure))
        raise ValueError("COMET Error")

    if format == "rest":
        return clusters

    elements = {}
    for cluster in clusters:
        element = {
            attribute: cluster[attribute]
            for attribute in ["project", "name", "description"]
        }
        element["nodes"] = len(cluster["computes"])
        for attribute, value in cluster["frontend"].items():
            element["frontend " + attribute] = value
        compute_names = [compute["name"] for compute in cluster["computes"]]
        element["computes"] = hostlist.collect_hostlist(compute_names)
        elements[cluster["name"]] = element

    result = dict_printer(
        elements,
        order=[
            "name",
            "project",
            "nodes",
            "computes",
            "frontend name",
            "frontend state",
            "frontend type",
            "description",
        ],
        header=[
            "Name",
            "Project",
            "Count",
            "Nodes",
            "Frontend (Fe)",
            "State (Fe)",
            "Type (Fe)",
            "Description",
        ],
        output=format)
    return result
def rain_command(arguments):
    """
    Usage:
        rain -h | --help
        rain --version
        rain admin add [LABEL] --file=FILE
        rain admin baremetals
        rain admin on HOSTS
        rain admin off HOSTS
        rain admin [-i] delete HOSTS
        rain admin [-i] rm HOSTS
        rain admin list users [--merge]
        rain admin list projects [--merge]
        rain admin list roles
        rain admin list hosts [--user=USERS|--project=PROJECTS|--role=ROLE]
                              [--start=TIME_START]
                              [--end=TIME_END]
                              [--format=FORMAT]
        rain admin policy [--user=USERS|--project=PROJECTS|--role=ROLE]
                          (-l HOSTS|-n COUNT)
                          [--start=TIME_START]
                          [--end=TIME_END]
        rain user list [--project=PROJECTS] [HOSTS]
        rain user list hosts [--start=TIME_START]
                        [--end=TIME_END]
                        [--format=FORMAT]
        rain status [--short|--summary][--kind=KIND] [HOSTS]
        rain provision --profile=PROFILE HOSTS
        rain provision list [--type=TYPE] (--distro=DISTRO|--kickstart=KICKSTART)
        rain provision --distro=DITRO --kickstart=KICKSTART HOSTS
        rain provision add (--distro=URL|--kickstart=KICk_CONTENT) NAME
        rain provision power [--off] HOSTS
        rain provision monitor HOSTS

    Arguments:
        HOSTS     the list of hosts passed
        LABEL     the label of a host
        COUNT     the count of the bare metal provisioned hosts
        KIND      the kind
        TYPE      the type of profile or server

    Options:
        -n COUNT     count of teh bare metal hosts to be provisined
        -p PROJECTS  --projects=PROJECTS
        -u USERS     --user=USERS        Specify users
        -f FILE, --file=FILE  file to be specified
        -i           interactive mode adds a yes/no
                     question for each host specified
        --role=ROLE            Specify predefined role
        --start=TIME_START     Start time of the reservation, in
                               YYYY/MM/DD HH:MM:SS format. [default: current_time]
        --end=TIME_END         End time of the reservation, in
                               YYYY/MM/DD HH:MM:SS format. In addition a duration
                               can be specified if the + sign is the first sign.
                               The duration will than be added to
                               the start time. [default: +1d]
        --kind=KIND            Format of the output -png, jpg, pdf. [default:png]
        --format=FORMAT        Format of the output json, cfg. [default:json]
        --type=TYPE            Format of the output profile, server. [default:server]

    """
    # NOTE(review): the docstring above looks like a docopt usage
    # specification that is parsed at runtime, so its wording is left
    # untouched -- confirm before editing it.
    #
    # `arguments` is indexed by the command words and option names that
    # appear in that usage text; this function only dispatches on them.
    #
    # comment by H. C: the HOSTS/USERS/PROJECTS values are deliberately NOT
    # expanded with hostlist.expand_hostlist() here, because the policy
    # commands need the raw list.
    wrapper = RainCobblerWrapper()

    if arguments["admin"]:
        _rain_admin(arguments, wrapper)
    elif arguments["status"]:
        _rain_status(arguments, wrapper)
    elif arguments["user"]:
        _rain_user(arguments)
    elif arguments["provision"]:
        _rain_provision(arguments, wrapper)


def _rain_admin(arguments, wrapper):
    """Dispatch the ``rain admin ...`` sub-commands."""
    if arguments["add"]:
        # Covers both `admin add LABEL --file=FILE` and `admin add --file=FILE`.
        print("add")
        if arguments["LABEL"] is not None:
            print(arguments["LABEL"])
        print(arguments["--file"])
        not_implemented()
    elif arguments["baremetals"]:
        print("list all baremetals")
        result = wrapper.baremetal_computer_host_list()
        print(result if result else "No Baremetals")
    elif arguments["on"]:
        print("switch on")
        print(arguments["HOSTS"])
        result = wrapper.baremetal_computer_host_on(arguments["HOSTS"])
        print("success" if result else "failed")
    elif arguments["off"]:
        print("switch off")
        print(arguments["HOSTS"])
        result = wrapper.baremetal_computer_host_off(arguments["HOSTS"])
        print("success" if result else "failed")
    elif arguments["delete"] or arguments["rm"]:
        interactive = arguments["-i"]
        print("delete", interactive)
        # NOTE(review): if HOSTS is still the raw host-list string this loop
        # walks over its characters, not over host names -- confirm.
        for host in arguments["HOSTS"]:
            if interactive:
                # NOTE(review): raw_input only exists on Python 2.
                answer = raw_input(
                    "Do you want to delete the host %s? (y)es/(n)o: " % host)
                if answer in ["yes", "y", "Y", "YES"]:
                    print("delete %s" % host)
                else:
                    print("keeping %s" % host)
        not_implemented()
    elif arguments["list"]:
        _rain_admin_list(arguments, wrapper)
    elif arguments["policy"]:
        _rain_admin_policy(arguments, wrapper)


def _rain_admin_list(arguments, wrapper):
    """Handle ``rain admin list users|projects|roles|hosts``."""
    if arguments["users"]:
        print("list users")
        flag_merge = arguments["--merge"]
        policys = wrapper.list_all_user_group_hosts(True, flag_merge)
        print_policys(policys, flag_merge)
    elif arguments["projects"]:
        print("list projects")
        flag_merge = arguments["--merge"]
        policys = wrapper.list_all_user_group_hosts(False, flag_merge)
        print_policys(policys, flag_merge)
    elif arguments["roles"]:
        print("list roles")
        not_implemented()
    elif arguments["hosts"]:
        print("list hosts")
        # comment by H. C: the --start/--end interval is not evaluated here.
        if arguments["--user"] is not None:
            policys = wrapper.list_user_hosts(arguments["--user"])
            print_policys(policys)
        elif arguments["--project"] is not None:
            policys = wrapper.list_project_hosts(arguments["--project"])
            print_policys(policys)
        elif arguments["--role"] is not None:
            not_implemented()
        else:
            print("all users, projects, roles")
            not_implemented()


def _rain_admin_policy(arguments, wrapper):
    """Handle ``rain admin policy``: attach HOSTS to a user or a project."""
    print("policy")
    # comment by H. C: the --start/--end interval is not evaluated here.
    if arguments["--user"] is not None:
        policy_id = wrapper.add_user_policy(
            arguments["--user"], arguments["HOSTS"])
        print("success" if policy_id else "failed")
    elif arguments["--project"] is not None:
        policy_id = wrapper.add_project_policy(
            arguments["--project"], arguments["HOSTS"])
        print("success" if policy_id else "failed")
    elif arguments["--role"] is not None:
        not_implemented()
    else:
        print("all users, projects, roles")
        not_implemented()


def _rain_status(arguments, wrapper):
    """Handle ``rain status`` with its ``--short`` / ``--summary`` views."""
    print("status")
    if arguments["--short"]:
        status_dict = wrapper.get_status_short(arguments["HOSTS"])
        if status_dict:
            for host in sorted(status_dict.keys()):
                print("{0:16}\t{1}".format(host, status_dict[host]))
        else:
            print("Empty")
    if arguments["--summary"]:
        status_dict = wrapper.get_status_summary(arguments["HOSTS"])
        if status_dict:
            for deploy_status in ["deployed", "deploying", "failed", "total"]:
                print("{0:16}\t{1}".format(deploy_status,
                                           status_dict[deploy_status]))


def _rain_user(arguments):
    """Handle ``rain user list`` (prints the time interval; otherwise a stub)."""
    if arguments["list"]:
        print("user list")
        (time_start, time_end) = parse_time_interval(arguments["--start"],
                                                     arguments["--end"])
        print("From:", time_start)
        print("To :", time_end)
        not_implemented()


def _accessible_hosts(hosts):
    """Return the accessible subset of ``hosts``, reporting the others."""
    all_hosts = filtered_access_hosts(hosts)
    unaccess_hosts = all_hosts["unaccess"]
    if unaccess_hosts:
        print("You can NOT access these hosts: {0}, please contact your admin."
              .format(hostlist.collect_hostlist(unaccess_hosts)))
    return all_hosts["access"]


def _rain_provision(arguments, wrapper):
    """Dispatch the ``rain provision ...`` sub-commands."""
    if arguments["list"]:
        if arguments["--type"] == "profile":
            print("this will list profiles based on distro or kickstart info")
            profiles = wrapper.list_profile_based_distro_kickstart(
                arguments["--distro"], arguments["--kickstart"])
            print("matched profiles: {0}".format(profiles))
        else:
            print("this will list servers based on distro or kickstart info")
            servers = wrapper.list_system_based_distro_kickstart(
                arguments["--distro"], arguments["--kickstart"])
            print("matched servers: {0}".format(servers))
        return

    if arguments["add"]:
        print("add a new distro or kickstart")
        not_implemented()
        return

    if arguments["power"]:
        print("power ON/OFF a host...")
    elif arguments["monitor"]:
        print("monitor progress of a host...")

    # Every remaining sub-command acts on HOSTS. Without hosts there is no
    # access list to work with, so stop instead of reading an unset name.
    if not arguments["HOSTS"]:
        return
    access_hosts = _accessible_hosts(arguments["HOSTS"])

    if arguments["power"]:
        result = wrapper.power_host(access_hosts, not arguments["--off"])
        power_hosts = [h for h in sorted(result.keys()) if result[h]]
        unknown_hosts = [h for h in sorted(result.keys()) if not result[h]]
        if unknown_hosts:
            # collect_hostlist() must be applied once: feeding its string
            # result back in would treat every character as a host name.
            print("unknow hosts, must deploy first: ",
                  hostlist.collect_hostlist(unknown_hosts))
        if power_hosts:
            print("call [rain provision monitor {0}] to monitor power progress."
                  .format(hostlist.collect_hostlist(power_hosts)))
    elif arguments["monitor"]:
        result = wrapper.monitor_host(access_hosts)
        # Report the groups in the fixed order deploy, poweron, poweroff,
        # failed, unknown.
        for status in ("deploy", "poweron", "poweroff", "failed", "unknown"):
            status_hosts = [h for h in sorted(result.keys())
                            if result[h]["status"] == status]
            print_progress(status, status_hosts, result)
    elif arguments["--profile"]:
        if access_hosts:
            wrapper.provision_host_with_profile(
                arguments["--profile"], access_hosts)
            print("call [rain provision monitor {0}] to monitor depoy progress."
                  .format(hostlist.collect_hostlist(access_hosts)))
    elif arguments["--distro"] and arguments["--kickstart"]:
        if access_hosts:
            wrapper.provision_host_with_distro_kickstart(
                arguments["--distro"], arguments["--kickstart"], access_hosts)
            print("call [rain provision monitor {0}] to monitor deploy progress."
                  .format(hostlist.collect_hostlist(access_hosts)))
def collect_eq(self, hostlist, expanded_list):
    """Assert that collecting ``expanded_list`` yields ``hostlist``.

    The expected host-list string comes first and the expanded list second.
    This argument order is deliberate: it makes it easier to copy test
    cases between the expand and the collect parts of the suite.
    """
    collected = collect_hostlist(expanded_list)
    self.assertEqual(hostlist, collected)
exclude_states = ['down', 'draining', 'drained'] node_list_filename = os.path.join(directory, 'node_list') logger.info('creating {0}'.format(node_list_filename)) all_nodes = bench.util.get_nodes() node_list = bench.util.filter_node_list(all_nodes, include_states=include_states, exclude_states=exclude_states, **kwargs) logger.info('nodes to test: {0}'.format(len(node_list))) try: bench.util.write_node_list(node_list_filename, sorted(node_list)) except IOError, ex: logger.error('unable to write {0}'.format(node_list_filename)) logger.debug(ex, exc_info=True) error_nodes_filename = os.path.join(directory, 'error_nodes') error_nodes = all_nodes - node_list if error_nodes: logger.warn('error nodes: {0} ({1} nodes)'.format( hostlist.collect_hostlist(error_nodes), len(error_nodes), )) try: bench.util.write_node_list(error_nodes_filename, sorted(error_nodes)) except IOError, ex: logger.error('unable to write {0}'.format(error_nodes_filename)) logger.debug(ex, exc_info=True)
def simple_list(id=None, format="table"):
    """List one Comet cluster, or all of them, in the requested format.

    Args:
        id: Name of the cluster to query. ``None`` queries ``cluster/``
            instead of ``cluster/<id>/``.
        format: ``"rest"`` returns the response data unchanged (a single
            cluster is wrapped in a one-element list). Any other value is
            forwarded to ``Printer.write`` as its ``output`` argument.

    Returns:
        ``""`` when the request produced nothing, the response data for
        ``format == "rest"``, otherwise the value returned by
        ``Printer.write``.

    Raises:
        ValueError: If the response (or its first entry) contains an
            ``error`` key.
    """
    result = ""
    if id is None:
        r = Comet.get(Comet.url("cluster/"))
    else:
        r = Comet.get(Comet.url("cluster/" + id + "/"))
        if r is None:
            Console.error("Could not find cluster `{}`".format(id))
            return result
        # Normalise the single-cluster answer to the list shape used below.
        r = [r]

    if r is None:
        return result

    if 'error' in r:
        Console.error("An error occurred: {error}".format(**r))
        raise ValueError("COMET Error")
    elif r and 'error' in r[0]:
        # `r and ...` keeps an empty cluster list from raising IndexError.
        Console.error("An error occurred: {error}".format(**r[0]))
        raise ValueError("COMET Error")

    if format == "rest":
        return r

    elements = {}
    for cluster in r:
        element = {
            attribute: cluster[attribute]
            for attribute in ["project", "name", "description"]
        }
        element["nodes"] = len(cluster["computes"])
        for attribute, value in cluster["frontend"].items():
            element["frontend " + attribute] = value
        # Fold the compute node names into one compact host-list expression.
        names = [compute["name"] for compute in cluster["computes"]]
        element["computes"] = hostlist.collect_hostlist(names)
        elements[cluster["name"]] = element

    result = Printer.write(
        elements,
        order=[
            "name",
            "project",
            "nodes",
            "computes",
            "frontend name",
            "frontend state",
            "frontend type",
            "description",
        ],
        header=[
            "Name",
            "Project",
            "Count",
            "Nodes",
            "Frontend (Fe)",
            "State (Fe)",
            "Type (Fe)",
            "Description",
        ],
        output=format,
    )
    return result
def main(argv):
    """Build the node/user report for one Slurm partition and emit it.

    Options parsed from ``argv``:
        -h       show the usage text and exit
        -n       notify mode: compare the new state with the one stored in
                 the state file, exit silently when it is unchanged,
                 otherwise store it and mail the report
        -p NAME  partition to inspect (default ``pubint``)

    Without ``-n`` the report is printed to standard output.

    Args:
        argv: Command-line arguments without the program name.
    """
    # Default slurm configuration
    slurm_conf = "/etc/slurm/slurm.conf"
    int_part_name = "pubint"
    state_file = "/tmp/grabby.dat"
    gstate = ["red", "yellow", "green"]
    slurm_node_dict = {}
    node_dict = {}
    user_dict = {}

    try:
        opts, _args = getopt.getopt(argv, "np:h")
    except getopt.GetoptError:
        usage()
        sys.exit()

    notify = False
    for opt, arg in opts:
        # Compare with ==: the former `opt in ("-h")` was a substring test
        # against a plain string, because ("-h") is not a tuple.
        if opt == "-h":
            usage()
            sys.exit()
        elif opt == "-n":
            notify = True
        elif opt == "-p":
            int_part_name = arg

    int_part_list = parse_slurm_conf(slurm_conf, int_part_name, slurm_node_dict)
    if not int_part_list:
        # %-formatting prints identically under Python 2 and Python 3.
        print("Error: no members found in partition %s" % int_part_name)
        return

    # Getting loads: each entry is [cores, load, cores].
    for node in int_part_list:
        cores = slurm_node_dict[node]
        node_dict[node] = [cores, get_node_load(node), cores]

    # Getting users
    parse_squeue(node_dict, user_dict, int_part_name)
    free_nodes = count_free_nodes(node_dict)

    if notify:
        last_state = read_state(state_file)
        new_state = nodes_to_state(free_nodes)
        if new_state == last_state:
            # Nothing changed since the last run: stay quiet.
            sys.exit()
        write_state(state_file, new_state)

    # Both tables are sorted by their first numeric column, largest first.
    lines = ["Node, Free Cores, Load\n"]
    for name, info in sorted(node_dict.items(),
                             key=lambda item: item[1][0], reverse=True):
        lines.append("%s, %d, %.2f\n" % (name, info[0], info[1]))
    lines.append("\n")
    lines.append("User, Cores, Node(s)\n")
    for name, info in sorted(user_dict.items(),
                             key=lambda item: item[1][0], reverse=True):
        lines.append("%s, %d, %s\n"
                     % (name, info[0], hostlist.collect_hostlist(info[1])))
    output = "".join(lines)

    if notify:
        # NOTE(review): `recipients` is not defined in this function;
        # presumably a module-level name -- verify.
        mailing_list(recipients,
                     "Grabnode State Change: " + gstate[new_state], output)
    else:
        print(output)
async def collect_periodically(conf, result_queue):
    """Poll sensor data in an endless loop and queue one value per metric.

    Each round fetches sensor data for ``conf['active_hosts']`` (when that
    collection is non-empty), walks every host in ``conf['hosts']`` and
    every metric in ``conf['metrics']``, and puts a
    ``(metric_name, ts, value)`` tuple on ``result_queue``. Rounds are
    spaced ``conf['interval']`` apart using an absolute deadline.

    Keys read from ``conf``: ``interval``, ``active_hosts``, ``username``,
    ``password``, ``record_ids``, ``hosts``, ``metrics`` and, optionally,
    ``plugin``.

    NOTE(review): the nesting below was reconstructed from a
    whitespace-collapsed source -- confirm it against the original file.
    """
    deadline = time.time() + conf['interval']
    while True:
        # Fallback timestamp; replaced by the one returned with the data.
        ts = metricq.Timestamp.now()
        if conf['active_hosts']:
            ts, data = await get_sensor_data_dict(
                conf['active_hosts'],
                conf['username'],
                conf['password'],
                conf['record_ids'],
            )
        # Hosts handed to try_fix_hosts() at the end of this round.
        hosts_to_fix = set()
        for host, host_info in conf['hosts'].items():
            for metric_sufix, metric_data in conf['metrics'].items():
                # NaN is what gets queued unless a value is found below.
                value = NaN
                if host_info['status'] == Status.ACTIVE:
                    sensors = {}
                    for sensor in metric_data['sensors']:
                        try:
                            sensors[sensor] = data[sensor][host]
                        except KeyError:
                            # No reading for this sensor/host: take the host
                            # out of the active set, mark it as ERROR and
                            # schedule it for a fix attempt.
                            if host in conf['active_hosts']:
                                conf['active_hosts'].remove(host)
                            conf['hosts'][host]['status'] = Status.ERROR
                            hosts_to_fix.add(host)
                            # Emptied so that the check below skips this
                            # metric for this host.
                            sensors = {}
                            break
                    if not sensors:
                        # Nothing is queued for this host/metric this round.
                        continue
                    if 'plugin' in conf:
                        try:
                            value = conf['plugin'].create_metric_value(sensors)
                        except Exception as e:
                            # A failing plugin is logged; value stays NaN.
                            logger.error(
                                'Error in plugin, Exception: {0}'.format(e, ))
                    else:
                        # Without a plugin the first listed sensor is used.
                        value = sensors[metric_data['sensors'][0]]['value']
                elif host_info['status'] == Status.ERROR:
                    # Schedule another fix attempt once 'next_try' has passed.
                    if not host in hosts_to_fix and time.time(
                    ) > host_info['next_try']:
                        hosts_to_fix.add(host)
                metric_name = '{}.{}'.format(
                    host_info['host_name'],
                    metric_sufix,
                )
                result_queue.put((metric_name, ts, value))
        if hosts_to_fix:
            await try_fix_hosts(
                conf,
                hosts_to_fix,
            )
        # Skip over every interval that has already elapsed, warning once
        # per missed deadline.
        while deadline <= time.time():
            logging.warning(
                'missed deadline in {}'.format(
                    hostlist.collect_hostlist(conf['hosts'].keys()),
                ),
            )
            deadline += conf['interval']
        sleep_var = deadline - time.time()
        await asyncio.sleep(sleep_var)
        deadline += conf['interval']
k = random.choice(list(ci_nnodes.keys())) while ci_nnodes[k][1] == 0: k = random.choice(list(ci_nnodes.keys())) ci_nnodes[k][1] += rest_mc print(ci_nnodes) start_gpu = 0 start_mc = 0 ci_nodelist = dict() for k in ci_nnodes: n_gpu, n_mc = ci_nnodes[k] nodelist_gpu = [] nodelist_mc = [] if n_gpu > 0: nodelist_gpu = exp_gpu[start_gpu:start_gpu + n_gpu] start_gpu += n_gpu if n_mc > 0: nodelist_mc = exp_mc[start_mc:start_mc + n_mc] start_mc += n_mc #print(len(nodelist_gpu),'==',n_gpu) #print(len(nodelist_mc),'==',n_mc) ci_nodelist[k] = [nodelist_gpu, nodelist_mc] for k in ci_nodelist: nodelist_gpu, nodelist_mc = ci_nodelist[k] final_nodelist = collect_hostlist(nodelist_gpu + nodelist_mc) file_name = 'ci_' + k + '_node_resume.lst' f = open(file_name, 'w') f.write(final_nodelist) f.close()
def main(argv):
    """Report free cores, load and per-user usage of a Slurm partition.

    Recognised options in ``argv``:
        -h       print the usage text and exit
        -n       notify mode: mail the report, but only when the computed
                 state differs from the one saved in the state file
        -p NAME  name of the partition to inspect (default ``pubint``)

    When ``-n`` is absent the report goes to standard output.

    Args:
        argv: Command-line arguments, program name excluded.
    """
    # Default slurm configuration
    slurm_conf = "/etc/slurm/slurm.conf"
    int_part_name = "pubint"
    state_file = "/tmp/grabby.dat"
    gstate = ["red", "yellow", "green"]
    slurm_node_dict = {}
    node_dict = {}
    user_dict = {}

    try:
        opts, _unused = getopt.getopt(argv, "np:h")
    except getopt.GetoptError:
        usage()
        sys.exit()

    notify = False
    for flag, value in opts:
        # `flag in ("-h")` used to be a substring check on the string "-h"
        # (parentheses alone do not make a tuple); test equality instead.
        if flag == "-h":
            usage()
            sys.exit()
        elif flag == "-n":
            notify = True
        elif flag == "-p":
            int_part_name = value

    int_part_list = parse_slurm_conf(slurm_conf, int_part_name,
                                     slurm_node_dict)
    if not int_part_list:
        # Single pre-formatted argument: same text on Python 2 and 3.
        print("Error: no members found in partition %s" % int_part_name)
        return

    # Getting loads: every node starts as [cores, load, cores].
    for node in int_part_list:
        cores = slurm_node_dict[node]
        node_dict[node] = [cores, get_node_load(node), cores]

    # Getting users
    parse_squeue(node_dict, user_dict, int_part_name)
    free_nodes = count_free_nodes(node_dict)

    if notify:
        last_state = read_state(state_file)
        new_state = nodes_to_state(free_nodes)
        if new_state == last_state:
            # Unchanged state: no mail, no output.
            sys.exit()
        write_state(state_file, new_state)

    by_first_value = lambda item: item[1][0]  # sort key for both tables

    report = ["Node, Free Cores, Load\n"]
    report.extend(
        "%s, %d, %.2f\n" % (node_name, stats[0], stats[1])
        for node_name, stats in sorted(node_dict.items(),
                                       key=by_first_value, reverse=True)
    )
    report.append("\n")
    report.append("User, Cores, Node(s)\n")
    report.extend(
        "%s, %d, %s\n" % (user_name, usage_info[0],
                          hostlist.collect_hostlist(usage_info[1]))
        for user_name, usage_info in sorted(user_dict.items(),
                                            key=by_first_value, reverse=True)
    )
    output = "".join(report)

    if notify:
        # NOTE(review): `recipients` comes from outside this function,
        # presumably a module-level setting -- verify.
        mailing_list(recipients,
                     "Grabnode State Change: " + gstate[new_state], output)
    else:
        print(output)