Example #1
0
def gen_slurm(g, out):
    """Write a Slurm topology.conf describing the switches found in *g*.

    One ``SwitchName=`` line is written per node whose ``type`` attribute
    is ``'Switch'``. Neighbouring switches are listed under ``Switches=``
    and all other neighbours under ``Nodes=``; both lists are sorted and
    compressed with ``hostlist.collect_hostlist``. Either part is omitted
    when the corresponding list is empty.

    g: A networkx graph representing the IB network; every node needs
       ``type`` and ``label`` attributes
    out: Output file-like object (only ``write`` is used)

    Raises ImportError (after printing an install hint) when the
    python-hostlist package is not available.
    """
    try:
        import hostlist
    except ImportError:
        print("""To generate a slurm topology.conf, you need to install
python-hostlist, https://pypi.python.org/pypi/python-hostlist""")
        raise

    out.write('# topology.conf generated by ibtopo2dot.py\n')
    for name, neighbours in g.adjacency():
        # Only switches get a line of their own.
        if g.nodes[name]['type'] != 'Switch':
            continue

        peer_switches = []
        hosts = []
        for peer in neighbours:
            is_switch = g.nodes[peer]['type'] == 'Switch'
            bucket = peer_switches if is_switch else hosts
            bucket.append(g.nodes[peer]['label'])

        switch_part = ''
        if peer_switches:
            switch_part = " Switches=" + hostlist.collect_hostlist(
                sorted(peer_switches))
        host_part = ''
        if hosts:
            host_part = " Nodes=" + hostlist.collect_hostlist(sorted(hosts))

        fields = (g.nodes[name]['label'], switch_part, host_part)
        out.write('SwitchName=%s%s%s\n' % fields)
Example #2
0
def gen_slurm(g, out):
    """Generate a Slurm topology.conf from an InfiniBand network graph.

    For every node of type ``'Switch'`` one line of the form
    ``SwitchName=<label>[ Switches=<hostlist>][ Nodes=<hostlist>]`` is
    written, where the two optional parts list the labels of neighbouring
    switches and of all other neighbours respectively, sorted and
    compressed with ``hostlist.collect_hostlist``.

    g: A networkx graph representing the IB network; every node needs
       ``type`` and ``label`` attributes
    out: Output file-like object

    Raises ImportError (after printing an install hint) if python-hostlist
    is not installed.
    """
    try:
        import hostlist
    except ImportError:
        print("""To generate a slurm topology.conf, you need to install
python-hostlist, https://pypi.python.org/pypi/python-hostlist""")
        raise
    out.write('# topology.conf generated by ibtopo2dot.py\n')
    # Use the ``nodes`` view for attribute lookup: the ``node`` alias was
    # removed from NetworkX, whereas ``adjacency()`` (used below) already
    # requires a release in which ``g.nodes[n]`` is available.
    attrs = g.nodes
    for n, nbrs in g.adjacency():
        if attrs[n]['type'] != 'Switch':
            continue
        switches = []
        nodes = []
        for nbr in nbrs:
            if attrs[nbr]['type'] == 'Switch':
                switches.append(attrs[nbr]['label'])
            else:
                nodes.append(attrs[nbr]['label'])
        switchstring = ""
        if switches:
            switches.sort()
            switchstring = " Switches=" + hostlist.collect_hostlist(switches)
        nodestr = ''
        if nodes:
            nodes.sort()
            nodestr = " Nodes=" + hostlist.collect_hostlist(nodes)
        out.write('SwitchName=%s%s%s\n' % (attrs[n]['label'],
                                           switchstring, nodestr))
Example #3
0
def rlookup(list):
  """Collapse a comma-separated host string into a hostlist expression.

  All square brackets are removed from *list* (a string, despite the
  name), the remainder is split on commas, and the resulting names are
  handed to ``hostlist.collect_hostlist``. The input is also sent to
  ``debug`` before processing.
  """
  debug("Reverse Lookup:")
  debug(list)
  # hostlist does not like square brackets, so strip them before splitting
  # the string into individual names
  without_brackets = list.replace("[", "").replace("]", "")
  names = [str(name) for name in without_brackets.split(",")]
  return hostlist.collect_hostlist(names)
Example #4
0
async def ipmi_sensors(hosts_list, username, password, record_ids=None):
    """Run ipmi-sensors for a set of hosts and return its raw output.

    The hosts are compressed into a single hostlist expression for ``-h``;
    the fanout (``-F``) is the number of hosts, capped at 1024.

    :param hosts_list: List of hosts to be queried
    :param username: user name to query data (passed via ``-u``)
    :param password: password to query data (passed via ``-p``)
    :param record_ids: optional ids of the records that are queried; joined
        with commas and passed via ``-r`` when non-empty
    :return: tuple ``(query_timestamp, output)`` where ``query_timestamp``
        is taken right before the process is started and ``output`` is the
        decoded stdout of ipmi-sensors
    """
    cmd_args = [
        '-h', hostlist.collect_hostlist(hosts_list),
        '-u', username,
        '-p', password,
        '-F', str(min(1024, len(hosts_list))),
    ]
    if record_ids:
        cmd_args += ['-r', ','.join(record_ids)]

    query_timestamp = metricq.Timestamp.now()
    process = await asyncio.create_subprocess_exec(
        *CMD_IPMI_SENSORE_BASE,
        *cmd_args,
        stdout=asyncio.subprocess.PIPE,
    )
    # Only stdout is piped, so the second element of the result carries
    # nothing useful.
    raw_stdout, _ = await process.communicate()
    return query_timestamp, raw_stdout.decode()
Example #5
0
def execute(directory, include_states=None, exclude_states=None, **kwargs):
    """Write the ``node_list`` and ``error_nodes`` files into *directory*.

    All nodes come from ``bench.util.get_nodes()`` and are filtered with
    ``bench.util.filter_node_list``; the nodes that pass go to
    ``node_list``, the rest to ``error_nodes``. When neither
    *include_states* nor *exclude_states* is given, the states ``down``,
    ``draining`` and ``drained`` are excluded. Extra keyword arguments are
    forwarded to the filter.

    An IOError while writing either file is logged and does not abort the
    function.
    """

    def write_sorted(filename, nodes):
        # Best effort: a failed write is reported but never propagated.
        try:
            bench.util.write_node_list(filename, sorted(nodes))
        except IOError as ex:
            logger.error('unable to write {0}'.format(filename))
            logger.debug(ex, exc_info=True)

    if not include_states and not exclude_states:
        exclude_states = ['down', 'draining', 'drained']

    good_path = os.path.join(directory, 'node_list')
    logger.info('creating {0}'.format(good_path))

    all_nodes = bench.util.get_nodes()
    selected = bench.util.filter_node_list(all_nodes,
                                           include_states=include_states,
                                           exclude_states=exclude_states,
                                           **kwargs)
    logger.info('nodes to test: {0}'.format(len(selected)))
    write_sorted(good_path, selected)

    bad_path = os.path.join(directory, 'error_nodes')
    rejected = all_nodes - selected
    if rejected:
        summary = hostlist.collect_hostlist(rejected)
        logger.warning('error nodes: {0} ({1} nodes)'.format(
            summary, len(rejected)))
    write_sorted(bad_path, rejected)
Example #6
0
def print_progress(label, hosts, data):
    """Print a status report for the hosts grouped under *label*.

    A header with the label and the number of hosts is always printed.
    With no hosts, a placeholder line follows. For the labels ``failed``
    and ``unknown`` the hosts are printed as one compressed hostlist;
    for every other label each host is printed with the ``progress``
    value found in ``data[host]``.
    """
    host_count = len(hosts)
    print("Status of host --{0}-- ({1}): ".format(label, host_count))

    if host_count == 0:
        print("\tEmpty records.")
        return

    if label in ("failed", "unknown"):
        print("\thost list: ", hostlist.collect_hostlist(hosts))
        return

    for name in hosts:
        percent = data[name]["progress"]
        print("\t{0}\t{1}%".format(name, percent))
Example #7
0
def print_progress(label, hosts, data):
    """Report the hosts that belong to the status group *label*.

    Output, in order: a header line naming the group and its size, then
    either a note that the group is empty, a single compressed hostlist
    (groups ``failed`` and ``unknown``), or one line per host showing the
    ``progress`` entry of ``data[host]`` as a percentage.
    """
    header = "Status of host --{0}-- ({1}): ".format(label, len(hosts))
    print(header)

    summarised_labels = ["failed", "unknown"]
    if len(hosts) == 0:
        print("\tEmpty records.")
    elif label in summarised_labels:
        print("\thost list: ", hostlist.collect_hostlist(hosts))
    else:
        for entry in hosts:
            print("\t{0}\t{1}%".format(entry, data[entry]["progress"]))
Example #8
0
async def log_loop(configs, log_interval):
    """Log, forever, how many hosts of each configuration are active.

    For every entry of *configs* the hosts whose ``status`` equals
    ``Status.ACTIVE`` are counted and logged together with the total
    number of hosts and the compressed hostlist of all host names. After
    a full pass the coroutine sleeps for *log_interval* seconds and then
    starts over; it never returns on its own.
    """
    while True:
        for conf in configs:
            active_entries = [
                entry for entry in conf['hosts'].values()
                if entry['status'] == Status.ACTIVE
            ]
            message = '{0} of {1} are active in {2}'.format(
                len(active_entries),
                len(conf['hosts']),
                hostlist.collect_hostlist(conf['hosts'].keys()),
            )
            logger.info(message)
        await asyncio.sleep(log_interval)
Example #9
0
 def get_policy_based_user_or_its_projects(self, user, projects):
     """Return the combined host policy of a user and the user's projects.

     The user policy and the group policy (looked up with the collected
     hostlist of *projects*) are merged; when both exist the group
     entries are written into the user policy mapping. Every hostlist in
     the merged mapping is expanded, duplicates are removed, and the
     hosts are collected into one hostlist string.

     :param string user: only one valid username
     :param list projects: a list of project
     :return: a policy with hostlist string, or None if non-exist policy for user and its projects
     """
     user_policy = self.get_policy_based_user(user)
     group_policy = self.get_policy_based_group(collect_hostlist(projects))

     if not user_policy and not group_policy:
         return None

     if user_policy:
         merged = user_policy
         if group_policy:
             # NOTE: this updates the mapping returned by
             # get_policy_based_user in place.
             merged.update(group_policy)
     else:
         merged = group_policy

     every_host = [
         host
         for name in merged
         for host in expand_hostlist(merged[name])
     ]
     return collect_hostlist(list(set(every_host)))
Example #10
0
 def get_policy_based_user_or_its_projects(self, user, projects):
     """Merge the policy of *user* with the policy of its *projects*.

     :param string user: only one valid username
     :param list projects: a list of project
     :return: a hostlist string covering every host named by the user
         policy and the group policy, or None when neither policy exists
     """
     user_policy = self.get_policy_based_user(user)
     group_policy = self.get_policy_based_group(collect_hostlist(projects))

     # Keep only the policies that actually exist, user policy first.
     available = [p for p in (user_policy, group_policy) if p]
     if not available:
         return None

     # The first available mapping is the merge target; a second one (the
     # group policy) is folded into it in place.
     combined = available[0]
     for extra in available[1:]:
         combined.update(extra)

     unique_hosts = set()
     for name in combined:
         unique_hosts.update(expand_hostlist(combined[name]))
     return collect_hostlist(list(unique_hosts))
Example #11
0
    def bits_from_string_hpctools(self, mask):
        """Return the set bit positions of *mask* as a collected hostlist.

        :param mask: integer literal given as a string; it is parsed with
            ``int(mask, 0)``, so ``0x``/``0o``/``0b`` prefixes are accepted
        :return: the result of ``hostlist.collect_hostlist`` applied to the
            decimal indices (as strings, lowest bit first) of all set bits
        :raises ValueError: if *mask* is not a valid integer literal or
            denotes a negative number
        """
        mask_int = int(mask, 0)
        if mask_int < 0:
            # Right-shifting a negative int converges to -1, never to 0, so
            # scanning its bits would not terminate.
            raise ValueError("mask must not be negative: %r" % (mask,))

        set_bits = [
            str(index)
            for index in range(mask_int.bit_length())
            if (mask_int >> index) & 1
        ]
        return hostlist.collect_hostlist(set_bits)
Example #12
0
 def merge_policy_based_ug(self, policys, flag_user=True, name=None):
     """Merge policies per user/group name and group names by identical hosts.

     Each key of *policys* is a hostlist-style expression of names; every
     expanded name receives the hosts of all policies stored under that
     key. Names that end up with the same collected hostlist are then
     merged into one hostlist-style key.

     :param dict policys: all the possible policys for users/groups
     :param boolean flag_user: True means user, False means group (not used
         by this method's body)
     :param string name: the name of one or more user/group, None means all user/groups
     :return: a dict of user/group and policy with the formation
         {"name1": "hostlist", "name2": "hostlist2"}, or None when there is
         nothing to report
     """
     if not policys:
         return None

     wanted = expand_hostlist(name) if name else None

     # Step 1: gather the raw host lists per individual name.
     hosts_by_name = {}
     for key in policys:
         members = expand_hostlist(key)
         if wanted:
             members = [m for m in members if m in wanted]
         hosts = []
         for policy in policys[key]:
             hosts.extend(expand_hostlist(policy["policy"]))
         for member in members:
             hosts_by_name.setdefault(member, []).extend(hosts)

     # Step 2: flip the mapping so that names sharing the same collected
     # hostlist are grouped together.
     names_by_hosts = {}
     for member in hosts_by_name:
         collected = collect_hostlist(hosts_by_name[member])
         names_by_hosts.setdefault(collected, []).append(member)

     # Step 3: collapse every group of names into a single key.
     merged = {}
     for collected, members in names_by_hosts.items():
         merged[collect_hostlist(members)] = collected
     return merged or None
Example #13
0
 def merge_policy_based_ug(self, policys, flag_user=True, name=None):
     """Build a name -> hostlist mapping from the stored policies.

     The keys of *policys* are expanded into individual names, each name
     collects the expanded hosts of every policy listed under its key, and
     finally all names with an identical collected hostlist are combined
     into one collected key.

     :param dict policys: all the possible policys for users/groups
     :param boolean flag_user: True means user, False means group; the
         value is not consulted here
     :param string name: the name of one or more user/group, None means all user/groups
     :return: a dict of user/group and policy with the formation
         {"name1": "hostlist", "name2": "hostlist2"}; None if *policys* is
         empty or no name matched
     """
     if not policys:
         return None

     selected_names = expand_hostlist(name) if name else None
     per_name = {}
     for group_key, group_policies in policys.items():
         expanded = expand_hostlist(group_key)
         if selected_names:
             targets = [n for n in expanded if n in selected_names]
         else:
             targets = expanded
         group_hosts = [
             host
             for policy in group_policies
             for host in expand_hostlist(policy["policy"])
         ]
         for target in targets:
             if target not in per_name:
                 per_name[target] = []
             per_name[target].extend(group_hosts)

     # Invert: collected hostlist -> every name that owns exactly it.
     owners = {}
     for target, raw_hosts in per_name.items():
         owners.setdefault(collect_hostlist(raw_hosts), []).append(target)

     # Names with the same hosts become a single collected key.
     result = {
         collect_hostlist(names): hosts for hosts, names in owners.items()
     }
     return result if result else None
Example #14
0
def nodeStateList(record):
    """Append one record listing, per state, the nodes currently in it.

    The module-level ``node_states`` mapping (node -> state) is inverted
    into state -> nodes. The new record carries the ``_time`` value of
    *record* plus one ``StateName_<state>`` field per state holding the
    collected hostlist of its nodes, and is appended to the module-level
    ``output_results``.
    """
    # group the nodes by their state
    nodes_by_state = {}
    for node in node_states:
        nodes_by_state.setdefault(node_states[node], []).append(node)

    # assemble the output record
    entry = {'_time': record.get('_time')}
    for state, members in nodes_by_state.items():
        entry["StateName_" + state] = hostlist.collect_hostlist(members)

    output_results.append(entry)
Example #15
0
def nodeStateList(record):
  """Emit a record that maps every node state to a hostlist of its nodes.

  Reads the module-level ``node_states`` (node -> state), groups the nodes
  by state, and appends to the module-level ``output_results`` a dict
  holding ``_time`` (copied from *record*) and a ``StateName_<state>``
  entry per state with the collected hostlist of the nodes in that state.
  """
  # invert node -> state into state -> [nodes]
  grouped = {}
  for node in node_states:
    state = node_states[node]
    if state not in grouped:
      grouped[state] = []
    grouped[state].append(node)

  # build the new record from the grouped nodes
  fresh = {'_time': record.get('_time')}
  fresh.update(
    ("StateName_" + state, hostlist.collect_hostlist(grouped[state]))
    for state in grouped
  )

  output_results.append(fresh)
Example #16
0
def wlm_get_nodes_in_reservation(location, task_id, task_node_type):
    """Return the nodes of the reservation ``scops-<task_id>``.

    Runs ``rsvnodelist scops-<task_id>`` at *location* via
    ``remote_execute`` and parses the output, which is expected to contain
    one ``<node types> <nodelist>`` pair per line, the node types being
    comma separated.

    :param location: target passed through to ``remote_execute``
    :param task_id: task whose reservation is queried
    :param task_node_type: only lines whose node types include this value
        contribute nodes
    :return: tuple ``(success, hostlist, count)``. On command failure this
        is ``(False, None, 0)``; when the output reports no nodelist or no
        reservation it is ``(success, "", 0)``; otherwise the collected
        hostlist of all matching nodes and their number.
    """
    command = f"rsvnodelist scops-{task_id}"
    output = remote_execute(location, command)
    success = check_command_with_output(command, output)
    if not success:
        return False, None, 0

    if output.startswith(('No nodelist', 'No reservation')):
        return success, "", 0

    resources = []
    for line in output.splitlines():
        # Blank lines (e.g. produced by a trailing newline) carry no data
        # and cannot be unpacked into the two expected fields.
        if not line.strip():
            continue
        node_type, nodelist = line.split()
        if task_node_type in node_type.split(','):  # should always be true
            resources += expand_hostlist(nodelist)
    return success, collect_hostlist(resources), len(resources)
Example #17
0
    def run(self, params, args):
        """Push the 411 files, restart slurm and reconfigure compute nodes.

        Runs ``make -C /var/411 force``, waits ten seconds, restarts the
        local slurm service, then queries the database for all nodes whose
        membership name contains ``Compute`` (ordered by rack and rank)
        and, if there are any, runs ``service slurm reconfig`` on them
        through pdsh. *params* and *args* are not used.

        The body is indented with spaces only; mixing tabs and spaces for
        indentation is rejected by Python 3.
        """
        # Disabled: regenerate the slurm node and partition configuration.
        # os.system("rocks report slurmnodes > /etc/slurm/nodenames.conf")
        # os.system("rocks report slurmpartitions > /etc/slurm/partitions.conf")
        os.system("make -C /var/411 force >/dev/null")
        # NOTE(review): presumably gives the 411 update time to propagate
        # before slurm is restarted - confirm.
        os.system("sleep 10")
        os.system("service slurm restart >/dev/null")
        # Disabled: reconfigure through tentakel instead of pdsh.
        # os.system("/opt/rocks/bin/tentakel service slurm reconfig >/dev/null")
        query = ('select nodes.name from nodes, memberships where nodes.membership = memberships.id and memberships.name like "%Compute%" order by rack,rank' )
        self.db.execute(query)

        # Each fetched row is formatted with "%s"; presumably rows are
        # one-element tuples, so this yields the bare node name - verify
        # against the database wrapper.
        myhostlist = ["%s" % (name) for name in self.db.fetchall()]

        if myhostlist:
            hl = hostlist.collect_hostlist(myhostlist)
            os.system("/opt/pdsh/bin/pdsh -w %s service slurm reconfig" % (hl))
Example #18
0
    def display_results(self):
        """Log and print a summary of the collected test results.

        Entries of ``self.results['fail']`` with an empty result list are
        removed (also from ``f_tests``) and each comma-separated part of
        their key is added to ``self.results['error']['not_parsable']``;
        all remaining entries become rows of the fail table. Every
        non-empty entry of ``self.results['error']`` becomes a row of the
        error table (collected hostlist, reason). Both tables are passed
        to ``self.log_results``; afterwards the node counts and the
        non-empty tables are printed, and the passing tests are written to
        ``self.results_logger``.
        """
        results = self.results

        fail_table = []
        # Iterate over a snapshot: unparsable entries are removed on the fly.
        for key, result in list(results['fail'].items()):
            if result != []:
                fail_table.append([key] + result)
                continue
            results['fail'].pop(key, None)
            results['f_tests'].remove(key)
            for part in key.split(','):
                results['error']['not_parsable'].add(part)

        error_table = [
            [hostlist.collect_hostlist(result), key]
            for key, result in list(results['error'].items())
            if result
        ]

        self.log_results(fail_table, error_table)

        # Summary
        total = len(self.node_list)
        print("### Summary ###")
        print('passing nodes: {passed} / {total}'.format(passed=len(results['p_nodes']), total=total))
        print('failing nodes: {passed} / {total}'.format(passed=len(results['f_nodes']), total=total))
        print('error nodes: {passed} / {total}'.format(passed=len(results['e_nodes']), total=total))

        if results['p_tests']:
            self.results_logger.info("\n### Passing Tests ###")
            self.results_logger.info(sorted(list(results['p_tests'])))

        if fail_table:
            fail_headers = ['Hardware', 'Test', 'Result', 'Expected', 'Res/Exp']
            print("\n### Failing Tests ###")
            print(tabulate.tabulate(fail_table, headers=fail_headers, floatfmt=".2f"))

        if error_table:
            print("\n### Missing/Error Tests ###")
            print(tabulate.tabulate(error_table, headers=['Hardware', 'Reason']))
Example #19
0
        action='store',
        dest='file',
        help="Name of the file to parse. If not specified, STDIN is used.",
        default="-")
    args = parser.parse_args()

    cables = defaultdict(dict)
    leaves = defaultdict(set)
    spines = defaultdict(set)

    for line in fileinput.input(args.file):
        line = line.strip()

        _, port1, type1, name1, _, port2, type2, name2 = line.split(';')

        if type1 == 'FI':
            name1 = name1.split(" ")[0]
            if type2 == 'SW':
                cables[name2].update({port2: (name1, port1)})
                leaves[name2].add(name1)
        else:
            cables[name1].update({port1: (name2, port2)})
            spines[name1].add(name2)

    for switch, nodes in leaves.items():
        nodenames = filter(lambda x: args.node_filter in x, nodes)
        nodenames = collect_hostlist(nodenames)
        print "SwitchName=%s Nodes=%s" % (switch, nodenames)
    for switch, nodes in spines.items():
        print "SwitchName=%s Switches=%s" % (switch, ",".join(nodes))
 ## JSON
 if json_stdout: print json.dumps(bad_disks_dict)
 if json_file:
     if os.path.isfile(json_file):
         with open(json_file, 'a') as outfile:
             json.dump(bad_disks_dict, outfile)
             outfile.write("\n")
     else:
         with open(json_file, 'w') as outfile:
             json.dump(bad_disks_dict, outfile)
             outfile.write("\n")
 ## No more JSON
 # turn list [ '1I:1:1' ,'1I:1:2' ] into string 1I:1:[1-2]
 # turn list [ 'Physical Drive (4 TB SAS HDD) 1I:1:32' ,'Physical Drive (4 TB SAS HDD) 1I:1:33', ... ]
 #  into string Physical Drive (4 TB SAS HDD) 1I:1:[32-64]
 collected_bad_disks = hostlist.collect_hostlist(bad_disks)
 if no_diff_cnt > 0 and diff_cnt < 1:
     if verbosely or debug:
         print "no differences in the counters between any disks in the reports"
     else:
         print "OK: No increases of %s on any disks. (%s vs %s)" % (
             track_this_error_counter, timegenerated1, timegenerated2)
     sys.exit(OK)
 elif no_diff_cnt == 0 and diff_cnt == 0:
     print "UNKNOWN: Found nothing, does '%s' exist?" % track_this_error_counter
     sys.exit(UNKNOWN)
 else:
     print "CRITICAL: %s increased on these disks: %s (%s vs %s)" % (
         track_this_error_counter, collected_bad_disks, timegenerated1,
         timegenerated2)
     sys.exit(CRITICAL)
Example #21
0
def main():
    """Launch the test executable with srun on groups of nodes and report.

    The candidate nodes are all nodes in the requested states, optionally
    intersected with a partition and an explicit node list. They are split
    into groups of ``--nodes`` nodes and one srun job is started per group.
    The jobs are polled until all have finished; with ``--timeout`` set,
    jobs still running after that many seconds are terminated. For each
    finished job its stderr is logged at debug level and the first line of
    its stdout at info level. Finally the nodes are printed as collected
    hostlists, grouped by job result (pass / fail / unknown).
    """
    parser = argparse.ArgumentParser(
        description='Launch groups of test scripts with srun.')
    parser.add_argument('--debug',
                        action='store_true',
                        help='Enable debug loggibng (default: info)')
    parser.add_argument(
        '--partition',
        help='Specify a partition for node selection and job submission')
    parser.add_argument('--mpi', help='Enable MPI support for srun')
    parser.add_argument('--ntasks',
                        help='Specify the number of tasks to run for each job')
    parser.add_argument(
        '--ntasks-per-node',
        help='Specify the number of tasks to run on each node for each job')
    parser.add_argument('--state',
                        action='append',
                        help='Specify valid node states for node selection')
    parser.add_argument('--account',
                        help='Specify the account to use during srun')
    parser.add_argument('--chdir', help='Specify a directory to use for srun')
    parser.add_argument('--time', help='Specify a job runtime to use for srun')
    parser.add_argument('--nodelist',
                        help='Specify a node list to use for node selection')
    parser.add_argument('--nodes',
                        type=int,
                        default=1,
                        help='Specify the number of nodes to use for srun')
    parser.add_argument(
        '--bcast',
        nargs='?',
        const=True,
        help='Copy executable file to compute nodes during srun')
    parser.add_argument('--exclusive',
                        action='store_true',
                        help='Exclusive use of compute nodes during srun')
    parser.add_argument('--timeout',
                        type=int,
                        help='Terminate jobs after a timeout (seconds)')
    parser.add_argument('executable', help='Executable to execute')
    parser.add_argument('executable_arguments',
                        nargs='*',
                        help='Arguments for the test executable')
    args = parser.parse_args()

    log_level = logging.DEBUG if args.debug else logging.INFO
    logging.basicConfig(level=log_level)

    # Node selection: start from the state filter, then narrow it down.
    all_nodes = set(get_all_nodes(states=args.state))
    if args.partition:
        all_nodes = all_nodes & set(get_partition_nodes(args.partition))
    if args.nodelist:
        all_nodes = all_nodes & set(hostlist.expand_hostlist(args.nodelist))
    all_nodes = sorted(all_nodes)

    # One srun job per group of nodes, kept together with its node group.
    jobs = [(nodelist,
             srun(args.executable,
                  args.executable_arguments,
                  partition=args.partition,
                  nodelist=','.join(nodelist),
                  ntasks=args.ntasks,
                  account=args.account,
                  chdir=args.chdir,
                  time=args.time,
                  bcast=args.bcast,
                  exclusive=args.exclusive,
                  nodes=args.nodes,
                  ntasks_per_node=args.ntasks_per_node,
                  mpi=args.mpi))
            for nodelist in split_nodes(all_nodes, args.nodes)]

    start_time = time.time()
    completed_jobs = set()
    new_completed_jobs = []
    pass_ = set()
    fail = set()
    unknown = set()
    while True:
        for nodelist, job in jobs:
            if job in completed_jobs:
                # Already reported. Skipping here keeps the timeout branch
                # below from warning about (and terminating) finished jobs
                # on every later polling round.
                continue
            job.poll()
            if job.returncode is not None:
                new_completed_jobs.append((nodelist, job))
                if job.returncode == PASS:
                    pass_ |= set(nodelist)
                elif job.returncode == FAIL:
                    fail |= set(nodelist)
                else:
                    unknown |= set(nodelist)
            elif args.timeout is not None and (time.time() - start_time >
                                               args.timeout):
                logger.warning('{0}: {1}'.format(','.join(nodelist),
                                                 'timeout'))
                job.terminate()
        if new_completed_jobs:
            for nodelist, job in new_completed_jobs:
                for line in job.stderr:
                    logger.debug('{0}: {1}'.format(','.join(nodelist),
                                                   line.rstrip()))
                # Only the first line of stdout is reported.
                for line in job.stdout:
                    logger.info('{0}: {1}'.format(','.join(nodelist),
                                                  line.rstrip()))
                    break
                completed_jobs.add(job)
            new_completed_jobs = []
        if completed_jobs == set(job for _, job in jobs):
            break
        time.sleep(POLLING_INTERVAL)

    # Final report: one collected hostlist per outcome that occurred.
    if pass_:
        print('pass:', hostlist.collect_hostlist(pass_))
    if fail:
        print('fail:', hostlist.collect_hostlist(fail))
    if unknown:
        print('unknown:', hostlist.collect_hostlist(unknown))
Example #22
0
def rain_command(arguments):
    """
    Usage:
        rain -h | --help
        rain --version
        rain admin add [LABEL] --file=FILE
        rain admin baremetals
        rain admin on HOSTS
        rain admin off HOSTS
        rain admin [-i] delete HOSTS
        rain admin [-i] rm HOSTS
        rain admin list users [--merge]
        rain admin list projects [--merge]
        rain admin list roles
        rain admin list hosts [--user=USERS|--project=PROJECTS|--role=ROLE]
                              [--start=TIME_START]
                              [--end=TIME_END]
                              [--format=FORMAT]
        rain admin policy [--user=USERS|--project=PROJECTS|--role=ROLE]
                          (-l HOSTS|-n COUNT)
                          [--start=TIME_START]
                          [--end=TIME_END]
        rain user list [--project=PROJECTS] [HOSTS]
        rain user list hosts [--start=TIME_START]
                        [--end=TIME_END]
                        [--format=FORMAT]
        rain status [--short|--summary][--kind=KIND] [HOSTS]
        rain provision --profile=PROFILE HOSTS
        rain provision list [--type=TYPE] (--distro=DISTRO|--kickstart=KICKSTART)
        rain provision --distro=DITRO --kickstart=KICKSTART HOSTS
        rain provision add (--distro=URL|--kickstart=KICk_CONTENT) NAME
        rain provision power [--off] HOSTS
        rain provision monitor HOSTS

    Arguments:
        HOSTS     the list of hosts passed
        LABEL     the label of a host
        COUNT     the count of the bare metal provisioned hosts
        KIND      the kind
        TYPE      the type of profile or server

    Options:
        -n COUNT     count of teh bare metal hosts to be provisined
        -p PROJECTS  --projects=PROJECTS
        -u USERS     --user=USERS        Specify users
        -f FILE, --file=FILE  file to be specified
        -i           interactive mode adds a yes/no
                     question for each host specified
        --role=ROLE            Specify predefined role
        --start=TIME_START     Start time of the reservation, in
                               YYYY/MM/DD HH:MM:SS format. [default: current_time]
        --end=TIME_END         End time of the reservation, in
                               YYYY/MM/DD HH:MM:SS format. In addition a duration
                               can be specified if the + sign is the first sign.
                               The duration will than be added to
                               the start time. [default: +1d]
        --kind=KIND            Format of the output -png, jpg, pdf. [default:png]
        --format=FORMAT        Format of the output json, cfg. [default:json]
        --type=TYPE            Format of the output profile, server. [default:server]


    """

    # comment by H. C, we need the raw list for policy
    """
    for list in ["HOSTS", "USERS", "PROJECTS","--project", "--user"]:
        try:
            expanded_list = hostlist.expand_hostlist(arguments[list])
            arguments[list]=expanded_list
        except:
            pass
    """
    # print(arguments)
    # wrapper
    wrapper = RainCobblerWrapper()
    """
    rain admin on HOSTS
    rain admin off HOSTS
    """
    if arguments["admin"]:

        if arguments["add"]:
            print("add")

            if arguments["LABEL"] is not None:
                """admin add LABEL --file=FILE"""

                print((arguments["LABEL"]))
                print((arguments["--file"]))
                not_implemented()

            else:
                """admin add --file=FILE"""

                print((arguments["--file"]))
                not_implemented()

        elif arguments["baremetals"]:
            """rain admin baremetals"""

            print("list all baremetals")
            result = wrapper.baremetal_computer_host_list()
            print(result if result else "No Baremetals")

        elif arguments["on"]:
            """rain admin on HOSTS"""

            print("switch on")
            print((arguments["HOSTS"]))
            result = wrapper.baremetal_computer_host_on(arguments["HOSTS"])
            print("success" if result else "failed")

        elif arguments["off"]:
            """rain admin off HOSTS"""

            print("switch off")
            print((arguments["HOSTS"]))
            result = wrapper.baremetal_computer_host_off(arguments["HOSTS"])
            print("success" if result else "failed")

        elif arguments["delete"] or arguments["rm"]:
            """rain admin [-i] delete HOSTS"""
            """rain admin [-i] rm HOSTS"""

            interactive = arguments["-i"]

            print("delete", interactive)

            for host in arguments["HOSTS"]:
                if interactive:
                    answer = raw_input(
                        "Do you want to delete the host %s? (y)es/(n)o: " %
                        host)
                    if answer in ["yes", "y", "Y", "YES"]:
                        print("delete %s" % host)
                    else:
                        print("keeping %s" % host)
            not_implemented()

        elif arguments["list"]:

            if arguments["users"]:
                print("list users")
                flag_merge = arguments["--merge"]
                policys = wrapper.list_all_user_group_hosts(True, flag_merge)
                print_policys(policys, flag_merge)

            elif arguments["projects"]:
                print("list projects")
                flag_merge = arguments["--merge"]
                policys = wrapper.list_all_user_group_hosts(False, flag_merge)
                print_policys(policys, flag_merge)

            elif arguments["roles"]:
                print("list roles")
                not_implemented()

            elif arguments["hosts"]:
                print("list hosts")
                # comment by H. C
                """
                not_implemented()

                (time_start, time_end) = parse_time_interval(arguments["--start"],
                                                           arguments["--end"])
                print "From:", time_start
                print "To  :", time_end
                """
                if arguments["--user"] is not None:
                    policys = wrapper.list_user_hosts(arguments["--user"])
                    print_policys(policys)
                elif arguments["--project"] is not None:
                    policys = wrapper.list_project_hosts(
                        arguments["--project"])
                    print_policys(policys)
                elif arguments["--role"] is not None:
                    not_implemented()
                else:
                    print("all users, projects, roles")
                    not_implemented()

        elif arguments["policy"]:
            print("policy")
            # comment by H. C
            """
            (time_start, time_end) = parse_time_interval(arguments["--start"],
                                                         arguments["--end"])

            print "From:", time_start
            print "To  :", time_end
            """
            if arguments["--user"] is not None:
                policy_id = wrapper.add_user_policy(arguments["--user"],
                                                    arguments["HOSTS"])
                print("success" if policy_id else "failed")
            elif arguments["--project"] is not None:
                policy_id = wrapper.add_project_policy(arguments["--project"],
                                                       arguments["HOSTS"])
                print("success" if policy_id else "failed")
            elif arguments["--role"] is not None:
                not_implemented()
            else:
                print("all users, projects, roles")
                not_implemented()

        elif arguments["list"]:
            print("list")

            not_implemented()

    elif arguments["status"]:
        print("status")
        if arguments["--short"]:
            status_dict = wrapper.get_status_short(arguments["HOSTS"])
            if status_dict:
                for host in sorted(status_dict.keys()):
                    print("{0:16}\t{1}".format(host, status_dict[host]))
            else:
                print("Empty")
        if arguments["--summary"]:
            status_dict = wrapper.get_status_summary(arguments["HOSTS"])
            if status_dict:
                for deploy_status in [
                        "deployed",
                        "deploying",
                        "failed",
                        "total",
                ]:
                    print("{0:16}\t{1}".format(deploy_status,
                                               status_dict[deploy_status]))

    elif arguments["user"]:
        if arguments["list"]:
            print("user list")

            (time_start,
             time_end) = parse_time_interval(arguments["--start"],
                                             arguments["--end"])

            print("From:", time_start)
            print("To  :", time_end)

            not_implemented()
    ###
    # provisioning
    ###
    elif arguments["provision"]:
        # print "provision a node..."
        if arguments["list"]:
            # print "this will list distro or kickstart info"
            if arguments["--type"] == "profile":
                print(
                    "this will list profiles based on distro or kickstart info"
                )
                profiles = wrapper.list_profile_based_distro_kickstart(
                    arguments["--distro"], arguments["--kickstart"])
                print("matched profiles: {0}".format(profiles))
            else:
                print(
                    "this will list servers based on distro or kickstart info")
                servers = wrapper.list_system_based_distro_kickstart(
                    arguments["--distro"], arguments["--kickstart"])
                print("matched servers: {0}".format(servers))
        elif arguments["add"]:
            print("add a new distro or kickstart")
            not_implemented()
        elif arguments["power"]:
            print("power ON/OFF a host...")
            # pre-process hosts which user can access
            if arguments["HOSTS"]:
                all_hosts = filtered_access_hosts(arguments["HOSTS"])
                access_hosts = all_hosts["access"]
                unaccess_hosts = all_hosts["unaccess"]
            if unaccess_hosts:
                print(
                    "You can NOT access these hosts: {0}, please contact your admin."
                    .format(hostlist.collect_hostlist(unaccess_hosts)))
            result = wrapper.power_host(access_hosts, not arguments["--off"])
            power_hosts = [h for h in sorted(result.keys()) if result[h]]
            unknown_hosts = [h for h in sorted(result.keys()) if not result[h]]
            if unknown_hosts:
                print(
                    "unknow hosts, must deploy first: ",
                    hostlist.collect_hostlist(
                        hostlist.collect_hostlist(unknown_hosts)))
            if power_hosts:
                print(
                    "call [rain provision monitor {0}] to monitor power progress."
                    .format(hostlist.collect_hostlist(power_hosts)))
        elif arguments["monitor"]:
            print("monitor progress of a host...")
            # pre-process hosts which user can access
            if arguments["HOSTS"]:
                all_hosts = filtered_access_hosts(arguments["HOSTS"])
                access_hosts = all_hosts["access"]
                unaccess_hosts = all_hosts["unaccess"]
            if unaccess_hosts:
                print(
                    "You can NOT access these hosts: {0}, please contact your admin."
                    .format(hostlist.collect_hostlist(unaccess_hosts)))
            result = wrapper.monitor_host(access_hosts)
            poweron_hosts = [
                h for h in sorted(result.keys())
                if result[h]["status"] == "poweron"
            ]
            poweroff_hosts = [
                h for h in sorted(result.keys())
                if result[h]["status"] == "poweroff"
            ]
            deploy_hosts = [
                h for h in sorted(result.keys())
                if result[h]["status"] == "deploy"
            ]
            failed_hosts = [
                h for h in sorted(result.keys())
                if result[h]["status"] == "failed"
            ]
            unknown_hosts = [
                h for h in sorted(result.keys())
                if result[h]["status"] == "unknown"
            ]
            print_progress("deploy", deploy_hosts, result)
            print_progress("poweron", poweron_hosts, result)
            print_progress("poweroff", poweroff_hosts, result)
            print_progress("failed", failed_hosts, result)
            print_progress("unknown", unknown_hosts, result)
        else:
            # pre-process hosts which user can access
            if arguments["HOSTS"]:
                all_hosts = filtered_access_hosts(arguments["HOSTS"])
                access_hosts = all_hosts["access"]
                unaccess_hosts = all_hosts["unaccess"]
            if unaccess_hosts:
                print(
                    "You can NOT access these hosts: {0}, please contact your admin."
                    .format(hostlist.collect_hostlist(unaccess_hosts)))
            if arguments["--profile"]:
                if access_hosts:
                    wrapper.provision_host_with_profile(
                        arguments["--profile"], access_hosts)
                    print(
                        "call [rain provision monitor {0}] to monitor depoy progress."
                        .format(hostlist.collect_hostlist(access_hosts)))
            elif arguments["--distro"] and arguments["--kickstart"]:
                if access_hosts:
                    wrapper.provision_host_with_distro_kickstart(
                        arguments["--distro"], arguments["--kickstart"],
                        access_hosts)
                    print(
                        "call [rain provision monitor {0}] to monitor deploy progress."
                        .format(hostlist.collect_hostlist(access_hosts)))
Example #23
0
    def simple_list(id=None, format="table"):
        """List one cluster, or every cluster, as reported by Comet.

        :param id: name of a single cluster to look up; when ``None`` the
                   ``cluster/`` collection is requested instead
        :param format: ``"rest"`` returns the retrieved data unchanged; any
                       other value is handed to ``dict_printer`` as its
                       ``output`` argument
        :return: the retrieved data for ``"rest"``, otherwise the value
                 produced by ``dict_printer``; an empty string when nothing
                 could be retrieved
        :raises ValueError: if the retrieved data carries an ``error`` entry
        """
        result = ""
        if id is None:
            r = Comet.get(Comet.url("cluster/"))
        else:
            r = Comet.get(Comet.url("cluster/" + id + "/"))
            if r is None:
                Console.error("Could not find cluster `{}`"
                              .format(id))
                return result
            # A single cluster is wrapped so it is handled like a collection.
            r = [r]

        if r is None:
            # Nothing retrieved: return the empty default instead of
            # implicitly returning None.
            return result

        if 'error' in r:
            Console.error("An error occurred: {error}".format(**r))
            raise ValueError("COMET Error")
        elif r and 'error' in r[0]:
            # `r and` guards against an empty collection, where r[0] would
            # raise IndexError.
            Console.error("An error occurred: {error}".format(**r[0]))
            raise ValueError("COMET Error")

        if format == "rest":
            return r

        # Flatten every cluster into one row keyed by the cluster name.
        elements = {}
        for cluster in r:
            element = {}
            for attribute in ["project", "name", "description"]:
                element[attribute] = cluster[attribute]
            element["nodes"] = len(cluster["computes"])
            for attribute, value in cluster["frontend"].items():
                element["frontend " + attribute] = value
            names = [compute["name"] for compute in cluster["computes"]]
            # Fold the compute node names into one compact hostlist string.
            element["computes"] = hostlist.collect_hostlist(names)

            elements[cluster["name"]] = element

        result = dict_printer(elements,
                              order=[
                                  "name",
                                  "project",
                                  "nodes",
                                  "computes",
                                  "frontend name",
                                  "frontend state",
                                  "frontend type",
                                  "description",
                              ],
                              header=[
                                  "Name",
                                  "Project",
                                  "Count",
                                  "Nodes",
                                  "Frontend (Fe)",
                                  "State (Fe)",
                                  "Type (Fe)",
                                  "Description",
                              ],
                              output=format)
        return result
Example #24
0
def rain_command(arguments):
    """
    Usage:
        rain -h | --help
        rain --version
        rain admin add [LABEL] --file=FILE
        rain admin baremetals
        rain admin on HOSTS
        rain admin off HOSTS
        rain admin [-i] delete HOSTS
        rain admin [-i] rm HOSTS
        rain admin list users [--merge]
        rain admin list projects [--merge]
        rain admin list roles
        rain admin list hosts [--user=USERS|--project=PROJECTS|--role=ROLE]
                              [--start=TIME_START]
                              [--end=TIME_END]
                              [--format=FORMAT]
        rain admin policy [--user=USERS|--project=PROJECTS|--role=ROLE]
                          (-l HOSTS|-n COUNT)
                          [--start=TIME_START]
                          [--end=TIME_END]
        rain user list [--project=PROJECTS] [HOSTS]
        rain user list hosts [--start=TIME_START]
                        [--end=TIME_END]
                        [--format=FORMAT]
        rain status [--short|--summary][--kind=KIND] [HOSTS]
        rain provision --profile=PROFILE HOSTS
        rain provision list [--type=TYPE] (--distro=DISTRO|--kickstart=KICKSTART)
        rain provision --distro=DITRO --kickstart=KICKSTART HOSTS
        rain provision add (--distro=URL|--kickstart=KICk_CONTENT) NAME
        rain provision power [--off] HOSTS
        rain provision monitor HOSTS

    Arguments:
        HOSTS     the list of hosts passed
        LABEL     the label of a host
        COUNT     the count of the bare metal provisioned hosts
        KIND      the kind
        TYPE      the type of profile or server

    Options:
        -n COUNT     count of teh bare metal hosts to be provisined
        -p PROJECTS  --projects=PROJECTS
        -u USERS     --user=USERS        Specify users
        -f FILE, --file=FILE  file to be specified
        -i           interactive mode adds a yes/no
                     question for each host specified
        --role=ROLE            Specify predefined role
        --start=TIME_START     Start time of the reservation, in
                               YYYY/MM/DD HH:MM:SS format. [default: current_time]
        --end=TIME_END         End time of the reservation, in
                               YYYY/MM/DD HH:MM:SS format. In addition a duration
                               can be specified if the + sign is the first sign.
                               The duration will than be added to
                               the start time. [default: +1d]
        --kind=KIND            Format of the output -png, jpg, pdf. [default:png]
        --format=FORMAT        Format of the output json, cfg. [default:json]
        --type=TYPE            Format of the output profile, server. [default:server]


    """

    # NOTE(review): the docstring above looks like docopt usage text that is
    # presumably parsed at runtime, so it is kept verbatim (typos included) --
    # confirm before editing it.
    #
    # `arguments` is indexed by the command words, positionals and options
    # named in that usage text. HOSTS, --user and --project are deliberately
    # NOT expanded with hostlist.expand_hostlist here, because the policy
    # calls need the raw list (comment by H. C).
    wrapper = RainCobblerWrapper()

    def accessible_hosts():
        """Return the HOSTS the user may access and report the others."""
        # Default to empty lists so that a missing HOSTS argument cannot
        # leave the names unbound.
        access_hosts, unaccess_hosts = [], []
        if arguments["HOSTS"]:
            all_hosts = filtered_access_hosts(arguments["HOSTS"])
            access_hosts = all_hosts["access"]
            unaccess_hosts = all_hosts["unaccess"]
        if unaccess_hosts:
            print("You can NOT access these hosts: {0}, please contact your admin."
                  .format(hostlist.collect_hostlist(unaccess_hosts)))
        return access_hosts

    if arguments["admin"]:

        if arguments["add"]:
            # rain admin add [LABEL] --file=FILE
            print("add")
            if arguments["LABEL"] is not None:
                print((arguments["LABEL"]))
            print((arguments["--file"]))
            not_implemented()

        elif arguments["baremetals"]:
            # rain admin baremetals
            print("list all baremetals")
            result = wrapper.baremetal_computer_host_list()
            print(result if result else "No Baremetals")

        elif arguments["on"]:
            # rain admin on HOSTS
            print("switch on")
            print((arguments["HOSTS"]))
            result = wrapper.baremetal_computer_host_on(arguments["HOSTS"])
            print("success" if result else "failed")

        elif arguments["off"]:
            # rain admin off HOSTS
            print("switch off")
            print((arguments["HOSTS"]))
            result = wrapper.baremetal_computer_host_off(arguments["HOSTS"])
            print("success" if result else "failed")

        elif arguments["delete"] or arguments["rm"]:
            # rain admin [-i] delete HOSTS / rain admin [-i] rm HOSTS
            interactive = arguments["-i"]

            print("delete", interactive)

            # raw_input only exists on Python 2; use input on Python 3.
            try:
                ask = raw_input
            except NameError:
                ask = input

            # NOTE(review): HOSTS is not expanded above, so if it is a plain
            # string this loop visits single characters -- confirm.
            for host in arguments["HOSTS"]:
                if interactive:
                    answer = ask(
                        "Do you want to delete the host %s? (y)es/(n)o: " % host)
                    if answer in ["yes", "y", "Y", "YES"]:
                        print("delete %s" % host)
                    else:
                        print("keeping %s" % host)
            not_implemented()

        elif arguments["list"]:

            if arguments["users"]:
                print("list users")
                flag_merge = arguments["--merge"]
                policys = wrapper.list_all_user_group_hosts(True, flag_merge)
                print_policys(policys, flag_merge)

            elif arguments["projects"]:
                print("list projects")
                flag_merge = arguments["--merge"]
                policys = wrapper.list_all_user_group_hosts(False, flag_merge)
                print_policys(policys, flag_merge)

            elif arguments["roles"]:
                print("list roles")
                not_implemented()

            elif arguments["hosts"]:
                print("list hosts")
                # The --start/--end interval is currently not evaluated for
                # this sub-command (disabled by H. C).
                if arguments["--user"] is not None:
                    policys = wrapper.list_user_hosts(arguments["--user"])
                    print_policys(policys)
                elif arguments["--project"] is not None:
                    policys = wrapper.list_project_hosts(
                        arguments["--project"])
                    print_policys(policys)
                elif arguments["--role"] is not None:
                    not_implemented()
                else:
                    print("all users, projects, roles")
                    not_implemented()

        elif arguments["policy"]:
            print("policy")
            # The --start/--end interval is currently not evaluated for this
            # sub-command (disabled by H. C).
            if arguments["--user"] is not None:
                policy_id = wrapper.add_user_policy(
                    arguments["--user"], arguments["HOSTS"])
                print("success" if policy_id else "failed")
            elif arguments["--project"] is not None:
                policy_id = wrapper.add_project_policy(
                    arguments["--project"], arguments["HOSTS"])
                print("success" if policy_id else "failed")
            elif arguments["--role"] is not None:
                not_implemented()
            else:
                print("all users, projects, roles")
                not_implemented()

        # A second `elif arguments["list"]` used to follow here; it could
        # never run because the `list` branch above already matches.

    elif arguments["status"]:
        print("status")
        if arguments["--short"]:
            status_dict = wrapper.get_status_short(arguments["HOSTS"])
            if status_dict:
                for host in sorted(status_dict.keys()):
                    print("{0:16}\t{1}".format(host, status_dict[host]))
            else:
                print("Empty")
        if arguments["--summary"]:
            status_dict = wrapper.get_status_summary(arguments["HOSTS"])
            if status_dict:
                for deploy_status in ["deployed", "deploying", "failed", "total"]:
                    print("{0:16}\t{1}".format(deploy_status,
                                               status_dict[deploy_status]))

    elif arguments["user"]:
        if arguments["list"]:
            print("user list")

            (time_start, time_end) = parse_time_interval(arguments["--start"],
                                                         arguments["--end"])

            print("From:", time_start)
            print("To  :", time_end)

            not_implemented()

    ###
    # provisioning
    ###
    elif arguments["provision"]:
        if arguments["list"]:
            # List profiles or servers matching the distro / kickstart.
            if arguments["--type"] == "profile":
                print("this will list profiles based on distro or kickstart info")
                profiles = wrapper.list_profile_based_distro_kickstart(
                    arguments["--distro"], arguments["--kickstart"])
                print("matched profiles: {0}".format(profiles))
            else:
                print("this will list servers based on distro or kickstart info")
                servers = wrapper.list_system_based_distro_kickstart(
                    arguments["--distro"], arguments["--kickstart"])
                print("matched servers: {0}".format(servers))

        elif arguments["add"]:
            print("add a new distro or kickstart")
            not_implemented()

        elif arguments["power"]:
            print("power ON/OFF a host...")
            access_hosts = accessible_hosts()
            result = wrapper.power_host(access_hosts, not arguments["--off"])
            hosts = sorted(result.keys())
            power_hosts = [h for h in hosts if result[h]]
            unknown_hosts = [h for h in hosts if not result[h]]
            if unknown_hosts:
                # collect_hostlist is applied exactly once: feeding its
                # string result back in would treat that string as a
                # sequence of one-character host names.
                print("unknown hosts, must deploy first: ",
                      hostlist.collect_hostlist(unknown_hosts))
            if power_hosts:
                print("call [rain provision monitor {0}] to monitor power progress."
                      .format(hostlist.collect_hostlist(power_hosts)))

        elif arguments["monitor"]:
            print("monitor progress of a host...")
            access_hosts = accessible_hosts()
            result = wrapper.monitor_host(access_hosts)
            hosts = sorted(result.keys())
            # Group the hosts by reported status first, then print the groups
            # in a fixed order.
            report_order = ["deploy", "poweron", "poweroff", "failed", "unknown"]
            hosts_by_status = {}
            for status in report_order:
                hosts_by_status[status] = [
                    h for h in hosts if result[h]["status"] == status]
            for status in report_order:
                print_progress(status, hosts_by_status[status], result)

        else:
            # rain provision --profile=PROFILE HOSTS
            # rain provision --distro=DITRO --kickstart=KICKSTART HOSTS
            access_hosts = accessible_hosts()
            if arguments["--profile"]:
                if access_hosts:
                    wrapper.provision_host_with_profile(
                        arguments["--profile"], access_hosts)
                    print("call [rain provision monitor {0}] to monitor deploy progress."
                          .format(hostlist.collect_hostlist(access_hosts)))
            elif arguments["--distro"] and arguments["--kickstart"]:
                if access_hosts:
                    wrapper.provision_host_with_distro_kickstart(
                        arguments["--distro"], arguments["--kickstart"], access_hosts)
                    print("call [rain provision monitor {0}] to monitor deploy progress."
                          .format(hostlist.collect_hostlist(access_hosts)))
Example #25
0
 def collect_eq(self, hostlist, expanded_list):
     """Assert that collecting *expanded_list* yields *hostlist*.

     The expected hostlist deliberately comes first: keeping this argument
     order makes it easier to copy test cases between the expand and the
     collect parts of the test suite.
     """
     collected = collect_hostlist(expanded_list)
     self.assertEqual(hostlist, collected)
Example #26
0
        exclude_states = ['down', 'draining', 'drained']

    # Write the nodes selected for testing to <directory>/node_list.
    node_list_filename = os.path.join(directory, 'node_list')
    logger.info('creating {0}'.format(node_list_filename))

    all_nodes = bench.util.get_nodes()
    node_list = bench.util.filter_node_list(all_nodes,
                                            include_states=include_states,
                                            exclude_states=exclude_states,
                                            **kwargs)
    logger.info('nodes to test: {0}'.format(len(node_list)))

    try:
        bench.util.write_node_list(node_list_filename, sorted(node_list))
    except IOError as ex:
        # A failed write is logged and processing continues.
        logger.error('unable to write {0}'.format(node_list_filename))
        logger.debug(ex, exc_info=True)

    # Everything that was filtered out is recorded as an error node.
    # NOTE(review): the subtraction presumably relies on both values being
    # sets -- confirm against bench.util.
    error_nodes_filename = os.path.join(directory, 'error_nodes')
    error_nodes = all_nodes - node_list
    if error_nodes:
        logger.warning('error nodes: {0} ({1} nodes)'.format(
            hostlist.collect_hostlist(error_nodes),
            len(error_nodes),
        ))
    try:
        bench.util.write_node_list(error_nodes_filename, sorted(error_nodes))
    except IOError as ex:
        logger.error('unable to write {0}'.format(error_nodes_filename))
        logger.debug(ex, exc_info=True)
Example #27
0
    def simple_list(id=None, format="table"):
        """List one cluster, or every cluster, as reported by Comet.

        :param id: name of a single cluster to look up; when ``None`` the
                   ``cluster/`` collection is requested instead
        :param format: ``"rest"`` returns the retrieved data unchanged; any
                       other value is handed to ``Printer.write`` as its
                       ``output`` argument
        :return: the retrieved data for ``"rest"``, otherwise the value
                 produced by ``Printer.write``; an empty string when nothing
                 could be retrieved
        :raises ValueError: if the retrieved data carries an ``error`` entry
        """
        result = ""
        if id is None:
            r = Comet.get(Comet.url("cluster/"))
        else:
            r = Comet.get(Comet.url("cluster/" + id + "/"))
            if r is None:
                Console.error("Could not find cluster `{}`".format(id))
                return result
            # A single cluster is wrapped so it is handled like a collection.
            r = [r]

        if r is None:
            # Nothing retrieved: return the empty default instead of
            # implicitly returning None.
            return result

        if 'error' in r:
            Console.error("An error occurred: {error}".format(**r))
            raise ValueError("COMET Error")
        elif r and 'error' in r[0]:
            # `r and` guards against an empty collection, where r[0] would
            # raise IndexError.
            Console.error("An error occurred: {error}".format(**r[0]))
            raise ValueError("COMET Error")

        if format == "rest":
            return r

        # Flatten every cluster into one row keyed by the cluster name.
        elements = {}
        for cluster in r:
            element = {}
            for attribute in ["project", "name", "description"]:
                element[attribute] = cluster[attribute]
            element["nodes"] = len(cluster["computes"])
            for attribute, value in cluster["frontend"].items():
                element["frontend " + attribute] = value
            names = [compute["name"] for compute in cluster["computes"]]
            # Fold the compute node names into one compact hostlist string.
            element["computes"] = hostlist.collect_hostlist(names)

            elements[cluster["name"]] = element

        result = Printer.write(elements,
                               order=[
                                   "name",
                                   "project",
                                   "nodes",
                                   "computes",
                                   "frontend name",
                                   "frontend state",
                                   "frontend type",
                                   "description",
                               ],
                               header=[
                                   "Name",
                                   "Project",
                                   "Count",
                                   "Nodes",
                                   "Frontend (Fe)",
                                   "State (Fe)",
                                   "Type (Fe)",
                                   "Description",
                               ],
                               output=format)
        return result
Example #28
0
File: grabby.py Project: anl/IT
def main(argv):
   """Report per-node and per-user usage of a SLURM partition.

   Options parsed from *argv* (getopt string ``"np:h"``):
     -h       print the usage text and exit
     -n       notify mode: mail the report instead of printing it, and only
              when the state differs from the one stored in the state file
     -p NAME  partition to inspect (default ``pubint``)

   Exits with status 2 when the options cannot be parsed.
   """
   # Default slurm configuration
   slurm_conf = "/etc/slurm/slurm.conf"
   int_part_name = "pubint"

   state_file = "/tmp/grabby.dat"
   gstate = ["red", "yellow", "green"]

   slurm_node_dict = {}
   node_dict = {}
   user_dict = {}

   try:
      opts, _ = getopt.getopt(argv, "np:h")
   except getopt.GetoptError:
      usage()
      # A usage error must not look like success to the caller.
      sys.exit(2)

   notify = False

   for opt, arg in opts:
      # Compare with ==: `opt in ("-h")` is a substring test against the
      # string "-h", not membership in a tuple.
      if opt == "-h":
         usage()
         sys.exit()
      elif opt == "-n":
         notify = True
      elif opt == "-p":
         int_part_name = arg

   int_part_list = parse_slurm_conf(slurm_conf, int_part_name, slurm_node_dict)
   if len(int_part_list) == 0:
      # Single-argument print() behaves the same on Python 2 and Python 3.
      print("Error: no members found in partition %s" % int_part_name)
      return

   # Each node entry is [cores, load, cores].
   # NOTE(review): index 0 is later reported as "Free Cores", so
   # parse_squeue presumably adjusts it -- confirm.
   for node in int_part_list:
      cores = slurm_node_dict[node]
      node_dict[node] = [cores, get_node_load(node), cores]

   parse_squeue(node_dict, user_dict, int_part_name)
   free_nodes = count_free_nodes(node_dict)

   if notify:
      # Only report when the state changed since the previous run.
      last_state = read_state(state_file)
      new_state = nodes_to_state(free_nodes)
      if new_state == last_state:
         sys.exit()
      write_state(state_file, new_state)

   lines = ["Node, Free Cores, Load"]
   for name, info in sorted(node_dict.items(),
                            key=lambda item: item[1][0], reverse=True):
      lines.append("%s, %d, %.2f" % (name, info[0], info[1]))

   lines.append("")

   lines.append("User, Cores, Node(s)")
   for name, info in sorted(user_dict.items(),
                            key=lambda item: item[1][0], reverse=True):
      lines.append("%s, %d, %s" % (name, info[0],
                                   hostlist.collect_hostlist(info[1])))

   output = "\n".join(lines) + "\n"

   if notify:
      mailing_list(recipients, "Grabnode State Change: " + gstate[new_state], output)
   else:
      print(output)
Example #29
0
        exclude_states = ['down', 'draining', 'drained']

    # Write the nodes selected for testing to <directory>/node_list.
    node_list_filename = os.path.join(directory, 'node_list')
    logger.info('creating {0}'.format(node_list_filename))

    all_nodes = bench.util.get_nodes()
    node_list = bench.util.filter_node_list(all_nodes,
                                            include_states=include_states,
                                            exclude_states=exclude_states,
                                            **kwargs)
    logger.info('nodes to test: {0}'.format(len(node_list)))

    try:
        bench.util.write_node_list(node_list_filename, sorted(node_list))
    except IOError as ex:
        # A failed write is logged and processing continues.
        logger.error('unable to write {0}'.format(node_list_filename))
        logger.debug(ex, exc_info=True)

    # Everything that was filtered out is recorded as an error node.
    # NOTE(review): the subtraction presumably relies on both values being
    # sets -- confirm against bench.util.
    error_nodes_filename = os.path.join(directory, 'error_nodes')
    error_nodes = all_nodes - node_list
    if error_nodes:
        logger.warning('error nodes: {0} ({1} nodes)'.format(
            hostlist.collect_hostlist(error_nodes),
            len(error_nodes),
        ))
    try:
        bench.util.write_node_list(error_nodes_filename, sorted(error_nodes))
    except IOError as ex:
        logger.error('unable to write {0}'.format(error_nodes_filename))
        logger.debug(ex, exc_info=True)
Example #30
0
async def collect_periodically(conf, result_queue):
    """Poll sensor data once per ``conf['interval']`` seconds, forever.

    Each cycle:

    * If ``conf['active_hosts']`` is non-empty, sensor readings and a
      timestamp are fetched with ``get_sensor_data_dict``; otherwise the
      current ``metricq.Timestamp.now()`` is used and no readings exist.
    * For every host in ``conf['hosts']`` and every metric in
      ``conf['metrics']`` a ``(metric_name, ts, value)`` tuple is put on
      ``result_queue``.  ``value`` defaults to ``NaN``.
    * For a host whose status is ``Status.ACTIVE`` the readings of all
      sensors of the metric are looked up as ``data[sensor][host]``.  If a
      reading is missing, the host is removed from ``conf['active_hosts']``,
      marked ``Status.ERROR``, scheduled for repair, and nothing is queued
      for that metric in this cycle.
    * For a host whose status is ``Status.ERROR`` the host is scheduled for
      repair once ``host_info['next_try']`` has passed.
    * Hosts scheduled for repair are handed to ``try_fix_hosts``.

    Args:
        conf: Mapping with at least the keys ``interval``, ``active_hosts``,
            ``username``, ``password``, ``record_ids``, ``hosts`` and
            ``metrics``; optionally ``plugin``.  It is mutated in place
            (``active_hosts`` and the per-host ``status``).
        result_queue: Object with a ``put`` method that receives the
            ``(metric_name, ts, value)`` tuples.
            NOTE(review): ``put`` is called without ``await`` -- presumably
            this is a thread/process queue rather than ``asyncio.Queue``;
            confirm against the caller.
    """
    deadline = time.time() + conf['interval']
    while True:
        ts = metricq.Timestamp.now()
        # Start every cycle without readings.  Previously ``data`` was only
        # bound inside the ``if`` below, so an ACTIVE host in a cycle without
        # active hosts hit an unbound local (first cycle) or silently reused
        # the readings of an earlier cycle.  With an empty dict the lookup
        # raises KeyError and the host takes the regular error path.
        data = {}
        if conf['active_hosts']:
            ts, data = await get_sensor_data_dict(
                conf['active_hosts'],
                conf['username'],
                conf['password'],
                conf['record_ids'],
            )
        hosts_to_fix = set()

        for host, host_info in conf['hosts'].items():
            for metric_sufix, metric_data in conf['metrics'].items():
                value = NaN
                if host_info['status'] == Status.ACTIVE:
                    sensors = {}
                    for sensor in metric_data['sensors']:
                        try:
                            sensors[sensor] = data[sensor][host]
                        except KeyError:
                            # No reading for this host: take it out of
                            # rotation and schedule a repair attempt.
                            if host in conf['active_hosts']:
                                conf['active_hosts'].remove(host)
                            conf['hosts'][host]['status'] = Status.ERROR
                            hosts_to_fix.add(host)
                            sensors = {}
                            break
                    if not sensors:
                        # Either a reading was missing or the metric has no
                        # sensors; nothing is queued for it this cycle.
                        continue
                    if 'plugin' in conf:
                        try:
                            value = conf['plugin'].create_metric_value(sensors)
                        except Exception as e:
                            # A failing plugin must not stop collection;
                            # the metric is reported as NaN instead.
                            logger.error(
                                'Error in plugin, Exception: {0}'.format(e, ))
                    else:
                        value = sensors[metric_data['sensors'][0]]['value']

                elif host_info['status'] == Status.ERROR:
                    if (host not in hosts_to_fix
                            and time.time() > host_info['next_try']):
                        hosts_to_fix.add(host)

                metric_name = '{}.{}'.format(
                    host_info['host_name'],
                    metric_sufix,
                )
                result_queue.put((metric_name, ts, value))

        if hosts_to_fix:
            await try_fix_hosts(
                conf,
                hosts_to_fix,
            )

        # Skip every interval that has already elapsed, warning once per
        # missed one, so the next sleep ends on a future deadline.
        while deadline <= time.time():
            logger.warning(
                'missed deadline in {}'.format(
                    hostlist.collect_hostlist(conf['hosts'].keys()), ), )
            deadline += conf['interval']
        sleep_var = deadline - time.time()
        await asyncio.sleep(sleep_var)
        deadline += conf['interval']
Example #31
0
    k = random.choice(list(ci_nnodes.keys()))
    while ci_nnodes[k][1] == 0:
        k = random.choice(list(ci_nnodes.keys()))
    ci_nnodes[k][1] += rest_mc
print(ci_nnodes)

# Hand out consecutive, non-overlapping slices of the expanded GPU and MC
# host sequences to every key of ci_nnodes, in iteration order.
start_gpu = 0
start_mc = 0
ci_nodelist = {}
for k, (n_gpu, n_mc) in ci_nnodes.items():
    nodelist_gpu = []
    nodelist_mc = []
    if n_gpu > 0:
        nodelist_gpu = exp_gpu[start_gpu:start_gpu + n_gpu]
        start_gpu += n_gpu
    if n_mc > 0:
        nodelist_mc = exp_mc[start_mc:start_mc + n_mc]
        start_mc += n_mc
    ci_nodelist[k] = [nodelist_gpu, nodelist_mc]

# Write one file per key holding the collapsed hostlist expression of its
# GPU hosts followed by its MC hosts.
for k, (nodelist_gpu, nodelist_mc) in ci_nodelist.items():
    final_nodelist = collect_hostlist(nodelist_gpu + nodelist_mc)
    file_name = 'ci_' + k + '_node_resume.lst'
    # The context manager closes the file even if the write fails.
    with open(file_name, 'w') as f:
        f.write(final_nodelist)
Example #32
0
def main(argv):
    """Report free cores, load and users of one Slurm partition.

    Recognised options in ``argv`` (parsed with ``getopt``):

    * ``-h``       print usage and exit.
    * ``-n``       notify mode: compare the current state with the one
      stored in the state file, exit silently if it is unchanged, otherwise
      store the new state and pass the report to ``mailing_list`` instead of
      printing it.
    * ``-p NAME``  partition to inspect (default ``pubint``).

    The report has two comma-separated tables: nodes sorted by the first
    element of their ``node_dict`` entry (descending) and users sorted by
    the first element of their ``user_dict`` entry (descending).

    Args:
        argv: Command-line arguments without the program name.

    Raises:
        SystemExit: After printing usage, or in notify mode when the state
            did not change.
    """
    # Default slurm configuration
    slurm_conf = "/etc/slurm/slurm.conf"
    int_part_name = "pubint"

    state_file = "/tmp/grabby.dat"
    gstate = ["red", "yellow", "green"]

    slurm_node_dict = {}
    node_dict = {}
    user_dict = {}

    try:
        opts, _ = getopt.getopt(argv, "np:h")
    except getopt.GetoptError:
        usage()
        sys.exit()

    notify = 0

    for opt, arg in opts:
        # Compare with ``==``: ``opt in ("-h")`` is a substring test against
        # a plain string, not membership in a one-element tuple.
        if opt == "-h":
            usage()
            sys.exit()
        elif opt == "-n":
            notify = 1
        elif opt == "-p":
            int_part_name = arg

    int_part_list = parse_slurm_conf(slurm_conf, int_part_name,
                                     slurm_node_dict)
    if not int_part_list:
        # Single pre-formatted argument so the call prints the same text
        # under both Python 2 and Python 3.
        print("Error: no members found in partition %s" % int_part_name)
        return

    # Seed every node with [cores, load, cores]; presumably the first slot
    # is turned into the free-core count by parse_squeue -- TODO confirm.
    for node in int_part_list:
        cores = slurm_node_dict[node]
        node_dict[node] = [cores, get_node_load(node), cores]

    parse_squeue(node_dict, user_dict, int_part_name)
    free_nodes = count_free_nodes(node_dict)

    if notify == 1:
        last_state = read_state(state_file)
        new_state = nodes_to_state(free_nodes)
        if new_state == last_state:
            # Nothing changed since the last run: stay quiet.
            sys.exit()
        write_state(state_file, new_state)

    report = ["Node, Free Cores, Load\n"]
    by_first_field = lambda item: item[1][0]
    for name, stats in sorted(node_dict.items(), key=by_first_field,
                              reverse=True):
        report.append("%s, %d, %.2f\n" % (name, stats[0], stats[1]))

    report.append("\n")

    report.append("User, Cores, Node(s)\n")
    for name, usage_info in sorted(user_dict.items(), key=by_first_field,
                                   reverse=True):
        report.append("%s, %d, %s\n" % (
            name, usage_info[0], hostlist.collect_hostlist(usage_info[1])))

    output = "".join(report)

    if notify == 1:
        # new_state indexes gstate, so it is bound whenever this runs.
        mailing_list(recipients,
                     "Grabnode State Change: " + gstate[new_state], output)
    else:
        print(output)