Example #1
def parse_slurm_conf(slurm_conf_file, part):
    cluster = []

    r_part = re.compile('PartitionName=(\S+).*?Nodes=(\S+)')
    r_node = re.compile("(?<=NodeName=)\S+")

    try:
        for line in open(slurm_conf_file, 'r'):
            if line[0] != '#':
                m = r_node.search(line)
                if m:
                    exp_nodelist = hostlist.expand_hostlist(m.group(0))
                    if not part:
                        cluster.extend(exp_nodelist)
                elif part:
                    m = r_part.search(line)
                    if m and m.group(1) in part:
                        cluster.extend(hostlist.expand_hostlist(m.group(2)))
                        if len(part) == 1:
                            break

    except IOError:
        print "Error: cannot open Slurm conf file at", slurm_conf_file

    return cluster
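Note: expand_hostlist comes from the python-hostlist package and turns a compact
hostlist expression into an explicit list of names. A minimal demonstration
(illustrative, not part of the example above):

import hostlist

# bracketed numeric ranges are expanded in order;
# comma-separated parts are concatenated
print(hostlist.expand_hostlist("n[1-3],gpu01"))
# -> ['n1', 'n2', 'n3', 'gpu01']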
Example #2
File: hitparade.py Project: anl/IT
def parse_slurm_conf(slurm_conf_file,part,node_dict):
   cluster=[]

   r_part=re.compile('PartitionName=(\S+).*?Nodes=(\S+)')
   r_node=re.compile("(?<=NodeName=)\S+")

   try:
      for line in open(slurm_conf_file,'r'):
         if line[0]!='#':
            m=r_node.search(line)
            if m:
               exp_nodelist=hostlist.expand_hostlist(m.group(0))
               parse_slurm_conf_cores(line,exp_nodelist,node_dict)
               if not part:
                  cluster.extend(exp_nodelist)
            elif part:
               m=r_part.search(line)
               if m and m.group(1) in part:
                  cluster.extend(hostlist.expand_hostlist(m.group(2)))
                  if len(part)==1:
                     break

   except IOError:
      print "Error: cannot open Slurm conf file at",slurm_conf_file

   return cluster
Example #3
    def create_cluster(self, name, names, ips, management):
        '''
        creates a cluster with the given name, server name specification, ip specifications, and the identification of the management nodes.

        :param name: name of the cluster
        :param names: the names of the cluster servers. 'i[001-003]' creates the objects with names i001, i002, i003
        :param ips: the ips for the servers. 'i[001-003].futuregrid.org' creates the ips for the previously defined names
        :param management: the names of the management nodes. 'i[001-002]' sets the nodes i001 and i002 to management nodes. The rest will be set to compute nodes automatically.
        '''

        name_list = expand_hostlist(names)
        ip_list = expand_hostlist(ips)
        management_list = expand_hostlist(management)
        server_list = zip(name_list, ip_list)
        servers = []
        for (server_name, server_ip) in server_list:
            server = FabricServer(name=server_name, cluster=name, ip=server_ip)
            if server_name in management_list:
                server.tags = ["manage"]
            else:
                server.tags = ["compute"]
            self.stamp()
            server.save(cascade=True)
            servers.append(server)
        cluster = FabricCluster(name=name, cluster=name, definition=names)
        cluster.servers = servers
        self.stamp()
        cluster.save(cascade=True)

Example #4
def expand_names(name_spec):
    if type(name_spec) != str and isinstance(name_spec, collections.Sequence):
        names = []
        for name in name_spec:
            names += hostlist.expand_hostlist(name)
    else:
        names = hostlist.expand_hostlist(name_spec)
    return names
Example #5
 def validate(self):
     cluster = inventory.get("cluster", self.cluster.data)
     possibilities = expand_hostlist(cluster.definition)
     choice = expand_hostlist(self.nodespec.data)
     if choice == []:
         ok = False
     else:
         ok = set(choice).issubset(possibilities)
     print "Validate", ok, choice
     return ok
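Note: expand_hostlist raises hostlist.BadHostlist on malformed expressions
(Example #60 below catches it explicitly), so a validate() like the one above
would propagate that exception on bad user input. A defensive sketch:

import hostlist

def safe_expand(spec):
    # return [] instead of raising on malformed specs such as 'i[1-'
    try:
        return hostlist.expand_hostlist(spec)
    except hostlist.BadHostlist:
        return []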
Example #7
def returnjobinfo(jobid):
    global GANGLIA, GANGLIA_PROC
    t0 = time.time()
    # print jobid
    s = hide_usernames(
        os.popen("scontrol show -d --oneliner job " + str(jobid)).read()).split()
    print("slurm response took ", time.time() - t0)
    if s:
        j = dict()
        for x in s:
            y = x.split('=', 1)
            if len(y) == 2:
                j[y[0]] = y[1]

        if not j['JobState'] == 'PENDING':
            cpu_mapping = list()
            h = ['Nodes', 'CPU_IDs', 'Mem']
            nodelist = ""
            for i, n in enumerate(s):
                if n.startswith("Nodes="):
                    cpu_mapping.append([s[i].replace('Nodes=', ''),
                                        s[i + 1].replace('CPU_IDs=', ''),
                                        s[i + 2].replace('Mem=', '')])
                if n.startswith("NodeList="):
                    nodelist = n.replace("NodeList=", "")
            j['cpu_mapping'] = {'headers': h, 'nodes': cpu_mapping}
            j['expanded_nodelist'] = list(
                map(str.strip, expand_hostlist(nodelist)))
            if 'NodeList' in j:
                j['nodeinfo'] = returnnodeinfo(j['NodeList'])['nodeinfo']
            elif 'Nodes' in j:
                j['nodeinfo'] = returnnodeinfo(j['Nodes'])['nodeinfo']
            if GANGLIA == 1 and GANGLIA_PROC == 1:
                t_procs0 = time.time()
                j['procs'] = get_procs(j['expanded_nodelist'])
                print("get procs took", time.time() - t_procs0)
    else:
        # not an active job, fetch finished job stats
        # remark, these stats have a different format; leave it up to the
        # client side to fix it.
        yesterday = (dt.datetime.today() -
                     dt.timedelta(1)).strftime("%Y-%m-%d")
        sacct = "sacct -X --format=jobid,jobname,user,account,state,elapsed,submit,start,end,nnodes,ncpus,reqnodes,reqcpus,nodelist --parsable2 -S %s --job %s"
        s = hide_usernames(run_slurmcommand(sacct % (yesterday, jobid)))
        t = io.StringIO(s)
        reader = csv.reader(t, delimiter='|')
        headers = list(map(convert, next(reader)))
        jobinfo = list(map(convert, next(reader)))
        j = dict(list(zip(headers, jobinfo)))
        j['expanded_nodelist'] = list(
            map(str.strip, expand_hostlist(j["NodeList"])))
    j['GANGLIA'] = GANGLIA
    # print j
    print("jobinfo", time.time() - t0)
    return j
Example #8
def main(argv):
    global cstring

    # Initialize empty cluster list
    cluster_list = []

    # Default slurm configuration
    slurm_conf = "/etc/slurm/slurm.conf"

    # Default load threshold to report
    load_thr = 8.0

    guilty = False
    reverse = False

    # Default amount of parallelism
    workers = 8

    try:
        opts, args = getopt.getopt(argv, "n:l:c:s:p:Lgh")
    except getopt.GetoptError:
        usage()
        sys.exit()

    for opt, arg in opts:
        if opt in ("-h"):
            usage()
            sys.exit()
        elif opt in ("-n"):  # override default cluster file
            cluster_list = hostlist.expand_hostlist(arg)
        elif opt in ("-s"):  # override default Slurm config file name
            slurm_conf = arg
        elif opt in ("-l"):  # override default load level
            load_thr = float(arg)
            if load_thr < 0:
                print "Error: invalid load level - " + arg + "!"
                sys.exit()
        elif opt in ("-c"):  # override default community string
            cstring = arg
        elif opt in ("-g"):  # print top user on each overused system
            guilty = True
        elif opt in ("-L"):  # invert load comparison
            reverse = True
        elif opt in ("-p"):  # invert load comparison
            workers = int(arg)

    if not cluster_list:
        cluster_list = parse_slurm_conf(slurm_conf)

    if not cluster_list:
        print "Error: no cluster nodes specified!"
        sys.exit()

    pool = Pool(workers)
    results = pool.map(query_node, cluster_list)
    pool.close()

    # sort results by load value and pass to report printer
    print_report(sorted(results, key=itemgetter(1), reverse=True),
                 reduce(max_name, results, 0), load_thr, guilty, reverse)
Example #9
File: pbs.py Project: mlunacek/wire
def listPBSnodes(hostExpression):
    """ Create a list of nodes that are free to run jobs

    Example: freePBSnodes('node01[05-11],node02[01-80]')
    """

    nodeNames = expand_hostlist(hostExpression)
    nodeString = "".join(["%s " % n for n in nodeNames])
    pbsOut = check_output(["pbsnodes -x %s" % nodeString], shell=True)
    nodes = xml2obj(pbsOut)

    freenodelist = []

    # intended criteria for a free node (note: the loop below does not
    # enforce them; every queried node is appended):
    #   state = free
    #   no jobs are running
    #   no message
    for node in nodes["Node"]:
        status = {}
        messages = []
        jobs = []
        name = node["name"]
        state = node["state"]
        freenodelist.append(name)

    return freenodelist
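Note: a sketch of the filter the comments describe, assuming the xml2obj node
dicts expose "state", "jobs", and "message" keys (an assumption, not confirmed
by the snippet itself):

for node in nodes["Node"]:
    if (node["state"] == "free"
            and not node.get("jobs")
            and not node.get("message")):
        freenodelist.append(node["name"])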
Example #10
 def setup_servers(self, protocol):
     if self.num_nodes > 1:
         from hostlist import expand_hostlist
         
         task_index = int(os.environ['SLURM_PROCID'])
         hostlist = expand_hostlist(os.environ['SLURM_NODELIST'])
         hostlist_w_port = [("%s:2222" % host) for host in hostlist] 
 
         cluster = tf.train.ClusterSpec({"localhost":hostlist_w_port}).as_cluster_def()
         protocol = f'grpc+{protocol}' if protocol else 'grpc'
         server = tf.distribute.Server(cluster, job_name="localhost",
                 task_index=task_index, protocol=protocol)
         session_target = server.target
 
         if task_index != 0:
             utils.join_tasks(task_index, hostlist)
             quit()
 
     else:
         task_index = 0
         hostlist = ['localhost']
         session_target = ''
 
     for arg, val in vars(self.args).items():
         print(f'{arg} : {val}')
     self.session_target = session_target
     self.task_index = task_index
     self.hostlist = hostlist
Example #11
    def check_nodes_computesets(clusterid, computenodeids):
        hosts_param = hostlist.expand_hostlist(computenodeids)
        hosts_param_set = set(hosts_param)
        nodes_free = True
        nodes_allocated = False
        nodes_checked = False
        # computesetid = -1
        computesets = Comet.get_computeset()
        # get all active computeset and put nodes into a set
        allhosts = set()
        for computeset in computesets:
            if computeset["cluster"] == clusterid \
                    and (computeset["state"] in Cluster.ACTIVE_COMPUTESETS):
                computesetid = computeset["id"]
                # print (computesetid)
                for compute in computeset["computes"]:
                    allhosts.add(compute["name"])

        # all nodes allocated
        if hosts_param_set <= allhosts:
            nodes_allocated = True
            nodes_free = False
            nodes_checked = True
        # at least one specified host not in any Active computeset
        else:
            for host in hosts_param:
                # some specified nodes are in Active computeset
                if host in allhosts:
                    nodes_free = False
                    nodes_checked = True
                    break

        # print ("nodes_checked: %s" % nodes_checked)
        # print ("nodes_free: %s" % nodes_free)
        return [nodes_free, nodes_allocated]
Example #12
    def generate(self, nodes, prefix, topology=None, test_name=None):

        if topology:
            self.logger.info('node: ignoring topology (not used)')

        node_set = set(hostlist.expand_hostlist(bnc.config['nodes']))
        node_set &= set(nodes) #Don't include error/excluded nodes

        for node in node_set:
            test_dir = os.path.join(prefix, "tests", node)
            bench.util.mkdir_p(test_dir)

            script_file = os.path.join(test_dir, '{0}.job'.format(node))
            with open(script_file, 'w') as fp:
                fp.write(self.TEMPLATE.render(
                    job_name = 'bench-node-{0}'.format(node),
                    modules = " ".join(bnc.config['modules']),
                    node_name = node,
                    linpack_path = bnc.config['linpack_path'],
                    stream_path = bnc.config['stream_path'],
                ))

            node_list_file = os.path.join(test_dir, 'node_list')
            bench.util.write_node_list(node_list_file, [node])
        self.logger.info('node: add: {0}'.format(len(nodes)))
Example #13
def get_partition_nodes(partition):
    p = scontrol('show partition {0}'.format(partition))
    for line in p.stdout:
        for match in NODES_P.finditer(line):
            for node in hostlist.expand_hostlist(match.group(1)):
                yield node
    p.wait()
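Note: NODES_P is defined elsewhere in this project. Since "scontrol show
partition" output contains a "Nodes=<hostlist>" field, a plausible definition
(an assumption, not the project's actual pattern) would be:

import re

# captures the value of the "Nodes=" field, e.g. "Nodes=node[01-64]"
NODES_P = re.compile(r'Nodes=(\S+)')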
Example #15
    def add(self, **kwargs):

        if "host" not in kwargs:
            Console.error("no id specified")
            sys.exit(1)

        hosts = hostlist.expand_hostlist(kwargs['host'])
        if 'ip' in kwargs:
            ips = Parameter.expand(kwargs['ip'])
        else:
            ips = [None for i in hosts]
        if ips is None:
            ips = [None for i in hosts]

        for host, ip in zip(hosts, ips):
            if host in self.data:
                entry = self.data[host]
            else:
                entry = dict(self.entry)
                self.data[host] = entry
            for key, value in kwargs.items():
                entry[key] = value
            entry['ip'] = ip
            entry['host'] = host
            for attribute in entry:
                self.data[host][attribute] = entry[attribute]
Example #16
 def set(self, spec, attribute):
     """add an attribute for the specified hosts in the format
     i[1-20]. which would set the attribute for all hosts in i1 to
     i20"""
     hosts = expand_hostlist(spec)
     for host in hosts:
         self._set(host, attribute)
Example #18
    def acct_reader(self, filename):
        ftr = [3600, 60, 1]  # H:M:S to seconds conversion factors
        acct = []
        with open(filename) as fd:
            for job in csv.DictReader(fd, delimiter = '|'):
                if self.jobids and job['JobID'] not in self.jobids: continue
                if job['NodeList'] == "None assigned": continue

                jent = {}
                jent['id']         = job['JobID']
                jent['user']       = job['User']
                jent['project']    = job['Account']
                jent['start_time'] = int(parse(job['Start']).strftime('%s'))
                jent['end_time']   = int(parse(job['End']).strftime('%s'))
                jent['queue_time'] = int(parse(job['Submit']).strftime('%s'))
                jent['queue']      = job['Partition']
                jent['name']       = job['JobName']
                jent['status']     = job['State'].split()[0]
                jent['nodes']      = int(job['NNodes'])
                jent['cores']      = int(job['ReqCPUS'])
                jent['host_list']  = hostlist.expand_hostlist(job['NodeList'])

                if '-' in job['Timelimit']:
                    days, time = job['Timelimit'].split('-')
                else:
                    time = job['Timelimit']
                    days = 0
                jent['requested_time'] = (int(days) * 86400 + 
                                          sum([a*b for a,b in zip(ftr, [int(i) for i in time.split(":")])]))/60
                acct += [jent]
            return acct
Example #19
    def test_alltoall_switch_tests(self, _):
        self.nodes = NODES
        bench.util.write_node_list(os.path.join(self.directory, 'node_list'),
                                   self.nodes)
        bench.add.execute(self.directory,
                          TOPOLOGY_FILE,
                          alltoall_switch_tests=True)

        prefix = os.path.join(self.directory, 'alltoall-switch', 'tests')
        switches = set((
            'hpcf-ib-rack1-u43',
            'hpcf-ib-rack1-u45',
            'hpcf-ib-rack1-u42',
            'hpcf-ib-rack1-u44',
            'hpcf-ib-rack1-u46',
        ))
        self.assertEqual(
            set(os.listdir(prefix)),
            switches,
        )

        nodes = set()
        for switch in switches:
            script = os.path.join(prefix, switch, '{0}.job'.format(switch))
            with open(script) as fp:
                match = NODELIST_P.search(fp.read())
            nodes |= set(hostlist.expand_hostlist(match.group(3)))
        self.assertEqual(nodes, self.nodes)
Example #20
    def _detect_nodes(self):
        # see if we have a PBS_NODEFILE
        pbs_nodefile = os.environ.get('PBS_NODEFILE')
        slurm_nodelist = os.environ.get('SLURM_NODELIST')

        if pbs_nodefile is not None:
            # parse the PBS nodefile
            self._raw_nodes = [line.strip() for line in open(pbs_nodefile)]
            if self.log is not None:
                self.log.info(
                    message="Found PBS_NODEFILE %s: %s" % (pbs_nodefile, self._raw_nodes),
                    suffix=LOG_SUFFIX)

        elif slurm_nodelist is not None:
            # parse SLURM nodefile
            self._raw_nodes = hostlist.expand_hostlist(slurm_nodelist)
            if self.log is not None:
                self.log.info(
                    message="Found SLURM_NODELIST %s. Expanded to: %s" % (slurm_nodelist, self._raw_nodes),
                    suffix=LOG_SUFFIX)

        else:
            self._raw_nodes = ['localhost']

            if self.log is not None:
                self.log.info(
                    message="No PBS_NODEFILE or SLURM_NODELIST found. Using hosts: %s" % (self._raw_nodes),
                    suffix=LOG_SUFFIX)
Example #21
    def test_exclude_file(self, _):
        fail_nodes = os.path.join(self.directory, 'fail_nodes')
        with open(fail_nodes, 'w') as fp:
            for node in ['tnode0101', 'tnode0102', 'tnode0208', 'nonode0101']:
                fp.write('{0}\n'.format(node))

        bench.add.execute(
            self.directory,
            topology_file=TOPOLOGY_FILE,
            node_tests=True,
            exclude_files=[fail_nodes],
        )

        tests_dir = os.path.join(self.directory, 'node', 'tests')
        expected_tests = set(self.nodes) - set(
            ['tnode0101', 'tnode0102', 'tnode0208', 'nonode0101'])
        self.assertEqual(
            set(os.listdir(tests_dir)),
            expected_tests,
        )
        for node in expected_tests:
            script = os.path.join(tests_dir, node, '{0}.job'.format(node))
            self.assertNotEqual(os.stat(script).st_size, 0)
            with open(script) as fp:
                match = NODELIST_P.search(fp.read())
            self.assertEqual(
                set(hostlist.expand_hostlist(match.group(3))),
                set((node, )),
            )
Example #24
 def read_data_from_yaml(self):
     """
     read mac address and bmc configuration information from **mac.yaml** file.
     """
     data = read_yaml_config(self.yaml_file)
     result = None
     if data:
         result = {}
         data = data["inventory"]
         for cluster in data:
             cluster_data = data[cluster]
             if "bmc" in cluster_data and "common" in cluster_data["bmc"]:
                 # process the common bmc data in cluster
                 common_bmc_data = cluster_data["bmc"]["common"]
                 host_range = common_bmc_data.pop("range", None)
                 hosts = expand_hostlist(host_range)
             mac_data = cluster_data["macaddr"]
             for host in mac_data:
                 if host in hosts:
                     temp_common_bmc_data = deepcopy(common_bmc_data)
                     if "bmc" in mac_data[host]:
                         # bmc config for an individual host has higher
                         # priority than the common config
                         temp_common_bmc_data.update(mac_data[host]["bmc"])
                     mac_data[host]["bmc"] = temp_common_bmc_data
             result[cluster] = mac_data
     return result
Example #25
def dict_key_list_table_printer(d, indexed=False):
    '''
    accept a dict in the form:
    {key1: [list1],
     key2: [list2],
     ...}
    and render it as a table with one column per key:
    | key1 | key2 |
    (each value list is laid out vertically in its column)
    '''
    x = PrettyTable()
    temp = d.values()
    l = 0
    for item in temp:
        l0 = len(item)
        if l0 > l:
            l = l0

    if indexed:
        if l == 0:
            index_list = []
        else:
            index_list = hostlist.expand_hostlist("[1-{0}]".format(str(l)))
        x.add_column("index", index_list)

    for k, v in d.iteritems():
        v0 = v + [" "] * (l - len(v))
        x.add_column(k, v0)
    x.align = "l"
    return x
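Note: the "[1-{0}]" trick above leans on expand_hostlist accepting purely
numeric "hostnames", which makes it a quick way to build a list of stringified
indices:

import hostlist

print(hostlist.expand_hostlist("[1-4]"))
# -> ['1', '2', '3', '4']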
Example #26
def display_provision_form():

    clusters = cm_config_server().get("cloudmesh.server.provisioner.clusters")

    # clusters = ['india','bravo','sierra']

    # servers = n_inventory.hostlist(cluster)
    # server = n_inventory.host(name,auth=False)

    form = ProvisionForm(csrf=False)

    if form.validate_on_submit():
        flash("Success")
        print "FORM"
        pprint(form.__dict__)
        print "CLUSTER", form.cluster.data
        print "Service", form.service.data
        hosts = expand_hostlist(form.nodespec.data)
        print "Nodespec", hosts

        for host in hosts:
            print "PROVISION HOST", host
            provision.delay(host, form.service.data)

        return redirect("provision/tasks/{0}/{1}/{2}".format(
            form.cluster.data, form.nodespec.data, form.service.data))
        # return redirect("/provision/summary/")

    else:
        flash("Wrong submission")
    inventory.refresh()
    return render_template("mesh/provision/provision.html",
                           clusters=clusters,
                           form=form)
Example #27
def init_process(backend="nccl"):
    print(f"Starting process with rank {ptu.dist_rank}...", flush=True)

    if "SLURM_STEPS_GPUS" in os.environ:
        gpu_ids = os.environ["SLURM_STEP_GPUS"].split(",")
        os.environ["MASTER_PORT"] = str(12345 + int(min(gpu_ids)))
    else:
        os.environ["MASTER_PORT"] = str(12345)

    if "SLURM_JOB_NODELIST" in os.environ:
        hostnames = hostlist.expand_hostlist(os.environ["SLURM_JOB_NODELIST"])
        os.environ["MASTER_ADDR"] = hostnames[0]
    else:
        os.environ["MASTER_ADDR"] = "127.0.0.1"

    dist.init_process_group(
        backend,
        rank=ptu.dist_rank,
        world_size=ptu.world_size,
    )
    print(f"Process {ptu.dist_rank} is connected.", flush=True)
    dist.barrier()

    silence_print(ptu.dist_rank == 0)
    if ptu.dist_rank == 0:
        print(f"All processes are connected.", flush=True)
Example #28
	def GET(self, id, metric):
		res = db.select(settings["SlurmClusterName"]+"_job_table",
				what="nodelist, time_start, time_end",
				where="id_job = $id_job",
				vars={ "id_job" : id })
		job = res[0]
		url = "%s/render" % ( settings["GraphiteURL"] )
		target = "%s{%s}.%s" % (	settings["GraphitePrefix"],
						",".join(hostlist.expand_hostlist(job["nodelist"])),
						metric
					)
		if 'infiniband' not in metric:
			if 'cpu-' in metric or 'swap_io' in metric or metric.endswith('tx') or metric.endswith('rx') or metric.startswith('llite-') or metric.endswith('read') or metric.endswith('write'):
				target = "scaleToSeconds(derivative(%s),1)" % target
		data = urllib.urlencode({
					"target": target,
					"from": datetime.datetime.fromtimestamp(job["time_start"]).strftime("%H:%M_%Y%m%d"),
					"until": datetime.datetime.fromtimestamp(job["time_end"]).strftime("%H:%M_%Y%m%d"),
					"format": "json",
					"maxDataPoints": 200})
		req = urllib2.Request(url, data)
		ret = {}
		returned = urllib2.urlopen(req).read()
		graphiteData = json.loads(returned)
		for series in graphiteData:
			node = series["target"].split(".")[len(settings["GraphitePrefix"].split("."))-1]
			ret[node] = [[i[1] * 1000, i[0]] for i in series["datapoints"]]
		return json.dumps(ret)
Example #29
 def create(self, kind, subkind, nameregex):
     #"india[9-11].futuregrid.org,india[01-02].futuregrid.org"
     names = expand_hostlist(nameregex)
     for name in names:
         if kind == "server":
             object = FabricServer(
                 name=name,
                 kind=kind,
                 subkind=subkind,
             )
         elif kind == "service":
             object = FabricService(
                 name=name,
                 kind=kind,
                 subkind=subkind)
             log.info("creating {0} {1} {2}".format(name, kind, subkind))
         elif kind == "cluster":
             object = FabricCluster(
                 name=name,
                 kind=kind,
                 subkind=subkind)
             log.info("creating {0} {1} {2}".format(name, kind, subkind))
         else:
             log.error(
                 "kind '{0}' is not defined; creation of objects for '{1}' failed".format(kind, nameregex))
             return
         object.save(cascade=True)
Example #30
 def expand(cls, parameter, allow_duplicates=False, sort=False):
     if parameter is None:
         return parameter
     else:
         return expand_hostlist(parameter,
                                allow_duplicates=allow_duplicates,
                                sort=sort)
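Note: a usage sketch of this classmethod (assuming it is exposed as
Parameter.expand, as Example #15 suggests; expand_hostlist de-duplicates by
default):

Parameter.expand("a[1-2],a2", allow_duplicates=True)  # ['a1', 'a2', 'a2']
Parameter.expand("a[1-2],a2")                         # ['a1', 'a2']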
Example #32
    def acct_reader(self, filename):
        ftr = [3600, 60, 1]  # H:M:S to seconds conversion factors
        acct = []
        with open(filename) as fd:
            for job in csv.DictReader(fd, delimiter = '|'):
                if self.jobids and job['JobID'] not in self.jobids: continue
                if job['NodeList'] == "None assigned": continue
                if len(job) != 13: 
                    print(job['JobID'] + " is not parsed correctly")
                    continue
                jent = {}
                jent['id']         = job['JobID']
                jent['user']       = job['User']
                jent['project']    = job['Account']
                jent['start_time'] = int(parse(job['Start']).strftime('%s'))
                jent['end_time']   = int(parse(job['End']).strftime('%s'))
                jent['queue_time'] = int(parse(job['Submit']).strftime('%s'))
                jent['queue']      = job['Partition']
                jent['name']       = job['JobName']
                jent['status']     = job['State'].split()[0]
                jent['nodes']      = int(job['NNodes'])
                jent['cores']      = int(job['ReqCPUS'])
                jent['host_list']  = hostlist.expand_hostlist(job['NodeList'])

                if '-' in job['Timelimit']:
                    days, time = job['Timelimit'].split('-')
                else:
                    time = job['Timelimit']
                    days = 0
                jent['requested_time'] = (int(days) * 86400 + 
                                          sum([a*b for a,b in zip(ftr, [int(i) for i in time.split(":")])]))/60
                acct += [jent]
            return acct
Example #33
def main(argv):
    # Default slurm configuration
    slurm_conf = "/etc/slurm/slurm.conf"

    argc = len(argv)
    if argc < 2 or argc > 3:
        usage()
    else:
        if argc == 2:
            nodes = parse_slurm_conf(slurm_conf)
        else:
            nodes = hostlist.expand_hostlist(argv[0])

        if len(nodes) > 0:
            for node in nodes:
                cmdline = "scp " + argv[argc -
                                        2] + " " + node + ":" + argv[argc - 1]
                print "Running", cmdline
                results = commands.getstatusoutput(cmdline)
                if results[0] == 0:
                    print "OK", results[1]
                else:
                    print "Error:", node + ")", results[1]
        else:
            print "Error: empty or invalid list of cluster nodes!"
Example #34
def main():
    global logger, ARGS

    signal.signal(signal.SIGINT, sig_handler)

    ARGS = parse_args()

    if ARGS.verbose:
        setup_logging("debug")
    else:
        setup_logging("info")

    logger.debug(ARGS)

    G.config = ConfigParser.SafeConfigParser()
    try:
        G.config.read(ARGS.cfgfile)
        G.hosts = hostlist.expand_hostlist(G.config.get("global", "pub_hosts"))
        G.url = G.config.get("DB", "url")
        G.port = G.config.get("global", "pub_port")
    except Exception:
        logger.error("Can't read configuration file")
        sys.exit(1)

    ARGS.func()
Example #36
 def create(self, kind, namespec):
     '''
     creates fabric objects of the specified kind and matching the name specification
     :param kind: the kind; see FABRIC_TYPES
     :param namespec: the specification for a name list. 'i[001-003]' creates the objects with names i001, i002, i003
     '''
     elements = []
     names = expand_hostlist(namespec)
     for name in names:
         if kind == "server":
             element = FabricServer(name=name, kind=kind)
         elif kind == "service":
             element = FabricService(name=name, kind=kind)
             log.info("creating {0} {1}".format(name, kind))
         elif kind == "cluster":
             element = FabricCluster(name=name, kind=kind)
             log.info("creating {0} {1}".format(name, kind))
         elif kind == "image":
             element = FabricImage(name=name, kind=kind)
             log.info("creating {0} {1}".format(name, kind))
         else:
             log.error(
                 "kind '{0}' is not defined; creation of objects for '{1}' failed".format(kind, namespec)
             )
             return
         self.stamp()
         element.save(cascade=True)
         elements.append(element)
     return elements
Example #37
    def test_submit_jobs_3(self, arg1):
        '''Test that submit doesn't submit jobs with nodes in --nodelist'''
        node_test = bench.tests.node_test.NodeTest("node")
        node_test.Submit.execute(self.directory, nodelist='tnode01[01-06]')

        self.assertTrue(bench.slurm.sbatch.called)

        directories = set()
        expected_directories = set()
        job_scripts = set()
        expected_job_scripts = set()
        for ii, call in enumerate(arg1.call_args_list):
            script_dir = os.path.join(self.node_test_dir, self.nodes[ii])
            args, kwargs = call  #call object is two things: args=tuple, kwargs=dict
            directories.add(script_dir)
            expected_directories.add(kwargs['chdir'])
            job_scripts.add(args[0])
            expected_job_scripts.add(script_dir + '/' + self.nodes[ii] +
                                     '.job')

            # Check that --nodelist nodes not submitted
            for node in hostlist.expand_hostlist('tnode01[07-10]'):
                self.assertNotIn(node, kwargs['chdir'])

        self.assertEqual(directories, expected_directories)
        self.assertEqual(job_scripts, expected_job_scripts)
Example #39
def get_resource() -> Tuple[str, List[str], Optional[List[int]]]:
    """Get SLURM resources: nodename, nodelist, and gpus.

    Returns
    -------
    Tuple[str, List[str], Optional[List[int]]]
        nodename, nodelist, and gpus

    Raises
    ------
    RuntimeError
        if number of nodes could not be retrieved
    ValueError
        list of nodes is not of the same length as the number of nodes
    ValueError
        if current nodename is not found in node list
    """
    nodelist = hostlist.expand_hostlist(os.environ["SLURM_JOB_NODELIST"])
    nodename = os.environ["SLURMD_NODENAME"]
    num_nodes_env = os.getenv("SLURM_JOB_NUM_NODES")
    if num_nodes_env:
        num_nodes = int(num_nodes_env)
    else:
        raise RuntimeError("Could not get SLURM number of nodes")

    if len(nodelist) != num_nodes:
        raise ValueError(
            f"Number of slurm nodes {len(nodelist)} not equal to {num_nodes}")
    if nodename not in nodelist:
        raise ValueError(
            f"Nodename({nodename}) not in nodelist({nodelist}). This should not happen!"
        )
    gpus = local.get_gpus()
    return nodename, nodelist, gpus
Example #41
def filter_node_list(
    nodes,
    include_nodes=None,
    exclude_nodes=None,
    include_reservations=None,
    exclude_reservations=None,
    include_states=None,
    exclude_states=None,
    include_files=None,
    exclude_files=None,
):
    nodes = set(nodes)

    if include_states or exclude_states:
        nodes &= set(
            get_nodes(
                include_states=include_states,
                exclude_states=exclude_states,
            ))

    if include_nodes or include_reservations or include_files:
        include_nodes_ = set()
        if include_nodes:
            for hostlist_ in include_nodes:
                include_nodes_ |= set(hostlist.expand_hostlist(hostlist_))
        if include_reservations:
            for reservation in include_reservations:
                include_nodes_ |= get_reserved_nodes(reservation)
        if include_files:
            for include_file in include_files:
                include_nodes_ |= set(read_node_list(include_file))
        nodes &= include_nodes_

    if exclude_nodes or exclude_reservations or exclude_files:
        exclude_nodes_ = set()
        if exclude_nodes:
            for hostlist_ in exclude_nodes:
                exclude_nodes_ |= set(hostlist.expand_hostlist(hostlist_))
        if exclude_reservations:
            for reservation in exclude_reservations:
                exclude_nodes_ |= get_reserved_nodes(reservation)
        if exclude_files:
            for exclude_file in exclude_files:
                exclude_nodes_ |= set(read_node_list(exclude_file))
        nodes -= exclude_nodes_

    return nodes
Example #42
 def get_status_short(self, raw_hosts=None):
     """get status of baremetal computers
     provided for **rain status --short [HOSTS]**
     :param string raw_hosts: one or more hosts in valid hostlist format
     :return: a dict of the form {"host1": "deployed", "host2": "deploying", "host3": "failed"}
     """
     hosts = expand_hostlist(raw_hosts) if raw_hosts else None
     return self.status.get_status_short(hosts)
Example #43
def get_worker_host_list(base_port, workers_per_host):
    hosts = expand_hostlist(os.environ['SLURM_NODELIST'])
    ports = [base_port + i for i in range(workers_per_host)]
    worker_hlist = []
    for h in hosts:
        for p in ports:
            worker_hlist.append('{}:{}'.format(h, p))
    return worker_hlist
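Note: a worked example under a hypothetical two-node allocation:

import os
os.environ['SLURM_NODELIST'] = 'n[1-2]'   # hypothetical allocation
print(get_worker_host_list(2222, 2))
# -> ['n1:2222', 'n1:2223', 'n2:2222', 'n2:2223']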
Example #44
def get_slurm_allocated_nodes():

    hosts = os.environ.get("SLURM_NODELIST")
    hosts = hostlist.expand_hostlist(hosts)
    freenodes = []
    for h in hosts:
        freenodes.append((h + "\n"))
    return list(set(freenodes))
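Note: list(set(...)) drops duplicates but also discards the SLURM_NODELIST
order. If order matters, an order-preserving variant (a sketch; dict.fromkeys
keeps insertion order on Python 3.7+):

import os
import hostlist

def get_slurm_allocated_nodes_ordered():
    hosts = hostlist.expand_hostlist(os.environ.get("SLURM_NODELIST"))
    return list(dict.fromkeys(h + "\n" for h in hosts))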
Example #45
 def ip_name_pair(self, nameregex, format_string, start=1):
     ips = expand_hostlist(nameregex)
     i = start
     names = []
     for ip in ips:
         names.append(format_string.format(i))
         i += 1
     return zip(names, ips)
Example #47
 def get_status_summary(self, raw_hosts=None):
     """get status summary of baremetal computers
     provided for **rain status --summary [HOSTS]**
     :param string raw_hosts: one or more hosts in valid hostlist format
     :return: a dict of the form {"deployed": 1, "deploying": 2, "failed": 2, "total": 5}
     """
     hosts = expand_hostlist(raw_hosts) if raw_hosts else None
     return self.status.get_status_summary(hosts)
Example #48
 def baremetal_computer_host_off(self, raw_hosts):
     """Disable/OFF computers for baremetal provisioning
     provided for **rain admin off HOSTS**
     :param string raw_hosts: one or more hosts in valid hostlist format
     :return: True means successful, otherwise False
     """
     hosts = expand_hostlist(raw_hosts) if raw_hosts else None
     return self.baremetal.disable_baremetal_computers(hosts)
Example #49
def filtered_hosts_based_policy(user, projects, hosts):
    """filtered hosts based on policy of the user and his/her projects
    """
    # wrapper
    wrapper = RainCobblerWrapper()
    policy = wrapper.get_policy_based_user_or_its_projects(user, projects)
    policy_hosts = hostlist.expand_hostlist(policy) if policy else None
    return [h for h in hosts if h in policy_hosts] if policy_hosts else []
Example #50
def filtered_hosts_based_baremetal(raw_hosts):
    """filtered hosts based on baremetal computers
    """
    # wrapper
    wrapper = RainCobblerWrapper()
    input_hosts = hostlist.expand_hostlist(raw_hosts)
    bm_hosts = wrapper.baremetal_computer_host_list()
    return [h for h in input_hosts if h in bm_hosts] if bm_hosts else []
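Note: for large clusters the membership test against the bm_hosts list is
O(n*m); converting it to a set first gives the same result (a sketch):

def filtered_hosts_based_baremetal_fast(raw_hosts):
    wrapper = RainCobblerWrapper()
    input_hosts = hostlist.expand_hostlist(raw_hosts)
    bm_hosts = set(wrapper.baremetal_computer_host_list() or [])
    return [h for h in input_hosts if h in bm_hosts]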
Example #51
    def set_compute_nodelist(self, nodelist):
        nodelist_expanded = hostlist.expand_hostlist(nodelist)
        self._inventory_content['all']['children']['compute_nodes'][
            'hosts'] = {}

        for node in nodelist_expanded:
            self._inventory_content['all']['children']['compute_nodes'][
                'hosts'][node] = None
Example #54
    def generate(self):
        self.generate_bootspec()
        self._generate_globals()

        clusters = self.config.get("cloudmesh.inventory")

        for cluster_name in clusters:
            cluster = clusters[cluster_name]
            names = expand_hostlist(cluster["id"])
            net_id = 0
            for network in cluster["network"]:

                n_index = expand_hostlist(network["id"])
                n_label = expand_hostlist(network["label"])
                n_range = expand_hostlist(network["range"])
                n_name = network["name"]

                for i in range(0, len(names)):
                    name = n_index[i]
                    element = dict(network)
                    del element['range']
                    element.update({'cm_type': "inventory",
                                    'cm_key': 'server',
                                    'cm_kind': 'server',
                                    'cm_id': name,
                                    'cm_cluster': cluster_name,
                                    'id': name,
                                    'label': n_label[i],
                                    'network_name': n_name,
                                    'cm_network_id': net_id,
                                    'ipaddr': n_range[i],
                                    'cm_attribute': 'network'})
                    self.insert(element)
                net_id += 1
        
        # added by HC
        # init rack status
        self.generate_rack_status()
        # init baremetal computer management, maybe will be deprecated later
        bdb = BaremetalDB()
        bdb.init_base_document_structure()
        # insert necessary mac info of baremetal computers to inventory
        bmc = BaremetalComputer()
        bmc.insert_mac_data_to_inventory()
Example #55
    def _configure(self):

        slurm_nodelist = os.environ.get("SLURM_NODELIST")
        if slurm_nodelist is None:
            msg = "$SLURM_NODELIST not set!"
            self._log.error(msg)
            raise RuntimeError(msg)

        # Parse SLURM nodefile environment variable
        slurm_nodes = hostlist.expand_hostlist(slurm_nodelist)
        self._log.info("Found SLURM_NODELIST %s. Expanded to: %s", slurm_nodelist, slurm_nodes)

        # $SLURM_NPROCS = Total number of cores allocated for the current job
        slurm_nprocs_str = os.environ.get("SLURM_NPROCS")
        if slurm_nprocs_str is None:
            msg = "$SLURM_NPROCS not set!"
            self._log.error(msg)
            raise RuntimeError(msg)
        else:
            slurm_nprocs = int(slurm_nprocs_str)

        # $SLURM_NNODES = Total number of (partial) nodes in the job's resource allocation
        slurm_nnodes_str = os.environ.get("SLURM_NNODES")
        if slurm_nnodes_str is None:
            msg = "$SLURM_NNODES not set!"
            self._log.error(msg)
            raise RuntimeError(msg)
        else:
            slurm_nnodes = int(slurm_nnodes_str)

        # $SLURM_CPUS_ON_NODE = Number of cores per node (physically)
        slurm_cpus_on_node_str = os.environ.get("SLURM_CPUS_ON_NODE")
        if slurm_cpus_on_node_str is None:
            msg = "$SLURM_CPUS_ON_NODE not set!"
            self._log.error(msg)
            raise RuntimeError(msg)
        else:
            slurm_cpus_on_node = int(slurm_cpus_on_node_str)

        # Verify that $SLURM_NPROCS <= $SLURM_NNODES * $SLURM_CPUS_ON_NODE
        if not slurm_nprocs <= slurm_nnodes * slurm_cpus_on_node:
            self._log.warning(
                "$SLURM_NPROCS(%d) > $SLURM_NNODES(%d) * $SLURM_CPUS_ON_NODE(%d)",
                slurm_nprocs,
                slurm_nnodes,
                slurm_cpus_on_node,
            )

        # Verify that $SLURM_NNODES == len($SLURM_NODELIST)
        if slurm_nnodes != len(slurm_nodes):
            self._log.error("$SLURM_NNODES(%d) != len($SLURM_NODELIST)(%d)", slurm_nnodes, len(slurm_nodes))

        # Report the physical number of cores or the total number of cores
        # in case of a single partial node allocation.
        self.cores_per_node = min(slurm_cpus_on_node, slurm_nprocs)

        self.node_list = slurm_nodes
Example #56
File: slurmssh.py Project: anl/IT
def parse_slurm_env(env_var_name):
   cluster=[]
   env=os.getenv(env_var_name)
   if env is not None:
      cluster.extend(hostlist.expand_hostlist(env))
   else:
      print "Error: cannot get "+env_var_name+" from environment"

   return cluster
Example #57
def lookup(nodes):
  debug("Forward Lookup:")
  debug(nodes)
  if "*" in nodes or ":" in nodes:
    # MOAB events on Cray XE6 use node list format [1-7]*16:[10-99]*7
    nodes = re.sub(r"\*\d+","", nodes) # omit *nprocs
    nodes = re.sub(":",",", nodes)    # change : to ,
  new_list = hostlist.expand_hostlist(nodes)
  # this takes the list we get back and turns it into a string for splunk
  return ",".join(str(n) for n in new_list)
Example #58
File: slurmscpmp.py Project: anl/IT
def main(argv):
   # Default slurm configuration
   slurm_conf="/etc/slurm/slurm.conf"
   num_workers=8

   try:
      opts,args=getopt.getopt(argv,"p:nh")
   except getopt.GetoptError:
      usage()
      sys.exit()

   nodes=[]

   for opt,arg in opts:
      if opt in ("-h"):
         usage()
         sys.exit()
      elif opt in ("-p"): # partition
         nodes=parse_slurm_conf(slurm_conf,arg)
         if len(nodes)==0:
            print "Error: invalid partition -",arg
            sys.exit()
      elif opt in ("-n"): # dump nodelist
         if not nodes:
            nodes=parse_slurm_conf(slurm_conf)

         for node in nodes:
            print node
         sys.exit()
      elif opt in ('-N'): # set workers
         num_workers=int(arg)

   argc=len(args)
   if argc<2 or argc>3:
      usage()
   else:
      if argc==3:
         nodes=hostlist.expand_hostlist(args[0])
      elif not nodes:
         nodes=parse_slurm_conf(slurm_conf)

      if len(nodes)>0:
         cmdline=[]
         for node in nodes:
            cmdline.append([node,"scp "+args[argc-2]+" "+node+":"+args[argc-1]])

         pool=Pool(num_workers)
         results=pool.map(copy_to_node,cmdline)
         pool.close()

         for result in results:
            if result!="OK":
               print result
      else:
         print "Error: empty or invalid list of cluster nodes!"
Example #59
def get_ps_host_list(base_port,num_ps):
  assert(num_ps < 10000000)
  port = base_port
  l = []
  hosts = expand_hostlist( os.environ['SLURM_NODELIST'])
  while True:
    for host in hosts:
      if len(l) >= num_ps:
        return l
      l.append('{}:{}'.format(host,port))
    port += 1  
Example #60
def stateChangeLogic(record, nodes, start_state, end_state):
  global output_results, options, known_states, node_states
  if "-" in nodes or "[" in nodes:
    try:
      # MOAB JOBSTART events on Cray XE6 use node list format [1-7]*16:[10-999]*7
      nodes = re.sub(r"\*\d+","", nodes) # omit *nprocs
      nodes = re.sub(":",",", nodes)    # change : to ,
      node_list = hostlist.expand_hostlist(nodes)
    except: # try to deal with BadHostlist exceptions
      debug2("---- Bad hostlist: " + str(record['_time']) +" nodes="+nodes)
      # guess it is missing a left bracket, and truncate at last comma
      m = re.match(r"(^.*)\[(.*),(.*)", nodes)
      if m is not None:
        nodes = m.group(1) +"["+ m.group(2) +"]"
      debug2("---- Changing to: nodes="+nodes)
      node_list = hostlist.expand_hostlist(nodes) # do or die
  else:
    node_list = nodes.split(",")

  if len(nodes) > 500:  # omit long node lists
      record['_raw'] = record['_raw'].replace(nodes,'(LONG_NODE_LIST)')

  # the below is a hack for cielo, because RSV lines don't include some service nodes.
  # if RSV lines list at least 98% of all known hosts, apply it to all hosts.
  raw = str(record['_raw'])
  if (raw.find("RSVSTART")!=-1 or raw.find("RSVEND")!=-1):
      these = len(node_list)
      total = len(node_states)
      # the hardcoded .95 may cause problems later and should be improved.
      # eg it could be an argument and set via local/savedsearches.conf for each machine,
      # but really it should be a percentage of the hosts in each hpc_system index...
      if (total>these and these>0.95*total):
          debug2("Applying RSV to all known hosts ("+ str(these) +">0.95*"+ str(total)+"): "+raw)
          node_list = node_states.keys()

#  debug2("---- in stateChangeLogic:" + str(record['_time']) + " start="+ start_state  + " end="+ end_state + " nodes="+nodes)

  for node in node_list:
    current_state = node_states.get(node)
    if current_state is None or current_state=="Unknown" or start_state=="*" or current_state==start_state:
      nodeStateChange(record, node, current_state, end_state)