Example 1
    def _configure(self):
        # TODO: $NCPUS?!?! = 1 on archer

        pbspro_nodefile = os.environ.get('PBS_NODEFILE')

        if pbspro_nodefile is None:
            msg = "$PBS_NODEFILE not set!"
            self._log.error(msg)
            raise RuntimeError(msg)

        self._log.info("Found PBSPro $PBS_NODEFILE %s." % pbspro_nodefile)

        # Don't need to parse the content of the nodefile for PBSPro; only its
        # length is interesting, as it merely contains duplicate entries.
        pbspro_nodes        = [line.strip() for line in open(pbspro_nodefile)]
        pbspro_nodes_length = len(pbspro_nodes)

        # Number of Processors per Node
        val = os.environ.get('NUM_PPN')
        if not val:
            val = os.environ.get('SAGA_PPN')

        if not val:
            raise RuntimeError("$NUM_PPN / $SAGA_PPN not set!")

        pbspro_num_ppn = int(val)

        # Number of Nodes allocated
        val = os.environ.get('NODE_COUNT')
        if val:
            pbspro_node_count = int(val)
        else:
            pbspro_node_count = len(set(pbspro_nodes))
            self._log.error("$NODE_COUNT not set - use %d" % pbspro_node_count)

        # Number of Parallel Environments
        val = os.environ.get('NUM_PES')
        if val:
            pbspro_num_pes = int(val)
        else:
            pbspro_num_pes = len(pbspro_nodes)
            self._log.error("$NUM_PES not set - use %d" % pbspro_num_pes)

        pbspro_vnodes = self._parse_pbspro_vnodes()

        # Verify that $NUM_PES == $NODE_COUNT * $NUM_PPN == len($PBS_NODEFILE)
        if not (pbspro_node_count * pbspro_num_ppn == pbspro_num_pes == pbspro_nodes_length):
            self._log.warning("NUM_PES != NODE_COUNT * NUM_PPN != len($PBS_NODEFILE)")

        # node names are unique, so can serve as node uids
        self.node_list      = [[node, node] for node in pbspro_vnodes]
        self.cores_per_node = pbspro_num_ppn
        self.gpus_per_node  = self._cfg.get('gpus_per_node', 0)  # FIXME GPU

        self.lfs_per_node   = {'path' : ru.expand_env(
                                           self._cfg.get('lfs_path_per_node')),
                               'size' :    self._cfg.get('lfs_size_per_node', 0)
                              }
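
A minimal standalone sketch of the same consistency check, using made-up nodefile content and the same environment variables (all defaults below are illustrative, not part of the original code):

import os

# made-up nodefile content: one line per slot, node names repeated
nodes = ['node001', 'node001', 'node002', 'node002']

ppn        = int(os.environ.get('NUM_PPN', os.environ.get('SAGA_PPN', '2')))
node_count = int(os.environ.get('NODE_COUNT', str(len(set(nodes)))))
num_pes    = int(os.environ.get('NUM_PES', str(len(nodes))))

# the invariant the code above warns about when it does not hold
if not (node_count * ppn == num_pes == len(nodes)):
    print('inconsistent allocation: %d * %d vs %d vs %d'
          % (node_count, ppn, num_pes, len(nodes)))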
Example 2
    def _configure(self):

        lsf_hostfile = os.environ.get('LSB_DJOB_HOSTFILE')
        if lsf_hostfile is None:
            msg = "$LSB_DJOB_HOSTFILE not set!"
            self._log.error(msg)
            raise RuntimeError(msg)

        lsb_mcpu_hosts = os.environ.get('LSB_MCPU_HOSTS')
        if lsb_mcpu_hosts is None:
            msg = "$LSB_MCPU_HOSTS not set!"
            self._log.error(msg)
            raise RuntimeError(msg)

        # parse LSF hostfile
        # format:
        # <hostnameX>
        # <hostnameX>
        # <hostnameY>
        # <hostnameY>
        #
        # There are in total "-n" entries (number of tasks)
        # and "-R" entries per host (tasks per host).
        # (That results in "-n" / "-R" unique hosts)
        #
        lsf_nodes = [line.strip() for line in open(lsf_hostfile)]
        self._log.info("Found LSB_DJOB_HOSTFILE %s. Expanded to: %s",
                       lsf_hostfile, lsf_nodes)
        lsf_node_list = list(set(lsf_nodes))

        # Grab the core (slot) count from the environment
        # Format: hostX N hostY N hostZ N
        lsf_cores_count_list = list(map(int, lsb_mcpu_hosts.split()[1::2]))
        lsf_core_counts = list(set(lsf_cores_count_list))
        lsf_cores_per_node = min(lsf_core_counts)
        lsf_gpus_per_node = self._cfg.get('gpus_per_node', 0)  # FIXME GPU
        lsf_lfs_per_node = {
            'path': ru.expand_env(self._cfg.get('lfs_path_per_node')),
            'size': self._cfg.get('lfs_size_per_node', 0)
        }
        lsf_mem_per_node = self._cfg.get('mem_per_node', 0)

        self._log.info("Found unique core counts: %s Using: %d",
                       lsf_core_counts, lsf_cores_per_node)

        # node names are unique, so can serve as node uids
        self.node_list = [[node, node] for node in lsf_node_list]
        self.cores_per_node = lsf_cores_per_node
        self.gpus_per_node = lsf_gpus_per_node
        self.lfs_per_node = lsf_lfs_per_node
        self.mem_per_node = lsf_mem_per_node
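
As a standalone illustration of the `$LSB_MCPU_HOSTS` slicing used above (the sample value is made up, in the documented `hostX N hostY N` format):

lsb_mcpu_hosts = 'batch01 1 node001 42 node002 42'     # made-up sample value

hosts  = lsb_mcpu_hosts.split()[0::2]                   # even tokens: host names
counts = list(map(int, lsb_mcpu_hosts.split()[1::2]))   # odd tokens: slot counts

print(dict(zip(hosts, counts)))   # {'batch01': 1, 'node001': 42, 'node002': 42}
print(min(set(counts)))           # 1 - the code above uses the minimum count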
Example 3
    def _configure(self):

        lsf_hostfile = os.environ.get('LSB_DJOB_HOSTFILE')
        if lsf_hostfile is None:
            msg = "$LSB_DJOB_HOSTFILE not set!"
            self._log.error(msg)
            raise RuntimeError(msg)

        lsb_mcpu_hosts = os.environ.get('LSB_MCPU_HOSTS')
        if lsb_mcpu_hosts is None:
            msg = "$LSB_MCPU_HOSTS not set!"
            self._log.error(msg)
            raise RuntimeError(msg)

        # parse LSF hostfile
        # format:
        # <hostnameX>
        # <hostnameX>
        # <hostnameY>
        # <hostnameY>
        #
        # There are in total "-n" entries (number of tasks)
        # and "-R" entries per host (tasks per host).
        # (That results in "-n" / "-R" unique hosts)
        #
        lsf_nodes = [line.strip() for line in open(lsf_hostfile)]
        self._log.info("Found LSB_DJOB_HOSTFILE %s. Expanded to: %s",
                      lsf_hostfile, lsf_nodes)
        lsf_node_list = list(set(lsf_nodes))

        # Grab the core (slot) count from the environment
        # Format: hostX N hostY N hostZ N
        lsf_cores_count_list = list(map(int, lsb_mcpu_hosts.split()[1::2]))
        lsf_core_counts      = list(set(lsf_cores_count_list))
        lsf_cores_per_node   = min(lsf_core_counts)
        lsf_gpus_per_node    = self._cfg.get('gpus_per_node', 0)  # FIXME GPU

        lsf_lfs_per_node     = {'path' : ru.expand_env(
                                            self._cfg.get('lfs_path_per_node')),
                                'size' :    self._cfg.get('lfs_size_per_node', 0)
                               }

        self._log.info("Found unique core counts: %s Using: %d",
                      lsf_core_counts, lsf_cores_per_node)

        # node names are unique, so can serve as node uids
        self.node_list      = [[node, node] for node in lsf_node_list]
        self.cores_per_node = lsf_cores_per_node
        self.gpus_per_node  = lsf_gpus_per_node
        self.lfs_per_node   = lsf_lfs_per_node
Example 4
    def _configure(self):

        self._log.info("Configured to run on system with %s.", self.name)

        CCM_NODEFILE_DIR = os.path.expanduser('~/.crayccm')

        ccm_nodefile_list = [
            x for x in os.listdir(CCM_NODEFILE_DIR)
            if x.startswith('ccm_nodelist')
        ]

        if not ccm_nodefile_list:
            raise Exception("No CCM nodefiles found in: %s." %
                            CCM_NODEFILE_DIR)

        ccm_nodefile_name = max(
            ccm_nodefile_list,
            key=lambda x: os.stat(os.path.join(CCM_NODEFILE_DIR, x)).st_mtime)
        ccm_nodefile = os.path.join(CCM_NODEFILE_DIR, ccm_nodefile_name)

        hostname = os.uname()[1]
        if hostname not in open(ccm_nodefile).read():
            raise RuntimeError("Using the most recent CCM nodefile (%s),"
                               " but I (%s) am not in it!" %
                               (ccm_nodefile, hostname))

        # Parse the CCM nodefile
        ccm_nodes = [line.strip() for line in open(ccm_nodefile)]
        self._log.info("Found CCM nodefile: %s.", ccm_nodefile)

        # Get the number of raw entries
        ccm_nodes_length = len(ccm_nodes)

        # Unique nodes
        ccm_node_list = list(set(ccm_nodes))
        ccm_node_list_length = len(ccm_node_list)

        # Some simple arithmetic
        self.cores_per_node = ccm_nodes_length // ccm_node_list_length
        self.gpus_per_node = self._cfg.get('gpus_per_node', 0)  # FIXME GPU

        self.lfs_per_node = {
            'path': ru.expand_env(self._cfg.get('lfs_path_per_node')),
            'size': self._cfg.get('lfs_size_per_node', 0)
        }

        # node names are unique, so can serve as node uids
        self.node_list = [[node, node] for node in ccm_node_list]
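
The "newest matching nodefile" selection can be exercised in isolation; the directory and prefix below are just placeholders:

import os

nodefile_dir = os.path.expanduser('~/.crayccm')    # placeholder directory

if os.path.isdir(nodefile_dir):
    candidates = [f for f in os.listdir(nodefile_dir)
                  if f.startswith('ccm_nodelist')]
    if candidates:
        # the most recently modified nodefile wins
        newest = max(candidates,
                     key=lambda f: os.stat(os.path.join(nodefile_dir, f)).st_mtime)
        print(os.path.join(nodefile_dir, newest))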
Example 5
    def _configure(self):

        self._log.info("Using fork on localhost.")

        # For the fork ResourceManager (i.e., on localhost), we can fake any
        # number of cores, so the sanity check below only applies when faking
        # is disabled.
        detected_cores = multiprocessing.cpu_count()

        if detected_cores != self.requested_cores:
            if self._cfg.resource_cfg.fake_resources:
                self._log.info("using %d instead of available %d cores.",
                               self.requested_cores, detected_cores)
            else:
                if self.requested_cores > detected_cores:
                    raise RuntimeError('insufficient cores found (%d < %d)' %
                                       (detected_cores, self.requested_cores))

        # if cores_per_node is set in the agent config, we slice the number of
        # cores into that many virtual nodes.  cpn defaults to requested_cores,
        # to preserve the previous behavior (1 node).
        self.cores_per_node = self._cfg.get('cores_per_node',
                                            self.requested_cores)
        self.gpus_per_node = self._cfg.get('gpus_per_node', 0)
        self.mem_per_node = self._cfg.get('mem_per_node', 0)

        self.lfs_per_node = {
            'path': ru.expand_env(self._cfg.get('lfs_path_per_node')),
            'size': self._cfg.get('lfs_size_per_node', 0)
        }

        if not self.cores_per_node:
            self.cores_per_node = 1

        self.node_list = list()
        requested_nodes = int(
            math.ceil(
                float(self.requested_cores) / float(self.cores_per_node)))
        for i in range(requested_nodes):
            # enumerate the node list entries for unique uids
            self.node_list.append(["localhost", 'localhost_%d' % i])

        self._log.debug(
            'configure localhost as %s nodes '
            '(%s cores, %s gpus, %s lfs, %s mem)', len(self.node_list),
            self.cores_per_node, self.gpus_per_node, self.lfs_per_node,
            self.mem_per_node)
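
The node-slicing arithmetic in isolation, with arbitrary example numbers:

import math

requested_cores = 10                 # arbitrary example values
cores_per_node  = 4

n_nodes   = int(math.ceil(float(requested_cores) / float(cores_per_node)))
node_list = [['localhost', 'localhost_%d' % i] for i in range(n_nodes)]

print(n_nodes)      # 3
print(node_list)    # [['localhost', 'localhost_0'], ..., ['localhost', 'localhost_2']]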
Example 6
    def _configure(self):

        self._log.info("Configured to run on system with %s.", self.name)

        CCM_NODEFILE_DIR = os.path.expanduser('~/.crayccm')

        ccm_nodefile_list = [x for x in os.listdir(CCM_NODEFILE_DIR)
                             if x.startswith('ccm_nodelist')]
        if not ccm_nodefile_list:
            raise Exception("No CCM nodefiles found in: %s." % CCM_NODEFILE_DIR)

        ccm_nodefile_name = max(ccm_nodefile_list, key=lambda x:
                              os.stat(os.path.join(CCM_NODEFILE_DIR, x)).st_mtime)
        ccm_nodefile = os.path.join(CCM_NODEFILE_DIR, ccm_nodefile_name)

        hostname = os.uname()[1]
        if hostname not in open(ccm_nodefile).read():
            raise RuntimeError("Using the most recent CCM nodefile (%s),"
                               " but I (%s) am not in it!" % (ccm_nodefile, hostname))

        # Parse the CCM nodefile
        ccm_nodes = [line.strip() for line in open(ccm_nodefile)]
        self._log.info("Found CCM nodefile: %s.", ccm_nodefile)

        # Get the number of raw entries
        ccm_nodes_length = len(ccm_nodes)

        # Unique nodes
        ccm_node_list        = list(set(ccm_nodes))
        ccm_node_list_length = len(ccm_node_list)

        # Some simple arithmetic
        self.cores_per_node = ccm_nodes_length // ccm_node_list_length
        self.gpus_per_node  = self._cfg.get('gpus_per_node', 0)  # FIXME GPU

        self.lfs_per_node   = {'path' : ru.expand_env(
                                           self._cfg.get('lfs_path_per_node')),
                               'size' :    self._cfg.get('lfs_size_per_node', 0)
                              }

        # node names are unique, so can serve as node uids
        self.node_list = [[node, node] for node in ccm_node_list]
Example 7
    def _configure(self):

        # we only support Cobalt on Theta right now, and since we know that
        # Theta is a Cray, we know that aprun is available.  Alas, aprun
        # provides the only way (we could find so far) to determine the list of
        # nodes we have available (`COBALT_NODELIST` seems broken).  So we run
        # `aprun` with the number of nodes we *think* we have, and with `-N 1`
        # to place one rank per node, and run `hostname` - that gives us the
        # list of hostnames.  We get the number of nodes from `$COBALT_PARTSIZE`.

        n_nodes = int(os.environ['COBALT_PARTSIZE'])
        out, _, _ = ru.sh_callout('aprun -n %d -N 1 hostname' % n_nodes)
        node_list = out.split()
        assert (len(node_list) == n_nodes), node_list

        # we also want to learn the core count per node
        cmd = 'cat /proc/cpuinfo | grep processor | wc -l'
        out, _, _ = ru.sh_callout('aprun -n %d -N 1 %s' % (n_nodes, cmd))
        core_counts = list(set([int(x) for x in out.split()]))
        assert (len(core_counts) == 1), core_counts
        cores_per_node = core_counts[0]

        gpus_per_node = self._cfg.get('gpus_per_node', 0)
        lfs_per_node = {
            'path': ru.expand_env(self._cfg.get('lfs_path_per_node')),
            'size': self._cfg.get('lfs_size_per_node', 0)
        }
        mem_per_node = self._cfg.get('mem_per_node', 0)

        self._log.info("Found unique core counts: %s", cores_per_node)

        # node names are unique, so can serve as node uids
        self.node_list = [[node, node] for node in node_list]
        self.cores_per_node = cores_per_node
        self.gpus_per_node = gpus_per_node
        self.lfs_per_node = lfs_per_node
        self.mem_per_node = mem_per_node
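
A small sketch of the homogeneity check applied to the `aprun` output, using made-up output strings instead of actually running `aprun`:

# made-up aprun outputs: one hostname per node, one core count per node
hostname_out = 'nid00010\nnid00011\nnid00012\n'
cpuinfo_out  = '64\n64\n64\n'

node_list   = hostname_out.split()
core_counts = list(set(int(x) for x in cpuinfo_out.split()))

assert len(core_counts) == 1, core_counts   # all nodes must report the same count
print(node_list, core_counts[0])            # ['nid00010', ...] 64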
Example 8
    def _configure(self):

        self._log.info("Using fork on localhost.")

        # For the fork LRMS (i.e., on localhost), we fake an infinite number of
        # cores, so don't perform any sanity checks.
        detected_cpus = multiprocessing.cpu_count()

        if detected_cpus != self.requested_cores:
            self._log.info("using %d instead of physically available %d cores.",
                    self.requested_cores, detected_cpus)

        # if cores_per_node is set in the agent config, we slice the number of
        # cores into that many virtual nodes.  cpn defaults to requested_cores,
        # to preserve the previous behavior (1 node).
        self.cores_per_node = self._cfg.get('cores_per_node', self.requested_cores)
        self.gpus_per_node  = self._cfg.get('gpus_per_node',   0)
        self.mem_per_node   = self._cfg.get('memory_per_node', 0)

        self.lfs_per_node   = {'path' : ru.expand_env(
                                           self._cfg.get('lfs_path_per_node')),
                               'size' :    self._cfg.get('lfs_size_per_node', 0)
                              }


        if not self.cores_per_node:
            self.cores_per_node = 1

        self.node_list  = list()
        requested_nodes = int(math.ceil(float(self.requested_cores) /
                                        float(self.cores_per_node ) ) )
        for i in range(requested_nodes):
            # enumerate the node list entries for unique uids
            self.node_list.append(["localhost", 'localhost_%d' % i])

        self._log.debug('configure localhost as %s nodes (%s cores, %s gpus, %s lfs).',
                len(self.node_list), self.cores_per_node, self.gpus_per_node, self.lfs_per_node)
Example 9
    def _configure(self):

        sge_hostfile = os.environ.get('PE_HOSTFILE')
        if sge_hostfile is None:
            msg = "$PE_HOSTFILE not set!"
            self._log.error(msg)
            raise RuntimeError(msg)

        def _sigusr2_handler(signum, frame):

            self._log.warn('caught sigusr2')
            self.stop()

        signal.signal(signal.SIGUSR1, _sigusr2_handler)

        # SGE core configuration might be different than what multiprocessing
        # announces
        # Alternative: "qconf -sq all.q|awk '/^slots *[0-9]+$/{print $2}'"

        # Parse SGE hostfile for nodes
        sge_node_list = [line.split()[0] for line in open(sge_hostfile)]
        self._log.info("Found PE_HOSTFILE %s. Expanded to: %s", sge_hostfile,
                       sge_node_list)

        # Parse SGE hostfile for cores
        sge_cores_count_list = [
            int(line.split()[1]) for line in open(sge_hostfile)
        ]
        sge_core_counts = list(set(sge_cores_count_list))
        sge_gpus_per_node = self._cfg.get('gpus_per_node', 0)  # FIXME GPU

        sge_lfs_per_node = {
            'path': ru.expand_env(self._cfg.get('lfs_path_per_node')),
            'size': self._cfg.get('lfs_size_per_node', 0)
        }

        # Check if nodes have the same core count
        if len(sge_core_counts) == 1:
            sge_cores_per_node = min(sge_core_counts)
            self._log.info("Found unique core counts: %s Using: %d",
                           sge_core_counts, sge_cores_per_node)

            # node names are unique, so can serve as node uids
            self.node_list = [[node, node] for node in sge_node_list]
            self.cores_per_node = sge_cores_per_node
            self.gpus_per_node = sge_gpus_per_node
            self.lfs_per_node = sge_lfs_per_node

        else:
            # In case of non-homogeneous counts, consider all slots to be single-core
            sge_cores_per_node = 1
            self._log.info("Found unique core counts: %s Using: %d",
                           sge_core_counts, sge_cores_per_node)
            self.cores_per_node = sge_cores_per_node
            self.gpus_per_node = sge_gpus_per_node
            self.lfs_per_node = sge_lfs_per_node

            # Expand node list, create unique IDs for each core
            self.node_list = []
            for node, cores in zip(sge_node_list, sge_cores_count_list):
                for core in range(cores):
                    self.node_list.append([node, '%s_%s' % (node, core)])
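
The heterogeneous branch above, shown standalone with made-up host and slot data:

# made-up PE_HOSTFILE data: heterogeneous slot counts
sge_node_list        = ['node1', 'node2']
sge_cores_count_list = [4, 2]

node_list = []
for node, cores in zip(sge_node_list, sge_cores_count_list):
    for core in range(cores):
        # treat every slot as its own single-core "node" with a unique uid
        node_list.append([node, '%s_%s' % (node, core)])

print(node_list)   # [['node1', 'node1_0'], ..., ['node2', 'node2_1']]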
Example 10
def test_expand_env():

    noenv = {'BIZ' : 'biz'}
    env   = {'BAR' : 'bar'}

    os.environ['BAR'] = 'bar'
    os.environ['BIZ'] = 'biz'

    bar = os.environ.get('BAR')
    biz = os.environ.get('BIZ')

    tc = {'${BAR}'             : [bar,                  # os.environ
                                  'bar',                # env
                                  None],                # noenv
          'foo_${BAR}_baz'     : ['foo_%s_baz' % bar,
                                  'foo_bar_baz',
                                  'foo__baz'   ],
          'foo_${BAR:buz}_baz' : ['foo_%s_baz' % bar,
                                  'foo_bar_baz',
                                  'foo_buz_baz'],
          'foo_${BAR:$BIZ}_baz': ['foo_%s_baz' % bar,
                                  'foo_bar_baz',
                                  'foo_%s_baz' % biz],
         }

    # test string expansion (and also create list and dict for other tests)
    l = list()
    d = dict()
    i = 0
    for k,v in tc.items():
        assert(ru.expand_env(k       ) == v[0])
        assert(ru.expand_env(k,   env) == v[1])
        assert(ru.expand_env(k, noenv) == v[2])
        l.append(k)
        d[i] = k
        i   += 1

    # test list expansion
    l0 = copy.deepcopy(l)
    l1 = copy.deepcopy(l)
    l2 = copy.deepcopy(l)

    ru.expand_env(l0)
    ru.expand_env(l1, env)
    ru.expand_env(l2, noenv)

    for i,v in enumerate(l):
        assert(l0[i] == tc[v][0])
        assert(l1[i] == tc[v][1])
        assert(l2[i] == tc[v][2])

    # test dict expansion
    d0 = copy.deepcopy(d)
    d1 = copy.deepcopy(d)
    d2 = copy.deepcopy(d)

    ru.expand_env(d0)
    ru.expand_env(d1, env)
    ru.expand_env(d2, noenv)

    for k,v in d0.items(): assert(v == tc[d[k]][0])
    for k,v in d1.items(): assert(v == tc[d[k]][1])
    for k,v in d2.items(): assert(v == tc[d[k]][2])

    # test `ignore_missing` flag
    env = {'BAR' : 'bar'}
    src = 'foo${FIZ}.baz'
    tgt = 'foo.baz'
    assert(ru.expand_env(src, env                     ) == tgt)
    assert(ru.expand_env(src, env, ignore_missing=True) == tgt)

    with pytest.raises(ValueError):
        ru.expand_env(src, env, ignore_missing=False)
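
A minimal usage sketch of `ru.expand_env`, matching the behavior the test exercises:

import radical.utils as ru

env = {'BAR': 'bar'}

print(ru.expand_env('foo_${BAR}_baz',     env))   # 'foo_bar_baz'
print(ru.expand_env('foo_${BAR:buz}_baz', env))   # 'foo_bar_baz' (default unused)
print(ru.expand_env('foo_${FIZ:buz}_baz', env))   # 'foo_buz_baz' (default applies)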
Example 11
    def _configure(self):

        slurm_nodelist = os.environ.get('SLURM_NODELIST')
        if slurm_nodelist is None:
            msg = "$SLURM_NODELIST not set!"
            self._log.error(msg)
            raise RuntimeError(msg)

        # Parse SLURM nodefile environment variable
        slurm_nodes = hostlist.expand_hostlist(slurm_nodelist)
        self._log.info("Found SLURM_NODELIST %s. Expanded to: %s",
                       slurm_nodelist, slurm_nodes)

        # $SLURM_NPROCS = Total number of cores allocated for the current job
        slurm_nprocs_str = os.environ.get('SLURM_NPROCS')
        if slurm_nprocs_str is None:
            msg = "$SLURM_NPROCS not set!"
            self._log.error(msg)
            raise RuntimeError(msg)
        else:
            slurm_nprocs = int(slurm_nprocs_str)

        # $SLURM_NNODES = Total number of (partial) nodes in the job's resource allocation
        slurm_nnodes_str = os.environ.get('SLURM_NNODES')
        if slurm_nnodes_str is None:
            msg = "$SLURM_NNODES not set!"
            self._log.error(msg)
            raise RuntimeError(msg)
        else:
            slurm_nnodes = int(slurm_nnodes_str)

        # $SLURM_CPUS_ON_NODE = Number of cores per node (physically)
        slurm_cpus_on_node_str = os.environ.get('SLURM_CPUS_ON_NODE')
        if slurm_cpus_on_node_str is None:
            msg = "$SLURM_CPUS_ON_NODE not set!"
            self._log.error(msg)
            raise RuntimeError(msg)
        else:
            slurm_cpus_on_node = int(slurm_cpus_on_node_str)

        # Verify that $SLURM_NPROCS <= $SLURM_NNODES * $SLURM_CPUS_ON_NODE
        if not slurm_nprocs <= slurm_nnodes * slurm_cpus_on_node:
            self._log.warning(
                "$SLURM_NPROCS(%d) > $SLURM_NNODES(%d) * $SLURM_CPUS_ON_NODE(%d)",
                slurm_nprocs, slurm_nnodes, slurm_cpus_on_node)

        # Verify that $SLURM_NNODES == len($SLURM_NODELIST)
        if slurm_nnodes != len(slurm_nodes):
            self._log.error("$SLURM_NNODES(%d) != len($SLURM_NODELIST)(%d)",
                            slurm_nnodes, len(slurm_nodes))

        # Report the physical number of cores or the total number of cores
        # in case of a single partial node allocation.
        self.cores_per_node = self._cfg.get('cores_per_node', 0)
        self.gpus_per_node = self._cfg.get('gpus_per_node', 0)  # FIXME GPU
        self.mem_per_node = self._cfg.get('mem_per_node', 0)

        self.lfs_per_node = {
            'path': ru.expand_env(self._cfg.get('lfs_path_per_node')),
            'size': self._cfg.get('lfs_size_per_node', 0)
        }

        if not self.cores_per_node:
            self.cores_per_node = min(slurm_cpus_on_node, slurm_nprocs)

        # node names are unique, so can serve as node uids
        self.node_list = [[node, node] for node in slurm_nodes]

        self.lm_info['cores_per_node'] = self.cores_per_node
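
The `hostlist.expand_hostlist` call (from the `python-hostlist` package) can be tried directly on a made-up compressed node list:

import hostlist

nodes = hostlist.expand_hostlist('nid[00010-00012],login01')   # made-up value
print(nodes)   # ['nid00010', 'nid00011', 'nid00012', 'login01']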
Example 12
    def _configure(self):

        self._log.info("Configured to run on system with %s.", self.name)

        torque_nodefile = os.environ.get('PBS_NODEFILE')
        if torque_nodefile is None:
            msg = "$PBS_NODEFILE not set!"
            self._log.error(msg)
            raise RuntimeError(msg)

        # Parse the PBS nodefile
        torque_nodes = [line.strip() for line in open(torque_nodefile)]
        self._log.info("PBS_NODEFILE %s: %s", torque_nodefile, torque_nodes)

        # Number of cpus involved in allocation
        val = os.environ.get('PBS_NCPUS')
        if val:
            torque_num_cpus = int(val)
        else:
            msg = "$PBS_NCPUS not set! (new Torque version?)"
            torque_num_cpus = None
            self._log.warning(msg)

        # Number of nodes involved in allocation
        val = os.environ.get('PBS_NUM_NODES')
        if val:
            torque_num_nodes = int(val)
        else:
            msg = "$PBS_NUM_NODES not set! (old Torque version?)"
            torque_num_nodes = None
            self._log.warning(msg)

        torque_gpus_per_node = self._cfg.get('gpus_per_node', 0)
        torque_lfs_per_node = {
            'path': ru.expand_env(self._cfg.get('lfs_path_per_node')),
            'size': self._cfg.get('lfs_size_per_node', 0)
        }

        # Number of cores (processors) per node
        val = os.environ.get('PBS_NUM_PPN')
        if val:
            torque_cores_per_node = int(val)
        else:
            msg = "$PBS_NUM_PPN is not set!"
            torque_cores_per_node = None
            self._log.warning(msg)

        if self._cfg.get('cores_per_node'):
            cfg_cpn = self._cfg.get('cores_per_node')
            self._log.info('overwriting cores_per_node[%s] from cfg [%s]',
                           torque_cores_per_node, cfg_cpn)
            torque_cores_per_node = cfg_cpn

        if torque_cores_per_node in [None, 1]:
            # lets see if SAGA has been forthcoming with some information
            self._log.warning("fall back to $SAGA_PPN : %s",
                              os.environ.get('SAGA_PPN', None))
            torque_cores_per_node = int(
                os.environ.get('SAGA_PPN', torque_cores_per_node))

        # Number of entries in nodefile should be PBS_NUM_NODES * PBS_NUM_PPN
        torque_nodes_length = len(torque_nodes)
        torque_node_list = []
        for node in torque_nodes:
            if node not in torque_node_list:
                torque_node_list.append(node)

        # if torque_num_nodes and torque_cores_per_node and \
        #     torque_nodes_length < torque_num_nodes * torque_cores_per_node:
        #     msg = "Number of entries in $PBS_NODEFILE (%s) does not match with $PBS_NUM_NODES*$PBS_NUM_PPN (%s*%s)" % \
        #           (torque_nodes_length, torque_num_nodes,  torque_cores_per_node)
        #     raise RuntimeError(msg)

        # only unique node names
        torque_node_list_length = len(torque_node_list)
        self._log.debug("Node list: %s(%d)", torque_node_list,
                        torque_node_list_length)

        if torque_num_nodes and torque_cores_per_node:
            # Modern style Torque
            self.cores_per_node = torque_cores_per_node
        elif torque_num_cpus:
            # Blacklight style (TORQUE-2.3.13)
            self.cores_per_node = torque_num_cpus
        else:
            # Old style Torque (Should we just use this for all versions?)
            self.cores_per_node = torque_nodes_length // torque_node_list_length

        # node names are unique, so can serve as node uids
        self.node_list = [[node, node] for node in torque_node_list]
        self.gpus_per_node = torque_gpus_per_node
        self.lfs_per_node = torque_lfs_per_node
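
A hedged sketch of the cores-per-node fallback chain used above (`$PBS_NUM_PPN`, then the config, then `$SAGA_PPN`); the plain dict below is a stand-in for `self._cfg`:

import os

cfg = {}                                         # stand-in for self._cfg

val = os.environ.get('PBS_NUM_PPN')
cores_per_node = int(val) if val else None

# a config entry overrides whatever the batch system reported
if cfg.get('cores_per_node'):
    cores_per_node = cfg['cores_per_node']

# as a last resort, ask SAGA (guarding against an unset value)
if cores_per_node in (None, 1):
    cores_per_node = int(os.environ.get('SAGA_PPN', cores_per_node or 1))

print(cores_per_node)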
Example 13
    def _configure(self):
        # TODO: $NCPUS?!?! = 1 on archer

        pbspro_nodefile = os.environ.get('PBS_NODEFILE')

        if pbspro_nodefile is None:
            msg = "$PBS_NODEFILE not set!"
            self._log.error(msg)
            raise RuntimeError(msg)

        self._log.info("Found PBSPro $PBS_NODEFILE %s." % pbspro_nodefile)

        # Don't need to parse the content of the nodefile for PBSPro; only its
        # length is interesting, as it merely contains duplicate entries.
        pbspro_nodes        = [line.strip() for line in open(pbspro_nodefile)]
        pbspro_nodes_length = len(pbspro_nodes)

        # Number of Processors per Node
        val = os.environ.get('NUM_PPN')
        if not val:
            val = os.environ.get('SAGA_PPN')

        if not val:
            raise RuntimeError("$NUM_PPN / $SAGA_PPN not set!")

        pbspro_num_ppn = int(val)

        # Number of Nodes allocated
        val = os.environ.get('NODE_COUNT')
        if val:
            pbspro_node_count = int(val)
        else:
            pbspro_node_count = len(set(pbspro_nodes))
            self._log.warn("$NODE_COUNT not set - use %d" % pbspro_node_count)

        # Number of Parallel Environments
        val = os.environ.get('NUM_PES')
        if val:
            pbspro_num_pes = int(val)
        else:
            pbspro_num_pes = len(pbspro_nodes)
            self._log.warn("$NUM_PES not set - use %d" % pbspro_num_pes)
        try:
            pbspro_vnodes = self._parse_pbspro_vnodes()
        except:
            self._log.exception('node parsing failed')
            raise

        # Verify that $NUM_PES == $NODE_COUNT * $NUM_PPN == len($PBS_NODEFILE)
        if not (pbspro_node_count * pbspro_num_ppn == pbspro_num_pes == pbspro_nodes_length):
            self._log.warning("NUM_PES != NODE_COUNT * NUM_PPN != len($PBS_NODEFILE)")

        # node names are unique, so can serve as node uids
        self.node_list      = [[node, node] for node in pbspro_vnodes]
        self.cores_per_node = pbspro_num_ppn
        self.gpus_per_node  = self._cfg.get('gpus_per_node', 0)  # FIXME GPU

        self.lfs_per_node   = {'path' : ru.expand_env(
                                           self._cfg.get('lfs_path_per_node')),
                               'size' :    self._cfg.get('lfs_size_per_node', 0)
                              }
Example 14
    def _configure(self):

        slurm_nodelist = os.environ.get('SLURM_NODELIST')
        if slurm_nodelist is None:
            msg = "$SLURM_NODELIST not set!"
            self._log.error(msg)
            raise RuntimeError(msg)

        # Parse SLURM nodefile environment variable
        slurm_nodes = hostlist.expand_hostlist(slurm_nodelist)
        self._log.info("Found SLURM_NODELIST %s. Expanded to: %s", slurm_nodelist, slurm_nodes)

        # $SLURM_NPROCS = Total number of cores allocated for the current job
        slurm_nprocs_str = os.environ.get('SLURM_NPROCS')
        if slurm_nprocs_str is None:
            msg = "$SLURM_NPROCS not set!"
            self._log.error(msg)
            raise RuntimeError(msg)
        else:
            slurm_nprocs = int(slurm_nprocs_str)

        # $SLURM_NNODES = Total number of (partial) nodes in the job's resource allocation
        slurm_nnodes_str = os.environ.get('SLURM_NNODES')
        if slurm_nnodes_str is None:
            msg = "$SLURM_NNODES not set!"
            self._log.error(msg)
            raise RuntimeError(msg)
        else:
            slurm_nnodes = int(slurm_nnodes_str)

        # $SLURM_CPUS_ON_NODE = Number of cores per node (physically)
        slurm_cpus_on_node_str = os.environ.get('SLURM_CPUS_ON_NODE')
        if slurm_cpus_on_node_str is None:
            msg = "$SLURM_CPUS_ON_NODE not set!"
            self._log.error(msg)
            raise RuntimeError(msg)
        else:
            slurm_cpus_on_node = int(slurm_cpus_on_node_str)

        # Verify that $SLURM_NPROCS <= $SLURM_NNODES * $SLURM_CPUS_ON_NODE
        if not slurm_nprocs <= slurm_nnodes * slurm_cpus_on_node:
            self._log.warning("$SLURM_NPROCS(%d) <= $SLURM_NNODES(%d) * $SLURM_CPUS_ON_NODE(%d)",
                            slurm_nprocs, slurm_nnodes, slurm_cpus_on_node)

        # Verify that $SLURM_NNODES == len($SLURM_NODELIST)
        if slurm_nnodes != len(slurm_nodes):
            self._log.error("$SLURM_NNODES(%d) != len($SLURM_NODELIST)(%d)",
                           slurm_nnodes, len(slurm_nodes))

        # Report the physical number of cores or the total number of cores
        # in case of a single partial node allocation.
        self.cores_per_node = self._cfg.get('cores_per_node', 0)
        self.gpus_per_node  = self._cfg.get('gpus_per_node',  0)  # FIXME GPU

        self.lfs_per_node   = {'path' : ru.expand_env(
                                           self._cfg.get('lfs_path_per_node')),
                               'size' :    self._cfg.get('lfs_size_per_node', 0)
                              }

        if not self.cores_per_node:
            self.cores_per_node = min(slurm_cpus_on_node, slurm_nprocs)


        # node names are unique, so can serve as node uids
        self.node_list = [[node, node] for node in slurm_nodes]

        self.lm_info['cores_per_node'] = self.cores_per_node
Example 15
    def add_md_stage(self, exchanged_from=None, sid=None, last=False):

        self._prof.prof('add_md_start', uid=self.rid)
        self._cycle += 1
        self._log.debug('%5s %s add md', self.rid, self._uid)

      # task = re.Task(from_dict=self._workload['md'])
      # task.name = 'mdtsk-%s-%s' % (self.rid, self.cycle)
        sandbox     = '%s.%04d.md' % (self.rid, self.cycle)
        link_inputs = list()

        # link initial data
        link_inputs += expand_ln(self._workload.md.inputs,
                     'pilot:///%s' % self._workload.data.inputs,
                     'task://', self.rid, self.cycle)

        if self._cycle == 0:
            # link initial data
            link_inputs += expand_ln(self._workload.md.inputs_0,
                         'pilot:///%s' % self._workload.data.inputs,
                         'task://',
                         self.rid, self.cycle)
        else:
            # get data from previous task
            t = last_task(self)
            if exchanged_from:
                self._log.debug('Exchange from %s', exchanged_from.name)
                link_inputs += expand_ln(self._workload.md.ex_2_md,
                        'pilot:///%s' % (exchanged_from.sandbox),
                        'task://',
                        self.rid, self.cycle)
            else:
                # FIXME: this apparently can't happen
                link_inputs += expand_ln(self._workload.md.md_2_md,
                         'resource:///%s' % (t.sandbox),
                         'task://',
                         self.rid, self.cycle)

        copy_outputs = expand_ln(self._workload.md.outputs,
                         'task://',
                         'client:///%s' % self._workload.data.outputs,
                         self.rid, self.cycle)

        if last:
            copy_outputs += expand_ln(self._workload.md.outputs_n,
                         'task://',
                         'client:///%s' % self._workload.data.outputs,
                         self.rid, self.cycle)

        # TODO: filter out custom keys from that dict before deepcopy
        env   = {'REPEX_RID'   : str(self.rid),
                 'REPEX_CYCLE' : str(self.cycle)}
        tds   = copy.deepcopy(self._workload['md']['descriptions'])
        first = 0
        last  = len(tds) - 1
        for idx, td in enumerate(tds):

            stage = re.Stage()
            task  = re.Task()
            td    = ru.expand_env(td, env=env)

            for k,v in td.items():
                setattr(task, k, v)

            if self._workload.pre_exec:
                if task.pre_exec:
                    task.pre_exec.extend(self._workload.pre_exec)
                else:
                    task.pre_exec = self._workload.pre_exec

            task.name    = '%s.%04d.%02d.md' % (self.rid, self.cycle, idx)
            task.sandbox = sandbox

            if idx == first:
                task.link_input_data = link_inputs

            if idx == last:
                task.download_output_data = copy_outputs
                stage.post_exec = self.check_exchange

            stage.add_tasks(task)
            self.add_stages(stage)
            self._log.debug('%5s add md: %s', self.rid, task.name)

        self._prof.prof('add_md_stop', uid=self.rid)
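
The per-cycle placeholder substitution can be illustrated with `ru.expand_env` on a copied description dict; the description content below is made up:

import copy
import radical.utils as ru

env = {'REPEX_RID': 'r0', 'REPEX_CYCLE': '3'}    # per-replica values
td  = {'executable': 'run_md.sh',                # made-up task description
       'arguments' : '--rid ${REPEX_RID}',
       'sandbox'   : 'cycle_${REPEX_CYCLE}'}

td = copy.deepcopy(td)
ru.expand_env(td, env)                           # dict values are expanded in place
print(td['arguments'], td['sandbox'])            # '--rid r0' 'cycle_3'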
Example 16
    def _configure(self):

        self._log.info("Configured to run on system with %s.", self.name)

        torque_nodefile = os.environ.get('PBS_NODEFILE')
        if torque_nodefile is None:
            msg = "$PBS_NODEFILE not set!"
            self._log.error(msg)
            raise RuntimeError(msg)

        # Parse the PBS nodefile
        torque_nodes = [line.strip() for line in open(torque_nodefile)]
        self._log.info("PBS_NODEFILE %s: %s", torque_nodefile, torque_nodes)

        # Number of cpus involved in allocation
        val = os.environ.get('PBS_NCPUS')
        if val:
            torque_num_cpus = int(val)
        else:
            msg = "$PBS_NCPUS not set! (new Torque version?)"
            torque_num_cpus = None
            self._log.warning(msg)

        # Number of nodes involved in allocation
        val = os.environ.get('PBS_NUM_NODES')
        if val:
            torque_num_nodes = int(val)
        else:
            msg = "$PBS_NUM_NODES not set! (old Torque version?)"
            torque_num_nodes = None
            self._log.warning(msg)

        torque_gpus_per_node  = self._cfg.get('gpus_per_node', 0)
        torque_lfs_per_node   = {'path' : ru.expand_env(
                                             self._cfg.get('lfs_path_per_node')),
                                 'size' :    self._cfg.get('lfs_size_per_node', 0)
                                }

        # Number of cores (processors) per node
        val = os.environ.get('PBS_NUM_PPN')
        if val:
            torque_cores_per_node = int(val)
        else:
            msg = "$PBS_NUM_PPN is not set!"
            torque_cores_per_node = None
            self._log.warning(msg)

        if self._cfg.get('cores_per_node'):
            cfg_cpn = self._cfg.get('cores_per_node')
            self._log.info('overwriting cores_per_node[%s] from cfg [%s]', 
                    torque_cores_per_node, cfg_cpn)
            torque_cores_per_node = cfg_cpn


        if torque_cores_per_node in [None, 1]:
            # lets see if SAGA has been forthcoming with some information
            self._log.warning("fall back to $SAGA_PPN : %s", os.environ.get ('SAGA_PPN', None))
            torque_cores_per_node = int(os.environ.get('SAGA_PPN', torque_cores_per_node))

        # Number of entries in nodefile should be PBS_NUM_NODES * PBS_NUM_PPN
        torque_nodes_length = len(torque_nodes)
        torque_node_list = []
        for node in torque_nodes:
            if node not in torque_node_list:
                torque_node_list.append(node)

      # if torque_num_nodes and torque_cores_per_node and \
      #     torque_nodes_length < torque_num_nodes * torque_cores_per_node:
      #     msg = "Number of entries in $PBS_NODEFILE (%s) does not match with $PBS_NUM_NODES*$PBS_NUM_PPN (%s*%s)" % \
      #           (torque_nodes_length, torque_num_nodes,  torque_cores_per_node)
      #     raise RuntimeError(msg)

        # only unique node names
        torque_node_list_length = len(torque_node_list)
        self._log.debug("Node list: %s(%d)", torque_node_list, torque_node_list_length)

        if torque_num_nodes and torque_cores_per_node:
            # Modern style Torque
            self.cores_per_node = torque_cores_per_node
        elif torque_num_cpus:
            # Blacklight style (TORQUE-2.3.13)
            self.cores_per_node = torque_num_cpus
        else:
            # Old style Torque (Should we just use this for all versions?)
            self.cores_per_node = torque_nodes_length // torque_node_list_length

        # node names are unique, so can serve as node uids
        self.node_list     = [[node, node] for node in torque_node_list]
        self.gpus_per_node = torque_gpus_per_node
        self.lfs_per_node  = torque_lfs_per_node