Example #1
def test_shortnames_nochange_cobalt():
    env_resources = EnvResources()
    # Test Cobalt abbrev
    exp_names = ['21', '22', '137', '138', '1234', '11234']
    env_resources.scheduler = 'Cobalt'
    abbrev_names = env_resources.shortnames(exp_names)
    assert abbrev_names == exp_names, "Abbreviated names returned do not match expected"
    del env_resources
Example #2
def test_shortnames_slurm():
    env_resources = EnvResources()
    # Test Slurm abbrev
    exp_names = ['knl-0019', 'knl-0021', 'knl-0022']
    full_names = [
        'knl-0019.some.suffix', 'knl-0021.some.suffix', 'knl-0022.diff_suffix'
    ]
    env_resources.scheduler = 'Slurm'
    abbrev_names = env_resources.shortnames(full_names)
    assert abbrev_names == exp_names, "Abbreviated names returned do not match expected"
    del env_resources
Example #3
def test_shortnames_nochange_slurm():
    env_resources = EnvResources()
    # Test Slurm abbrev
    exp_names = [
        'knl-0019', 'knl-0021', 'knl-0022', 'knl-0137', 'knl-0138', 'knl-0139',
        'knl-2345'
    ]
    env_resources.scheduler = 'Slurm'
    abbrev_names = env_resources.shortnames(exp_names)
    assert abbrev_names == exp_names, "Abbreviated names returned do not match expected"
    del env_resources
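The three tests above exercise EnvResources.shortnames (they assume EnvResources has been imported from libEnsemble). A minimal sketch of the behavior they assert, assuming hostnames are simply trimmed at the first dot; shortnames_sketch is a hypothetical stand-in, not the library implementation:

def shortnames_sketch(names):
    # Keep only the part of each hostname before the first '.', so that
    # 'knl-0019.some.suffix' becomes 'knl-0019' while bare Cobalt node IDs
    # such as '1234' pass through unchanged.
    return [name.split('.', 1)[0] for name in names]

assert shortnames_sketch(['knl-0019.some.suffix', '1234']) == ['knl-0019', '1234']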
Example #4
class Resources:
    """Provides system resources to libEnsemble and executor.

    This is initialized when the executor is created with auto_resources set to true.

    **Object Attributes:**

    These are set on initialization.

    :ivar string top_level_dir: Directory in which to search for the node_list file
    :ivar boolean central_mode: If true, then running in central mode; otherwise distributed
    :ivar EnvResources env_resources: An object storing environment variables used by resources
    :ivar list global_nodelist: A list of all nodes available for running user applications
    :ivar int logical_cores_avail_per_node: Logical cores (including SMT threads) available on a node
    :ivar int physical_cores_avail_per_node: Physical cores available on a node
    :ivar WorkerResources worker_resources: An object that can contain worker specific resources
    """

    DEFAULT_NODEFILE = 'node_list'

    def __init__(self,
                 top_level_dir=None,
                 central_mode=False,
                 zero_resource_workers=[],
                 allow_oversubscribe=False,
                 launcher=None,
                 cores_on_node=None,
                 node_file=None,
                 nodelist_env_slurm=None,
                 nodelist_env_cobalt=None,
                 nodelist_env_lsf=None,
                 nodelist_env_lsf_shortform=None):
        """Initializes a new Resources instance

        Determines the compute resources available for current allocation, including
        node list and cores/hardware threads available within nodes.

        Parameters
        ----------

        top_level_dir: string, optional
            Directory libEnsemble runs in (default is current working directory)

        central_mode: boolean, optional
            If true, then running in central mode, otherwise distributed.
            Central mode means libE processes (manager and workers) are grouped together and
            do not share nodes with applications. Distributed mode means Workers share nodes
            with applications.

        zero_resource_workers: list of ints, optional
            List of workers that require no resources.

        allow_oversubscribe: boolean, optional
            If false, then resources will raise an error if task process
            counts exceed the CPUs available to the worker, as detected by
            auto_resources. Larger node counts will always raise an error.
            When auto_resources is off, this argument is ignored.

        launcher: string, optional
            The name of the job launcher, such as mpirun or aprun. This may be used to obtain
            intranode information by launching a probing job onto the compute nodes.
            If not present, the local node will be used to obtain this information.

        cores_on_node: tuple (int,int), optional
            If supplied gives (physical cores, logical cores) for the nodes. If not supplied,
            this will be auto-detected.

        node_file: string, optional
            If supplied, gives the name of a file in the run directory to use as a node-list
            for use by libEnsemble. Defaults to a file named 'node_list'. If the file does
            not exist, then the node-list will be auto-detected.

        nodelist_env_slurm: string, optional
            The environment variable giving a node list in Slurm format (Default: uses SLURM_NODELIST).
            Note: This is queried only if a node_list file is not provided and auto_resources=True.

        nodelist_env_cobalt: string, optional
            The environment variable giving a node list in Cobalt format (Default: uses COBALT_PARTNAME).
            Note: This is queried only if a node_list file is not provided and auto_resources=True.

        nodelist_env_lsf: string, optional
            The environment variable giving a node list in LSF format (Default: uses LSB_HOSTS).
            Note: This is queried only if a node_list file is not provided and auto_resources=True.

        nodelist_env_lsf_shortform: string, optional
            The environment variable giving a node list in LSF short-form format (Default: uses LSB_MCPU_HOSTS).
            Note: This is queried only if a node_list file is not provided and auto_resources=True.

        """

        self.top_level_dir = top_level_dir or os.getcwd()
        self.central_mode = central_mode
        if self.central_mode:
            logger.debug('Running in central mode')
        self.allow_oversubscribe = allow_oversubscribe

        self.env_resources = EnvResources(
            nodelist_env_slurm=nodelist_env_slurm,
            nodelist_env_cobalt=nodelist_env_cobalt,
            nodelist_env_lsf=nodelist_env_lsf,
            nodelist_env_lsf_shortform=nodelist_env_lsf_shortform)

        # This is global nodelist avail to workers - may change to global_worker_nodelist
        self.local_host = self.env_resources.shortnames([socket.gethostname()])[0]
        if node_file is None:
            node_file = Resources.DEFAULT_NODEFILE
        self.global_nodelist = Resources.get_global_nodelist(
            node_file=node_file,
            rundir=self.top_level_dir,
            env_resources=self.env_resources)
        self.launcher = launcher
        remote_detect = False
        if self.local_host not in self.global_nodelist:
            remote_detect = True

        if not cores_on_node:
            cores_on_node = \
                node_resources.get_sub_node_resources(launcher=self.launcher,
                                                      remote_mode=remote_detect,
                                                      env_resources=self.env_resources)
        self.physical_cores_avail_per_node = cores_on_node[0]
        self.logical_cores_avail_per_node = cores_on_node[1]
        self.libE_nodes = None
        self.worker_resources = None
        self.zero_resource_workers = zero_resource_workers

    def add_comm_info(self, libE_nodes):
        """Adds comms-specific information to resources

        Removes libEnsemble nodes from nodelist if in central_mode.
        """
        self.libE_nodes = self.env_resources.shortnames(libE_nodes)
        libE_nodes_in_list = list(
            filter(lambda x: x in self.libE_nodes, self.global_nodelist))
        if libE_nodes_in_list:
            if self.central_mode and len(self.global_nodelist) > 1:
                self.global_nodelist = Resources.remove_nodes(
                    self.global_nodelist, self.libE_nodes)
                if not self.global_nodelist:
                    logger.warning(
                        "Warning. Node-list for tasks is empty. Remove central_mode or add nodes"
                    )
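        # For example (hypothetical values): with global_nodelist
        # ['knl-0019', 'knl-0020', 'knl-0021'] and libE_nodes ['knl-0019'],
        # central mode leaves ['knl-0020', 'knl-0021'] for applications.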

    def set_worker_resources(self, workerid, comm):
        self.worker_resources = WorkerResources(workerid, comm, self)

    @staticmethod
    def get_MPI_variant():
        """Returns MPI base implementation

        Returns
        -------
        mpi_variant: string
            MPI variant: 'aprun', 'jsrun', 'mpich', 'openmpi', or 'srun'

        """

        try:
            subprocess.check_call(['aprun', '--version'],
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE)
            return 'aprun'
        except OSError:
            pass

        try:
            subprocess.check_call(['jsrun', '--version'],
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE)
            return 'jsrun'
        except OSError:
            pass

        try:
            # Explore mpi4py.MPI.get_vendor() and mpi4py.MPI.Get_library_version() for mpi4py
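            # Open MPI's mpirun accepts -npernode, while MPICH-based launchers
            # reject it with an "unrecognized argument" message; that difference
            # is used below to tell the two apart.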
            try_mpich = subprocess.Popen(['mpirun', '-npernode'],
                                         stdout=subprocess.PIPE,
                                         stderr=subprocess.STDOUT)
            stdout, _ = try_mpich.communicate()
            if 'unrecognized argument npernode' in stdout.decode():
                return 'mpich'
            return 'openmpi'
        except Exception:
            pass

        try:
            subprocess.check_call(['srun', '--version'],
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE)
            return 'srun'
        except OSError:
            pass

    # ---------------------------------------------------------------------------

    # This is for central mode where libE nodes will not share with app nodes
    @staticmethod
    def remove_nodes(global_nodelist_in, remove_list):
        """Removes any nodes in remove_list from the global nodelist"""
        global_nodelist = list(
            filter(lambda x: x not in remove_list, global_nodelist_in))
        return global_nodelist
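    # For example: Resources.remove_nodes(['n1', 'n2', 'n3'], ['n2'])
    # returns ['n1', 'n3']. (Hypothetical node names.)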

    @staticmethod
    def best_split(a, n):
        """Creates the most even split of list a into n parts and return list of lists"""
        k, m = divmod(len(a), n)
        return (a[i * k + min(i, m):(i + 1) * k + min(i + 1, m)]
                for i in range(n))
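    # For example: list(Resources.best_split(list(range(10)), 3)) gives
    # [[0, 1, 2, 3], [4, 5, 6], [7, 8, 9]]; the first len(a) % n parts
    # receive one extra element.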

    @staticmethod
    def get_global_nodelist(node_file=DEFAULT_NODEFILE,
                            rundir=None,
                            env_resources=None):
        """
        Returns the list of nodes available to all libEnsemble workers.

        If a node_file exists, it is used; otherwise the environment
        is interrogated for a node list. If a dedicated manager node is used,
        then a node_file is recommended.

        In central mode, any node with a libE worker is removed from the list.
        """
        top_level_dir = rundir or os.getcwd()
        node_filepath = os.path.join(top_level_dir, node_file)
        global_nodelist = []
        if os.path.isfile(node_filepath):
            logger.debug("node_file found - getting nodelist from node_file")
            with open(node_filepath, 'r') as f:
                for line in f:
                    global_nodelist.append(line.rstrip())
            if env_resources:
                global_nodelist = env_resources.shortnames(global_nodelist)
        else:
            logger.debug(
                "No node_file found - searching for nodelist in environment")
            if env_resources:
                global_nodelist = env_resources.get_nodelist()

            if not global_nodelist:
                # Assume a standalone machine
                logger.info(
                    "Can not find nodelist from environment. Assuming standalone"
                )
                global_nodelist.append(
                    env_resources.shortnames([socket.gethostname()])[0])

        if global_nodelist:
            return global_nodelist
        raise ResourcesException("Error. global_nodelist is empty")
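A minimal usage sketch (hypothetical driver code, not part of libEnsemble): the executor normally constructs Resources itself when auto_resources is enabled, but it can also be built directly. Supplying cores_on_node here skips the sub-node auto-detection step.

resources = Resources(central_mode=True,
                      node_file='node_list',
                      cores_on_node=(64, 256))
print(resources.global_nodelist)
print(resources.physical_cores_avail_per_node,
      resources.logical_cores_avail_per_node)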