Code example #1
def test_abbrev_nodenames_nochange_cobalt():
    env_resources = EnvResources()
    # Test Cobalt abbrev
    exp_names = ['21', '22', '137', '138', '1234', '11234']
    env_resources.schedular = 'Cobalt'
    abbrev_names = env_resources.abbrev_nodenames(exp_names)
    assert abbrev_names == exp_names, "Abbreviated names returned do not match expected"
    del env_resources
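For reference, a minimal sketch of constructing EnvResources with custom node-list environment variables. The keyword names are taken from the EnvResources(...) call in code example #3; the variable names used here are placeholders, not libEnsemble defaults.

def make_custom_env_resources():
    # Sketch only: keyword names mirror the EnvResources(...) call in code
    # example #3; the environment-variable names are illustrative placeholders.
    return EnvResources(
        nodelist_env_slurm='MY_SLURM_NODELIST',
        nodelist_env_cobalt='MY_COBALT_PARTNAME',
        nodelist_env_lsf='MY_LSB_HOSTS',
        nodelist_env_lsf_shortform='MY_LSB_MCPU_HOSTS')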
Code example #2
def test_abbrev_nodenames_nochange_slurm():
    env_resources = EnvResources()
    # Test Slurm abbrev
    exp_names = [
        'knl-0019', 'knl-0021', 'knl-0022', 'knl-0137', 'knl-0138', 'knl-0139',
        'knl-2345'
    ]
    env_resources.schedular = 'Slurm'
    abbrev_names = env_resources.abbrev_nodenames(exp_names)
    assert abbrev_names == exp_names, "Abbreviated names returned do not match expected"
    del env_resources
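A complementary sketch for the case where names are abbreviated: it assumes the Slurm-style abbreviation truncates each hostname at the first '.' (dropping any domain suffix). Verify against the actual abbrev_nodenames implementation before relying on this behaviour.

def test_abbrev_nodenames_change_slurm_sketch():
    # Sketch only: assumes Slurm abbreviation drops everything after the first '.'.
    env_resources = EnvResources()
    full_names = ['knl-0019.some.domain', 'knl-0021.some.domain']
    exp_names = ['knl-0019', 'knl-0021']
    env_resources.schedular = 'Slurm'
    abbrev_names = env_resources.abbrev_nodenames(full_names)
    assert abbrev_names == exp_names, "Abbreviated names returned do not match expected"
    del env_resources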
Code example #3
class Resources:
    """Provide system resources to libEnsemble and job controller.

    This is initialised when the job_controller is created with auto_resources set to True.

    **Object Attributes:**

    These are set on initialisation.

    :ivar string top_level_dir: Directory in which to search for a worker_list file.
    :ivar boolean central_mode: If true, then running in central mode, else distributed.
    :ivar EnvResources env_resources: An object storing environment variables used by resources.
    :ivar list global_nodelist: A list of all nodes available for running user applications.
    :ivar int logical_cores_avail_per_node: Logical cores (including SMT threads) available on a node.
    :ivar int physical_cores_avail_per_node: Physical cores available on a node.
    :ivar WorkerResources worker_resources: An object that can contain worker specific resources.
    """
    def __init__(self,
                 top_level_dir=None,
                 central_mode=False,
                 launcher=None,
                 nodelist_env_slurm=None,
                 nodelist_env_cobalt=None,
                 nodelist_env_lsf=None,
                 nodelist_env_lsf_shortform=None):
        """Initialise new Resources instance

        Works out the compute resources available for current allocation, including
        node list and cores/hardware threads available within nodes.

        Parameters
        ----------

        top_level_dir: string, optional
            Directory libEnsemble runs in (default is current working directory)

        central_mode: boolean, optional
            If true, then running in central mode, else distributed.
            Central mode means libE processes (manager and workers) are grouped together and
            do not share nodes with applications. Distributed mode means Workers share nodes
            with applications.

        launcher: string, optional
            The name of the job launcher such as mpirun or aprun. This may be used to obtain
            intra-node information by launching a probing job onto the compute nodes.
            If not present, the local node will be used to obtain this information.

        nodelist_env_slurm: string, optional
            The environment variable giving a node list in Slurm format (Default: Uses SLURM_NODELIST)
            Note: This is only queried if a worker_list file is not provided and auto_resources=True.

        nodelist_env_cobalt: string, optional
            The environment variable giving a node list in Cobalt format (Default: Uses COBALT_PARTNAME)
            Note: This is only queried if a worker_list file is not provided and auto_resources=True.

        nodelist_env_lsf: string, optional
            The environment variable giving a node list in LSF format (Default: Uses LSB_HOSTS)
            Note: This is only queried if a worker_list file is not provided and auto_resources=True.

        nodelist_env_lsf_shortform: string, optional
            The environment variable giving a node list in LSF short-form format (Default: Uses LSB_MCPU_HOSTS)
            Note: This is only queried if a worker_list file is not provided and auto_resources=True.

        """

        self.top_level_dir = top_level_dir or os.getcwd()
        self.central_mode = central_mode
        if self.central_mode:
            logger.debug('Running in central mode')

        self.env_resources = EnvResources(
            nodelist_env_slurm=nodelist_env_slurm,
            nodelist_env_cobalt=nodelist_env_cobalt,
            nodelist_env_lsf=nodelist_env_lsf,
            nodelist_env_lsf_shortform=nodelist_env_lsf_shortform)

        # This is global nodelist avail to workers - may change to global_worker_nodelist
        self.global_nodelist = Resources.get_global_nodelist(
            rundir=self.top_level_dir, env_resources=self.env_resources)
        remote_detect = False
        if socket.gethostname() not in self.global_nodelist:
            remote_detect = True

        cores_info = node_resources.get_sub_node_resources(
            launcher=launcher,
            remote_mode=remote_detect,
            env_resources=self.env_resources)
        self.logical_cores_avail_per_node = cores_info[0]
        self.physical_cores_avail_per_node = cores_info[1]

        self.libE_nodes = None
        self.worker_resources = None

    def add_comm_info(self, libE_nodes):
        """Add comms specific information to resources

        Removes libEnsemble nodes from nodelist if in central_mode.
        """
        self.libE_nodes = self.env_resources.abbrev_nodenames(libE_nodes)
        libE_nodes_in_list = list(
            filter(lambda x: x in self.libE_nodes, self.global_nodelist))
        if libE_nodes_in_list:
            if self.central_mode and len(self.global_nodelist) > 1:
                self.global_nodelist = Resources.remove_nodes(
                    self.global_nodelist, self.libE_nodes)
                if not self.global_nodelist:
                    logger.warning(
                        "Warning. Node-list for sub-jobs is empty. Remove central_mode or add nodes"
                    )
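    # Example (sketch): with global_nodelist == ['node1', 'node2', 'node3'] and
    # libEnsemble running on 'node1', calling add_comm_info(['node1']) in
    # central mode leaves ['node2', 'node3'] for user applications.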

    def set_worker_resources(self, workerid, comm):
        self.worker_resources = WorkerResources(workerid, comm, self)

    @staticmethod
    def get_MPI_variant():
        """Returns MPI base implementation

        Returns
        -------
        mpi_variant: string
            MPI variant: 'aprun', 'jsrun', 'srun', 'mpich', or 'openmpi'
            (None if no known MPI launcher is detected)

        """

        try:
            subprocess.check_call(['aprun', '--version'],
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE)
            return 'aprun'
        except OSError:
            pass

        try:
            subprocess.check_call(['jsrun', '--version'],
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE)
            return 'jsrun'
        except OSError:
            pass

        try:
            # Explore mpi4py.MPI.get_vendor() and mpi4py.MPI.Get_library_version() for mpi4py
            try_mpich = subprocess.Popen(['mpirun', '-npernode'],
                                         stdout=subprocess.PIPE,
                                         stderr=subprocess.STDOUT)
            stdout, _ = try_mpich.communicate()
            if 'unrecognized argument npernode' in stdout.decode():
                return 'mpich'
            return 'openmpi'
        except Exception:
            pass

        try:
            subprocess.check_call(['srun', '--version'],
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE)
            return 'srun'
        except OSError:
            pass
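        # Example (sketch): callers might branch on the detected variant, e.g.
        #   variant = Resources.get_MPI_variant()  # e.g. 'mpich' under an MPICH-based MPI
        #   if variant in ('aprun', 'jsrun', 'srun'):
        #       ...  # use the system launcher directly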

    # ---------------------------------------------------------------------------

    # This is for central mode where libE nodes will not share with app nodes
    @staticmethod
    def remove_nodes(global_nodelist_in, remove_list):
        """Any nodes in remove_list are removed from the global nodelist"""
        global_nodelist = list(
            filter(lambda x: x not in remove_list, global_nodelist_in))
        return global_nodelist
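    # Example (sketch):
    #   Resources.remove_nodes(['node1', 'node2', 'node3'], ['node2'])
    #   -> ['node1', 'node3']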

    @staticmethod
    def best_split(a, n):
        """Create the most even split of list a into n parts and return list of lists"""
        k, m = divmod(len(a), n)
        return (a[i * k + min(i, m):(i + 1) * k + min(i + 1, m)]
                for i in range(n))
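    # Example (sketch): best_split returns a generator of n contiguous slices, e.g.
    #   list(Resources.best_split(['n1', 'n2', 'n3', 'n4', 'n5'], 2))
    #   -> [['n1', 'n2', 'n3'], ['n4', 'n5']]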

    @staticmethod
    def get_global_nodelist(rundir=None, env_resources=None):
        """
        Return the list of nodes available to all libEnsemble workers

        If a worker_list file exists, this is used; otherwise the environment
        is interrogated for a node list. If a dedicated manager node is used,
        then a worker_list file is recommended.

        In central mode, any node with a libE worker is removed from the list.
        """
        top_level_dir = rundir or os.getcwd()
        worker_list_file = os.path.join(top_level_dir, 'worker_list')
        global_nodelist = []
        if os.path.isfile(worker_list_file):
            logger.debug(
                "worker_list found - getting nodelist from worker_list")
            with open(worker_list_file, 'r') as f:
                for line in f:
                    global_nodelist.append(line.rstrip())
        else:
            logger.debug(
                "No worker_list found - searching for nodelist in environment")
            if env_resources:
                global_nodelist = env_resources.get_nodelist()

            if not global_nodelist:
                # Assume a standalone machine
                logger.info(
                    "Can not find nodelist from environment. Assuming standalone"
                )
                global_nodelist.append(socket.gethostname())

        if global_nodelist:
            return global_nodelist
        raise ResourcesException("Error. global_nodelist is empty")
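
A hedged usage sketch of the Resources class, based only on the methods shown above; the import path and launcher name are assumptions and may differ between libEnsemble versions and systems.

# Usage sketch only: module path and launcher are assumptions.
# from libensemble.resources import Resources

def resources_usage_sketch():
    resources = Resources(central_mode=True, launcher='mpirun')
    # In central mode, strip the node hosting libEnsemble from the application
    # node list ('my-libe-node' is a placeholder for that node's name).
    resources.add_comm_info(libE_nodes=['my-libe-node'])
    print(resources.global_nodelist)
    print(resources.physical_cores_avail_per_node,
          resources.logical_cores_avail_per_node)
    return resources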