Example #1
    def test_configure(self, mocked_init):

        session, configs = self.setUp()

        component = Default(cfg=None, session=None)
        component._cfg        = mock.Mock()
        component._log        = ru.Logger('dummy')
        component._rp_version = '0.0'
        component._session    = session

        component._pmgr       = 'pmgr.0'
        component._prof       = ru.Config(cfg = {'enabled': False})
        component._cache_lock = ru.Lock()
        component._cache      = dict()
        component._sandboxes  = dict()

        component._mod_dir    = os.path.dirname(os.path.abspath(__file__))
        component._root_dir   = "%s/../../src/radical/pilot/" % component._mod_dir
        component._conf_dir   = "%s/configs/" % component._root_dir

        component._rp_version    = rp.version
        component._rp_sdist_name = rp.sdist_name
        component._rp_sdist_path = rp.sdist_path

        resource = 'local.localhost'
        rcfg     = configs.local.localhost

        pilot    = {
                        'uid'         : 'pilot.0000',
                        'description' : {'cores'          : 10,
                                         'gpus'           : 2,
                                         'queue'          : 'default',
                                         'project'        : 'foo',
                                         'job_name'       : None,
                                         'runtime'        : 10,
                                         'app_comm'       : 0,
                                         'cleanup'        : 0,
                                         'memory'         : 0,
                                         'candidate_hosts': None,
                                         }
                   }
        ret = component._prepare_pilot(resource, rcfg, pilot, {})
        assert(ret['jd'].name == 'pilot.0000')

        pilot    = {
                        'uid'         : 'pilot.0000',
                        'description' : {'cores'          : 10,
                                         'gpus'           : 2,
                                         'queue'          : 'default',
                                         'project'        : 'foo',
                                         'job_name'       : 'bar',
                                         'runtime'        : 10,
                                         'app_comm'       : 0,
                                         'cleanup'        : 0,
                                         'memory'         : 0,
                                         'candidate_hosts': None,
                                         }
                   }
        ret = component._prepare_pilot(resource, rcfg, pilot, {})
        assert(ret['jd'].name == 'bar')
Example #2
    def __init__ (self, default=True, uid=None):
        """
        default: bool
        ret:     None
        """

        simple_base = super  (Session, self)
        simple_base.__init__ (uid=uid)

        self._logger = ru.Logger('radical.saga')

        # if the default session is expected, we point our context list to the
        # shared list of the default session singleton.  Otherwise, we create
        # a private list which is not populated.

        # a session also has a lease manager, for adaptors in this session to use.

        if  default :
            default_session     = DefaultSession (uid=self._id)
            self.contexts       = copy.deepcopy(default_session.contexts)
            self._lease_manager = default_session._lease_manager
        else :
            self.contexts       = _ContextList (session=self)

            # FIXME: at the moment, the lease manager is owned by the session.
            # However, the pty layer is the main user of the lease manager,
            # and we thus keep the lease manager options in the pty subsection.
            # So here we are, in the session, evaluating the pty config options.
            self._cfg = ru.Config(module='radical.saga.session')
            self._lease_manager = ru.LeaseManager (
                    max_pool_size=self._cfg.pty.connection_pool_size,
                    max_pool_wait=self._cfg.pty.connection_pool_wait,
                    max_obj_age  =self._cfg.pty.connection_pool_ttl
                    )
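
A brief usage sketch of the `default` flag described above; minimal and hypothetical, assuming `radical.saga` is importable:

import radical.saga as rs

# default=True shares the context list of the default session singleton
s_shared  = rs.Session(default=True)

# default=False starts with a private, unpopulated context list
s_private = rs.Session(default=False)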
Example #3
    def add_resource_config(self, resource_config):
        '''
        Adds a new :class:`ru.Config` to the session's dictionary of known
        resources, or accepts a string which points to a configuration file.

        For example::

               rc = ru.Config("./mycluster.json")
               rc.job_manager_endpoint = "ssh+pbs://mycluster"
               rc.filesystem_endpoint  = "sftp://mycluster"
               rc.default_queue        = "private"

               session = rp.Session()
               session.add_resource_config(rc)

               pd = rp.ComputePilotDescription()
               pd.resource = "mycluster"
               pd.cores    = 16
               pd.runtime  = 5 # minutes

               pilot = pm.submit_pilots(pd)
        '''

        if isinstance(resource_config, str):

            # let exceptions fall through
            rcs = ru.Config('radical.pilot.resource', name=resource_config)

            for rc in rcs:
                self._log.info('load rcfg for %s' % rc)
                self._rcfgs[rc] = rcs[rc].as_dict()

        else:
            self._log.debug('load rcfg for %s', resource_config.label)
            self._rcfgs[resource_config.label] = resource_config.as_dict()
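
A minimal sketch of the two call paths above, assuming a running `rp.Session` (which needs RADICAL_PILOT_DBURL set) and a hypothetical resource label `mycluster`:

import radical.pilot as rp
import radical.utils as ru

session = rp.Session()

# string: treated as a name for `ru.Config('radical.pilot.resource', ...)`
# (requires a matching resource config file to be installed)
# session.add_resource_config('mycluster')

# config object: registered under its `label` attribute
rc = ru.Config(cfg={'label': 'mycluster', 'default_queue': 'private'})
session.add_resource_config(rc)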
Example #4
    def __init__(self):

        # Engine manages cpis from adaptors
        self._adaptor_registry = dict()

        # get engine, adaptor and pty configs
        self._cfg = ru.Config('radical.saga.engine')
        self._pty_cfg = ru.Config('radical.saga.pty')
        self._registry = ru.Config('radical.saga.registry')

        # Initialize the logging, and log version (this is a singleton!)
        self._logger = ru.Logger('radical.saga')
        self._logger.info('radical.saga         version: %s' % version_detail)

        # load adaptors
        self._load_adaptors()
Example #5
def main():
    # TODO: Test both with and without a provided config file.
    kwargs = {}
    if len(sys.argv) > 1:
        cfg = ru.Config(cfg=ru.read_json(sys.argv[1]))
        kwargs['cfg'] = cfg
        descr = cfg.worker_descr
        count = cfg.n_workers
        cores = cfg.cpn
        gpus = cfg.gpn
    else:
        descr = rp.TaskDescription({
            'uid': 'raptor.worker',
            'executable': 'scalems_rp_worker',
            'arguments': []
        })
        count = 1
        cores = 1
        gpus = 0
    master = ScaleMSMaster(**kwargs)

    master.submit(descr=descr, count=count, cores=cores, gpus=gpus)

    master.start()
    master.join()
    master.stop()
Example #6
    def __init__(self,
                 url,
                 session=None,
                 logger=None,
                 cfg=None,
                 posix=True,
                 interactive=True):

        if logger: self.logger = logger
        else: self.logger = ru.Logger('radical.saga.pty')

        if session: self.session = session
        else: self.session = ss.Session(default=True)

        self.logger.debug("PTYShell init %s" % self)

        self.url = url  # describes the shell to run
        self.posix = posix  # /bin/sh compatible?
        self.interactive = interactive  # bash -i ?
        self.latency = 0.0  # set by factory
        self.cp_slave = None  # file copy channel

        self.initialized = False

        self.pty_id = PTYShell._pty_id
        PTYShell._pty_id += 1

        name = None
        if isinstance(cfg, str):
            name = cfg
            cfg = None
        self.cfg = ru.Config('radical.saga.session', name=name, cfg=cfg)
        self.cfg = self.cfg.pty

        # get prompt pattern from config, or use default
        self.prompt = self.cfg.get('prompt_pattern', DEFAULT_PROMPT)
        self.prompt_re = re.compile("^(.*?)%s" % self.prompt, re.DOTALL)
        self.logger.info("PTY prompt pattern: %s" % self.prompt)

        # local dir for file staging caches
        self.base = ru.get_radical_base('saga') + 'adaptors/shell/'
        try:
            ru.rec_makedir(self.base)
        except OSError as e:
            raise rse.NoSuccess('could not create staging dir: %s' % e) from e

        self.factory = supsf.PTYShellFactory()
        self.pty_info = self.factory.initialize(self.url,
                                                self.session,
                                                self.prompt,
                                                self.logger,
                                                self.cfg,
                                                self.posix,
                                                interactive=self.interactive)
        self.pty_shell = self.factory.run_shell(self.pty_info)

        self._trace('init : %s' % self.pty_shell.command)

        self.initialize()
Example #7
    def start_components(self, cfg=None):
        '''
        check if any components are defined under `cfg['components']`
        and start them
        '''

        self._prof.prof('start_components_start', uid=self._uid)

        timeout = self._cfg.heartbeat.timeout

        if cfg is None:
            cfg = self._cfg

        # we pass a copy of the complete session config to all components, but
        # merge it into the component specific config settings (no overwrite),
        # and then remove the `bridges` and `components` sections
        #
        scfg = ru.Config(cfg=cfg)
        if 'bridges' in scfg: del (scfg['bridges'])
        if 'components' in scfg: del (scfg['components'])

        for cname, ccfg in cfg.get('components', {}).items():

            for _ in range(ccfg.get('count', 1)):

                ccfg.uid = ru.generate_id(cname, ns=self._sid)
                ccfg.cmgr = self.uid
                ccfg.kind = cname
                ccfg.sid = cfg.sid
                ccfg.base = cfg.base
                ccfg.path = cfg.path
                ccfg.heartbeat = cfg.heartbeat

                ccfg.merge(scfg, policy=ru.PRESERVE, log=self._log)

                fname = '%s/%s.json' % (cfg.path, ccfg.uid)
                ccfg.write(fname)

                self._log.info('create  component %s [%s]', cname, ccfg.uid)

                out, err, ret = ru.sh_callout('radical-pilot-component %s' %
                                              fname)
                self._log.debug('out: %s', out)
                self._log.debug('err: %s', err)
                if ret:
                    raise RuntimeError('component startup failed')

                self._uids.append(ccfg.uid)
                self._log.info('created component %s [%s]', cname, ccfg.uid)

        # all components should start now; wait for their heartbeats
        # to appear.
        failed = self._hb.wait_startup(self._uids, timeout=timeout * 10)
        if failed:
            raise RuntimeError('could not start all components %s' % failed)

        self._prof.prof('start_components_stop', uid=self._uid)
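
The `PRESERVE` merge policy used above keeps values that the component config already defines; a small sketch, assuming `ru.Config.merge` follows the `ru.dict_merge` policy semantics:

import radical.utils as ru

ccfg = ru.Config(cfg={'uid': 'comp.0000', 'log_lvl': 'debug'})
scfg = ru.Config(cfg={'log_lvl': 'error', 'sid': 'session.0'})

ccfg.merge(scfg, policy=ru.PRESERVE)

assert ccfg.log_lvl == 'debug'      # existing value preserved
assert ccfg.sid     == 'session.0'  # missing value filled in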
Example #8
    def __init__(self, command, cfg='utils', logger=None):
        """
        The class constructor, which runs (execvpe) command in a separately
        forked process.  The new process will inherit the environment of the
        application process.

        :type  command: string or list of strings
        :param command: The given command is what is run as a child, and
        fed/drained via pty pipes.  If given as string, command is split into an
        array of strings, using :func:`shlex.split`.

        :type  logger:  :class:`radical.utils.logger.Logger` instance
        :param logger:  logger stream to send status messages to.
        """

        self.rlock = mt.RLock()
        self.logger = logger

        if not self.logger: self.logger = ru.Logger('radical.saga.pty')
        self.logger.debug("PTYProcess init %s" % self)

        name = None
        if isinstance(cfg, str):
            name = cfg
            cfg = None

        self.cfg = ru.Config('radical.saga.session', name=name, cfg=cfg)
        self.cfg = self.cfg.pty

        if isinstance(command, str):
            command = shlex.split(command)

        if not isinstance(command, list):
            raise se.BadParameter("PTYProcess expects string or list command")

        if len(command) < 1:
            raise se.BadParameter("PTYProcess expects non-empty command")

        self.command = command  # list of strings to run()

        self.cache = ""  # data cache
        self.tail = ""  # tail of the data cache for error messages
        self.child = None  # the process as created by subprocess.Popen
        self.ptyio = None  # the process' io channel, from pty.fork()

        self.exit_code = None  # child died with code (may be revived)
        self.exit_signal = None  # child kill by signal (may be revived)

        self.recover_max = 3  # TODO: make configurable.  This does not
        self.recover_attempts = 0  # apply to recoveries triggered by gc_timeout!

        try:
            self.initialize()

        except Exception as e:
            raise ptye.translate_exception(e, "pty or process creation failed")\
                  from e
Example #9
    def __init__(self, cfg=None):

        self.logger = ru.Logger('radical.saga.pty')
        self.rlock = mt.RLock()
        self.registry = dict()

        name = None
        if isinstance(cfg, str):
            name = cfg
            cfg = None

        self.cfg = ru.Config('radical.saga.session', name=name, cfg=cfg)
        self.cfg = self.cfg.pty
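
The `cfg`-as-string-or-dict pattern recurs in the PTY classes above; a condensed sketch of it (the helper name is hypothetical):

import radical.utils as ru

def resolve_pty_cfg(cfg=None):
    # a string names a predefined config file; anything else is
    # merged into the defaults as literal configuration data
    name = None
    if isinstance(cfg, str):
        name = cfg
        cfg  = None
    return ru.Config('radical.saga.session', name=name, cfg=cfg).pty

cfg_by_name = resolve_pty_cfg('utils')    # named configuration
cfg_by_data = resolve_pty_cfg({'pty': {'prompt_pattern': r'[\$#>]\s*$'}})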
Example #10
    def __init__(self, adaptor_info, adaptor_options=None, expand_env=True):

        # FIXME: engine is loading cfg already, here we load again...

        self._info    = adaptor_info
        self._name    = adaptor_info['name']
        self._schemas = adaptor_info['schemas']

        self._lock    = mt.RLock()
        self._logger  = ru.Logger('radical.saga.api')

        self._cfg     = ru.Config(module='radical.saga.adaptors',
                                  name=self._name,
                                  expand=expand_env)

        if 'enabled' not in self._cfg:
            self._cfg['enabled'] = True
Example #11
    def __init__(self, adaptor_info, adaptor_options=None, expand_env=True):

        # FIXME: engine is loading cfg already, here we load again...

        self._info = adaptor_info
        self._name = adaptor_info['name']
        self._schemas = adaptor_info['schemas']

        self._lock = ru.RLock(self._name)
        self._logger = ru.Logger('radical.saga.api')

        # we need to expand later, once we get the env from the remote resource
        self._cfg = ru.Config(module='radical.saga',
                              name=self._name,
                              expand=expand_env)

        if 'enabled' not in self._cfg:
            self._cfg['enabled'] = True
Example #12
    def _get_config(self, cfg=None):
        '''
        derive a worker base configuration from the control pubsub configuration
        '''

        # FIXME: this uses insider knowledge on the config location and
        #        structure.  It would be better if agent.0 creates the worker
        #        base config from scratch on startup.

        pwd = os.getcwd()
        ru.dict_merge(cfg, ru.read_json('%s/../control_pubsub.json' % pwd))

        del (cfg['channel'])
        del (cfg['cmgr'])

        cfg['log_lvl'] = 'debug'
        cfg['kind'] = 'master'
        cfg['base'] = pwd
        cfg['uid'] = ru.generate_id('master.%(item_counter)06d',
                                    ru.ID_CUSTOM,
                                    ns=self._session.uid)

        return ru.Config(cfg=cfg)
Example #13
def rp_config():
    """Provide a RADICAL Pilot Resource Config to a test suite.

    The 'resource' key in a Pilot Description must name a key that the Session
    can use to get default values for the execution environment.
    """
    # Ref: https://radicalpilot.readthedocs.io/en/stable/machconf.html#customizing-resource-configurations-programatically
    import radical.pilot as rp
    import radical.utils as ru
    # TODO: Resolve usage error.
    # Ref: https://github.com/radical-cybertools/radical.pilot/issues/2181
    try:
        cfg = rp.ResourceConfig(
            'local.localhost',
            ru.Config('radical.pilot.session', name='default', cfg=None))
    except Exception:
        cfg = dict()
    # `local.localhost` is preconfigured, but some of the properties are likely not appropriate.
    # Ref: https://github.com/radical-cybertools/radical.pilot/blob/devel/src/radical/pilot/configs/resource_local.json
    # TODO: Is there a more canonical way to programmatically generate a valid config?
    # Ref: https://radicalpilot.readthedocs.io/en/stable/machconf.html#writing-a-custom-resource-configuration-file
    # TODO: Set a sensible number of cores / threads / GPUs.
    return dict(config=cfg, rp=rp, ru=ru)
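
As a hedged sketch, the function above could be wired into pytest as a fixture (the fixture name is hypothetical):

import pytest

@pytest.fixture(name='rp_config_data')
def rp_config_fixture():
    return rp_config()

def test_has_config(rp_config_data):
    # the returned dict bundles the config with the modules used to build it
    assert 'config' in rp_config_data
    assert 'rp' in rp_config_data and 'ru' in rp_config_data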
Example #14
def check_runs(cfg_file, run_file):

    runs = list()
    n_smiles = dict()

    rec_path = 'input/receptors.ad/'  # FIXME
    smi_path = 'input/smiles/'  # FIXME

    cfg = ru.Config(cfg=ru.read_json(cfg_file))
    res_path = cfg.fs_url + cfg.workload.results

    fs = rs.filesystem.Directory(res_path)

    with open(run_file, 'r') as fin:

        for line in fin.readlines():

            line = line.strip()

            if not line:
                continue

            if line.startswith('#'):
                continue

            elems = line.split()

            assert (len(elems) == 4), line

            receptor = str(elems[0])
            smiles = str(elems[1])
            n_workers = int(elems[2])
            runtime = int(elems[3])

            runs.append([receptor, smiles, n_workers, runtime])

    return runs
Example #15
    def setUp(self):

        class Session(object):
            def __init__(self):
                self.uid = 'uid.0'
                self.sid = 'sid.0'
                self.cfg = ru.Config(cfg={'dburl': 'db://'})

            def _get_resource_sandbox(self, pilot):
                return ru.Url('/resource/sandbox/%s' % pilot)

            def _get_session_sandbox(self, pilot):
                return ru.Url('/session/sandbox/%s' % pilot)

            def _get_pilot_sandbox(self, pilot):
                return ru.Url('/pilot/sandbox/%s' % pilot)

            def _get_client_sandbox(self):
                return ru.Url('/client/sandbox')


        session = Session()
        configs = ru.Config('radical.pilot.resource', name='*')
        return session, configs
Example #16
def test_zmq_pubsub():
    '''
    create a bridge, 2 producers (A, B) and 2 consumers (C, D).  Send with the
    following rates for 10 seconds:

      A: 10/s
      B: 20/s

    Ensure that
      - the ratios of sent / received messages reflects the rates
      - the local order of messages is preserved
      - messages are received exactly once (no messages get lost / duplicated)
    '''

    c_a = 200
    c_b = 400

    cfg = ru.Config(cfg={'uid'      : 'test_pubsub',
                         'channel'  : 'test',
                         'kind'     : 'pubsub',
                         'log_level': 'error',
                         'path'     : '/tmp/',
                         'sid'      : 'test_sid',
                         'bulk_size': 0,
                         'stall_hwm': 1,
                        })

    b = ru.zmq.PubSub(cfg)
    b.start()

    assert(b.addr_in  != b.addr_out)
    assert(b.addr_in  == b.addr_pub)
    assert(b.addr_out == b.addr_sub)

    data = dict()
    for i in 'ABCD':
        data[i] = dict()
        for j in 'AB':
            data[i][j] = 0

    def cb(uid, topic, msg):
        if msg['idx'] is None:
            return False
        data[uid][msg['src']] += 1

    cb_C = lambda t,m: cb('C', t, m)
    cb_D = lambda t,m: cb('D', t, m)

    ru.zmq.Subscriber(channel=cfg['channel'], url=str(b.addr_sub),
                      topic='topic', cb=cb_C)
    ru.zmq.Subscriber(channel=cfg['channel'], url=str(b.addr_sub),
                      topic='topic', cb=cb_D)
    time.sleep(0.1)

    # --------------------------------------------------------------------------
    def work_pub(uid, n, delay):

        pub = ru.zmq.Publisher(channel=cfg['channel'], url=str(b.addr_pub))
        idx = 0
        while idx < n:
            time.sleep(delay)
            pub.put('topic', {'src': uid,
                              'idx': idx})
            idx += 1
            data[uid][uid] += 1

        # send EOF
        pub.put('topic', {'src': uid,
                          'idx': None})
    # --------------------------------------------------------------------------


    t_a = mt.Thread(target=work_pub, args=['A', c_a, 0.005])
    t_b = mt.Thread(target=work_pub, args=['B', c_b, 0.005])

    t_a.start()
    t_b.start()

    t_a.join()
    t_b.join()

    b.stop()
    time.sleep(0.1)

    assert(data['A']['A'] == c_a)
    assert(data['B']['B'] == c_b)

    assert(data['C']['A'] + data['C']['B'] +
           data['D']['A'] + data['D']['B'] == 2 * (c_a + c_b))
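
Condensed from the test above, a minimal pub/sub wiring sketch using the same `ru.zmq` API (channel and topic names are arbitrary; defaults are assumed for the remaining bridge options):

import time
import radical.utils as ru

cfg    = ru.Config(cfg={'uid': 'demo_pubsub', 'channel': 'demo',
                        'kind': 'pubsub', 'path': '/tmp/'})
bridge = ru.zmq.PubSub(cfg)
bridge.start()

pub = ru.zmq.Publisher(channel='demo', url=str(bridge.addr_pub))
ru.zmq.Subscriber(channel='demo', url=str(bridge.addr_sub),
                  topic='demo', cb=lambda t, m: print(t, m))

time.sleep(0.1)                  # allow the subscriber to connect
pub.put('demo', {'src': 'demo', 'idx': 0})
time.sleep(0.1)
bridge.stop()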
Example #17
    def __init__(self, cfg=None, backend='zmq'):

        self._backend = backend  # FIXME: use

        self._workers = dict()  # wid: worker
        self._requests = dict()  # bookkeeping of submitted requests
        self._lock = mt.Lock()  # lock the request dict on updates

        cfg.sid = os.environ['RP_SESSION_ID']
        cfg.base = os.environ['RP_PILOT_SANDBOX']
        cfg.path = os.environ['RP_PILOT_SANDBOX']
        self._session = Session(cfg=cfg, uid=cfg.sid, _primary=False)
        cfg = self._get_config(cfg)

        rpu.Component.__init__(self, cfg, self._session)

        self.register_output(rps.AGENT_STAGING_INPUT_PENDING,
                             rpc.AGENT_STAGING_INPUT_QUEUE)
        self.register_subscriber(rpc.CONTROL_PUBSUB, self._control_cb)

        # set up RU ZMQ Queues for request distribution and result collection
        req_cfg = ru.Config(
            cfg={
                'channel': '%s.to_req' % self._uid,
                'type': 'queue',
                'uid': self._uid + '.req',
                'path': os.getcwd(),
                'stall_hwm': 0,
                'bulk_size': 56
            })

        res_cfg = ru.Config(
            cfg={
                'channel': '%s.to_res' % self._uid,
                'type': 'queue',
                'uid': self._uid + '.res',
                'path': os.getcwd(),
                'stall_hwm': 0,
                'bulk_size': 56
            })

        self._req_queue = ru.zmq.Queue(req_cfg)
        self._res_queue = ru.zmq.Queue(res_cfg)

        self._req_queue.start()
        self._res_queue.start()

        self._req_addr_put = str(self._req_queue.addr_put)
        self._req_addr_get = str(self._req_queue.addr_get)

        self._res_addr_put = str(self._res_queue.addr_put)
        self._res_addr_get = str(self._res_queue.addr_get)

        # this master will put requests onto the request queue, and will get
        # responses from the response queue.  Note that the responses will be
        # delivered via an async callback (`self._result_cb`).
        self._req_put = ru.zmq.Putter('%s.to_req' % self._uid,
                                      self._req_addr_put)
        self._res_get = ru.zmq.Getter('%s.to_res' % self._uid,
                                      self._res_addr_get,
                                      cb=self._result_cb)

        # for the workers it is the opposite: they will get requests from the
        # request queue, and will send responses to the response queue.
        self._info = {
            'req_addr_get': self._req_addr_get,
            'res_addr_put': self._res_addr_put
        }

        # make sure the channels are up before allowing to submit requests
        time.sleep(1)

        # connect to the local agent
        self._log.debug('startup complete')
Example #18
                }
            }
            self.request(item)

        self._prof.prof('create_stop')


# ------------------------------------------------------------------------------
if __name__ == '__main__':

    # This master script runs as a task within a pilot allocation.  The purpose
    # of this master is to (a) spawn a set or workers within the same
    # allocation, (b) to distribute work items (`dock` function calls) to those
    # workers, and (c) to collect the responses again.
    cfg_fname = 'wf2_md.cfg'
    cfg = ru.Config(cfg=ru.read_json(cfg_fname))
    cfg.idx = int(sys.argv[1])

    # FIXME: worker startup should be moved into master
    workload = cfg.workload
    n_nodes = cfg.nodes
    cpn = cfg.cpn
    gpn = cfg.gpn

    # Prepare dirlist in case we are iterating and we have detected outliers
    initial_MD = True
    outlier_filepath = '%s/Outlier_search/restart_points.json' % cfg['base_path']

    if os.path.exists(outlier_filepath):
        initial_MD = False
Example #19
    def __init__(self):

        self.logger = ru.Logger('radical.saga.pty')
        self.registry = {}
        self.rlock = ru.RLock('pty shell factory')
        self.cfg = ru.Config('radical.saga', 'utils')['pty']
Example #20
    def get_config(self, name=None):

        return ru.Config(module='radical.saga', name=name)
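
Assuming this method lives on the `radical.saga` session shown in Example #2, a lookup might read as follows (the config name mirrors Example #19):

import radical.saga as rs

session = rs.Session(default=True)
cfg     = session.get_config(name='utils')   # named config file lookup
pty_cfg = cfg.get('pty', {})                 # e.g. the pty subsection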
Example #21
    def __init__(self, dburl=None, uid=None, cfg=None, _primary=True):
        '''
        Creates a new session.  A new Session instance is created and
        stored in the database.

        **Arguments:**
            * **dburl** (`string`): The MongoDB URL.  If none is given,
              RP uses the environment variable RADICAL_PILOT_DBURL.  If that is
              not set, an error will be raised.

            * **cfg** (`str` or `dict`): a named or instantiated configuration
              to be used for the session.

            * **uid** (`string`): Create a session with this UID.  Session UIDs
              MUST be unique - otherwise they will lead to conflicts in the
              underlying database, resulting in undefined behaviours (or worse).

            * **_primary** (`bool`): only sessions created by the original
              application process (via `rp.Session()`) will connect to the DB.
              Secondary session instances are instantiated internally in
              processes spawned (directly or indirectly) by the initial session,
              for example in some of its components.  A secondary session will
              inherit the original session ID, but will not attempt to create
              a new DB collection - if such a DB connection is needed, the
              component needs to establish that on its own.
        '''

        # NOTE: `name` and `cfg` are overloaded, the user cannot point to
        #       a predefined config and amend it at the same time.  This might
        #       be ok for the session, but introduces a minor API inconsistency.
        name = 'default'
        if isinstance(cfg, str):
            name = cfg
            cfg  = None

        self._dbs     = None
        self._closed  = False
        self._primary = _primary

        self._pmgrs   = dict()  # map IDs to pmgr instances
        self._umgrs   = dict()  # map IDs to umgr instances
        self._cmgr    = None    # only primary sessions have a cmgr

        self._cfg     = ru.Config('radical.pilot.session',  name=name, cfg=cfg)
        self._rcfgs   = ru.Config('radical.pilot.resource', name='*')

        if _primary:

            pwd = os.getcwd()

            if not self._cfg.sid:
                if uid:
                    self._cfg.sid = uid
                else:
                    self._cfg.sid = ru.generate_id('rp.session',
                                                   mode=ru.ID_PRIVATE)
            if not self._cfg.base:
                self._cfg.base = pwd

            if not self._cfg.path:
                self._cfg.path = '%s/%s' % (self._cfg.base, self._cfg.sid)

            if not self._cfg.client_sandbox:
                self._cfg.client_sandbox = pwd

        else:
            for k in ['sid', 'base', 'path']:
                assert(k in self._cfg), 'non-primary session misses %s' % k

        # change RU defaults to point logfiles etc. to the session sandbox
        def_cfg             = ru.DefaultConfig()
        def_cfg.log_dir     = self._cfg.path
        def_cfg.report_dir  = self._cfg.path
        def_cfg.profile_dir = self._cfg.path

        self._uid  = self._cfg.sid

        self._prof = self._get_profiler(name=self._uid)
        self._rep  = self._get_reporter(name=self._uid)
        self._log  = self._get_logger  (name=self._uid,
                                       level=self._cfg.get('debug'))

        from . import version_detail as rp_version_detail
        self._log.info('radical.pilot version: %s' % rp_version_detail)
        self._log.info('radical.saga  version: %s' % rs.version_detail)
        self._log.info('radical.utils version: %s' % ru.version_detail)

        self._prof.prof('session_start', uid=self._uid, msg=int(_primary))

        # now we have config and uid - initialize base class (saga session)
        rs.Session.__init__(self, uid=self._uid)

        # cache sandboxes etc.
        self._cache_lock = ru.RLock()
        self._cache      = {'resource_sandbox' : dict(),
                            'session_sandbox'  : dict(),
                            'pilot_sandbox'    : dict(),
                            'client_sandbox'   : self._cfg.client_sandbox,
                            'js_shells'        : dict(),
                            'fs_dirs'          : dict()}

        if _primary:
            self._initialize_primary(dburl)

        # at this point we have a DB connection, logger, etc, and are done
        self._prof.prof('session_ok', uid=self._uid, msg=int(_primary))
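
A hedged usage sketch of the overloaded `cfg` argument described in the NOTE above (both calls require RADICAL_PILOT_DBURL to be set):

import radical.pilot as rp

s_named = rp.Session(cfg='default')           # string: predefined config name
s_dict  = rp.Session(cfg={'debug': 'DEBUG'})  # dict: amends the default config

s_named.close()
s_dict.close()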
Example #22
    def __init__(self, session, cfg='default', scheduler=None):
        """
        Creates a new UnitManager and attaches it to the session.

        **Arguments:**
            * session [:class:`radical.pilot.Session`]:
              The session instance to use.
            * cfg (`dict` or `string`):
              The configuration or name of configuration to use.
            * scheduler (`string`):
              The name of the scheduler plug-in to use.

        **Returns:**
            * A new `UnitManager` object [:class:`radical.pilot.UnitManager`].
        """

        self._pilots      = dict()
        self._pilots_lock = ru.RLock('umgr.pilots_lock')
        self._units       = dict()
        self._units_lock  = ru.RLock('umgr.units_lock')
        self._callbacks   = dict()
        self._cb_lock     = ru.RLock('umgr.cb_lock')
        self._terminate   = mt.Event()
        self._closed      = False
        self._rec_id      = 0       # used for session recording
        self._uid         = ru.generate_id('umgr.%(item_counter)04d',
                                           ru.ID_CUSTOM, ns=session.uid)

        for m in rpc.UMGR_METRICS:
            self._callbacks[m] = dict()

        # NOTE: `name` and `cfg` are overloaded, the user cannot point to
        #       a predefined config and amend it at the same time.  This might
        #       be ok for the session, but introduces a minor API inconsistency.
        #
        name = None
        if isinstance(cfg, str):
            name = cfg
            cfg  = None

        cfg           = ru.Config('radical.pilot.umgr', name=name, cfg=cfg)
        cfg.uid       = self._uid
        cfg.owner     = self._uid
        cfg.sid       = session.uid
        cfg.base      = session.base
        cfg.path      = session.path
        cfg.dburl     = session.dburl
        cfg.heartbeat = session.cfg.heartbeat

        if scheduler:
            # overwrite the scheduler from the config file
            cfg.scheduler = scheduler


        rpu.Component.__init__(self, cfg, session=session)
        self.start()

        self._log.info('started umgr %s', self._uid)
        self._rep.info('<<create unit manager')

        # create pmgr bridges and components, use session cmgr for that
        self._cmgr = rpu.ComponentManager(self._cfg)
        self._cmgr.start_bridges()
        self._cmgr.start_components()

        # The output queue is used to forward submitted units to the
        # scheduling component.
        self.register_output(rps.UMGR_SCHEDULING_PENDING,
                             rpc.UMGR_SCHEDULING_QUEUE)

        # the umgr will also collect units from the agent again, for output
        # staging and finalization
        if self._cfg.bridges.umgr_staging_output_queue:
            self._has_sout = True
            self.register_output(rps.UMGR_STAGING_OUTPUT_PENDING,
                                 rpc.UMGR_STAGING_OUTPUT_QUEUE)
        else:
            self._has_sout = False

        # register the state notification pull cb
        # FIXME: this should be a tailing cursor in the update worker
        self.register_timed_cb(self._state_pull_cb,
                               timer=self._cfg['db_poll_sleeptime'])

        # register callback which pulls units back from agent
        # FIXME: this should be a tailing cursor in the update worker
        self.register_timed_cb(self._unit_pull_cb,
                               timer=self._cfg['db_poll_sleeptime'])

        # also listen to the state pubsub for unit state changes
        self.register_subscriber(rpc.STATE_PUBSUB, self._state_sub_cb)

        # let session know we exist
        self._session._register_umgr(self)

        self._prof.prof('setup_done', uid=self._uid)
        self._rep.ok('>>ok\n')
Example #23
    def __init__(self,
                 url,
                 session=None,
                 logger=None,
                 cfg=None,
                 posix=True,
                 interactive=True):

        if logger: self.logger = logger
        else: self.logger = ru.Logger('radical.saga.pty')

        if session: self.session = session
        else: self.session = ss.Session(default=True)

        self.logger.debug("PTYShell init %s" % self)

        self.url = url  # describes the shell to run
        self.posix = posix  # /bin/sh compatible?
        self.interactive = interactive  # bash -i ?
        self.latency = 0.0  # set by factory
        self.cp_slave = None  # file copy channel

        self.initialized = False

        self.pty_id = PTYShell._pty_id
        PTYShell._pty_id += 1

        name = None
        if isinstance(cfg, str):
            name = cfg
            cfg = None
        self.cfg = ru.Config('radical.saga.session', name=name, cfg=cfg)
        self.cfg = self.cfg.pty

        # get prompt pattern from config, or use default
        self.prompt = self.cfg.get('prompt_pattern', DEFAULT_PROMPT)
        self.prompt_re = re.compile("^(.*?)%s" % self.prompt, re.DOTALL)
        self.logger.info("PTY prompt pattern: %s" % self.prompt)

        # we need a local dir for file staging caches.  At this point we use
        # $HOME, but should make this configurable (FIXME)
        self.base = os.environ['HOME'] + '/.radical/saga/adaptors/shell/'

        try:
            os.makedirs(self.base)

        except OSError as e:
            if e.errno == errno.EEXIST and os.path.isdir(self.base):
                pass
            else:
                raise rse.NoSuccess ("could not create staging dir: %s" % e) \
                      from e

        self.factory = supsf.PTYShellFactory()
        self.pty_info = self.factory.initialize(self.url,
                                                self.session,
                                                self.prompt,
                                                self.logger,
                                                self.cfg,
                                                self.posix,
                                                interactive=self.interactive)
        self.pty_shell = self.factory.run_shell(self.pty_info)

        self._trace('init : %s' % self.pty_shell.command)

        self.initialize()
Example #24
    def __init__(self, cfg):

        if isinstance(cfg, str): cfg = ru.Config(cfg=ru.read_json(cfg))
        else                   : cfg = ru.Config(cfg=cfg)

        self._n_cores = cfg.cores
        self._n_gpus  = cfg.gpus

        self._info    = ru.Config(cfg=cfg.get('info', {}))
        self._session = Session(cfg=cfg, uid=cfg.sid, _primary=False)

        rpu.Component.__init__(self, cfg, self._session)

        self._term    = mp.Event()          # set to terminate
        self._res_evt = mp.Event()          # set on free resources

        self._mlock   = ru.Lock(self._uid)  # lock `_modes` and `_mdata`
        self._modes   = dict()              # call modes (call, exec, eval, ...)
        self._mdata   = dict()              # call mode meta data

        # We need to make sure to run only up to `gpn` tasks using a gpu
        # within that pool, so need a separate counter for that.
        self._resources = {'cores' : [0] * self._n_cores,
                           'gpus'  : [0] * self._n_gpus}

        # resources are initially all free
        self._res_evt.set()

      # # create a multiprocessing pool with `cpn` worker processors.  Set
      # # `maxtasksperchild` to `1` so that we get a fresh process for each
      # # task.  That will also allow us to run command lines via `exec`,
      # # effectively replacing the worker process in the pool for a specific
      # # task.
      # #
      # # We use a `fork` context to inherit log and profile handles.
      # #
      # # NOTE: The mp documentation is wrong; mp.Pool does *not* have a context
      # #       parameters.  Instead, the Pool has to be created within
      # #       a context.
      # ctx = mp.get_context('fork')
      # self._pool = ctx.Pool(processes=self._n_cores,
      #                       initializer=None,
      #                       maxtasksperchild=1)
      # NOTE: a multiprocessing pool won't work, as pickle is not able to
      #       serialize our worker object.  So we use our own process pool.
      #       It's not much of a loss since we want to respawn new processes for
      #       each task anyway (to improve isolation).
        self._pool  = dict()  # map task uid to process instance
        self._plock = ru.Lock('p' + self._uid)  # lock _pool

        # We also create a queue for communicating results back, and a thread to
        # watch that queue
        self._result_queue = mp.Queue()
        self._result_thread = mt.Thread(target=self._result_watcher)
        self._result_thread.daemon = True
        self._result_thread.start()

        # connect to master
        self.register_subscriber(rpc.CONTROL_PUBSUB, self._control_cb)
        self.register_publisher(rpc.CONTROL_PUBSUB)

        # run worker initialization *before* starting to work on requests.
        # the worker provides four builtin methods:
        #     eval:  evaluate a piece of python code
        #     exec:  execute  a command line (fork/exec)
        #     shell: execute  a shell command
        #     call:  execute  a method or function call
        self.register_mode('call',  self._call)
        self.register_mode('eval',  self._eval)
        self.register_mode('exec',  self._exec)
        self.register_mode('shell', self._shell)

        self.pre_exec()

        # connect to the request / response ZMQ queues
        self._res_put = ru.zmq.Putter('to_res', self._info.res_addr_put)
        self._req_get = ru.zmq.Getter('to_req', self._info.req_addr_get,
                                                cb=self._request_cb)

        # the worker can return custom information which will be made available
        # to the master.  This can be used to communicate, for example, worker
        # specific communication endpoints.

        # `info` is a placeholder for any additional meta data communicated to
        # the worker
        self.publish(rpc.CONTROL_PUBSUB, {'cmd': 'worker_register',
                                          'arg': {'uid' : self._uid,
                                                  'info': self._info}})
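
A sketch of registering an additional call mode next to the builtins above; the subclass name, mode name, and return convention are assumptions:

class MyWorker(Worker):   # `Worker` stands in for the class defined above

    def __init__(self, cfg=None):
        super().__init__(cfg=cfg)
        # a mode maps a name to a callable which handles matching requests
        self.register_mode('noop', self._noop)

    @staticmethod
    def _noop(data):
        # echo the request data back as the result (assumed convention)
        return data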
Example #25
def test_zmq_queue():
    '''
    create a bridge, 2 producers (A, B) and 2 consumers (C, D).  Send with the
    following rates for 10 seconds:

      A: 10/s
      B: 20/s

    Ensure that
      - the ratios of sent / received messages reflects the rates
      - the local order of messages is preserved
      - messages are received exactly once (no messages get lost / duplicated)
    '''

    c_a = 100
    c_b = 200

    cfg = ru.Config(
        cfg={
            'uid': 'test_queue',
            'channel': 'test',
            'kind': 'queue',
            'log_level': 'error',
            'path': '/tmp/',
            'sid': 'test_sid',
            'bulk_size': 50,
            'stall_hwm': 1,
        })

    b = ru.zmq.Queue(cfg)
    b.start()

    assert (b.addr_in != b.addr_out)
    assert (b.addr_in == b.addr_put)
    assert (b.addr_out == b.addr_get)

    C = ru.zmq.Getter(channel=cfg['channel'], url=str(b.addr_get))
    D = ru.zmq.Getter(channel=cfg['channel'], url=str(b.addr_get))

    A = ru.zmq.Putter(channel=cfg['channel'], url=str(b.addr_put))
    B = ru.zmq.Putter(channel=cfg['channel'], url=str(b.addr_put))

    data = dict()

    def work_put(putter, uid, n, delay):

        data[uid] = list()
        idx = 0
        while idx < n:
            time.sleep(delay)
            putter.put({'src': uid, 'idx': idx})
            idx += 1
            data[uid].append(uid)

        # send EOF
        putter.put({'src': uid, 'idx': None})

    def work_get(getter, uid):

        data[uid] = list()
        done = False
        n = 0
        while not done:
            msgs = getter.get()
            for msg in msgs:
                msg = ru.as_string(msg)
                if msg['idx'] is None:
                    done = True
                else:
                    data[uid].append(msg['src'])
                    n += 1

        getter.stop()

    t_a = mt.Thread(target=work_put, args=[A, 'A', c_a, 0.010])
    t_b = mt.Thread(target=work_put, args=[B, 'B', c_b, 0.005])
    t_c = mt.Thread(target=work_get, args=[C, 'C'])
    t_d = mt.Thread(target=work_get, args=[D, 'D'])

    t_a.daemon = True
    t_b.daemon = True
    t_c.daemon = True
    t_d.daemon = True

    t_a.start()
    t_b.start()
    t_c.start()
    t_d.start()

    time.sleep(3)
    b.stop()

    # uids = list(data.keys())
    # for x in uids:
    #     for y in uids:
    #         print('%s: %s: %d' % (x, y, data[x].count(y)))
    #
    # print(len(data['A']))
    # print(len(data['B']))
    # print(len(data['C']))
    # print(len(data['D']))

    assert (data['A'].count('A') == c_a)
    assert (data['B'].count('B') == c_b)
    assert (len(data['A']) == c_a)
    assert (len(data['B']) == c_b)

    assert (data['C'].count('A') + data['C'].count('B') +
            data['D'].count('A') + data['D'].count('B') == c_a + c_b)

    avg = (c_a + c_b) / 2
    assert (avg - 30 < data['C'].count('A') + data['C'].count('B') < avg + 30)
    assert (avg - 30 < data['D'].count('A') + data['D'].count('B') < avg + 30)
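
In contrast to the pub/sub fan-out in Example #16, a queue delivers each message to exactly one getter; a condensed sketch with the same API (defaults assumed for the remaining bridge options):

import radical.utils as ru

cfg    = ru.Config(cfg={'uid': 'demo_queue', 'channel': 'demo',
                        'kind': 'queue', 'path': '/tmp/'})
bridge = ru.zmq.Queue(cfg)
bridge.start()

putter = ru.zmq.Putter(channel='demo', url=str(bridge.addr_put))
getter = ru.zmq.Getter(channel='demo', url=str(bridge.addr_get))

putter.put({'src': 'demo', 'idx': 0})
msgs = getter.get()   # each message is load-balanced to a single getter
bridge.stop()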
Example #26
def test_zmq_queue_cb():
    '''
    same test, but use subscriber callbacks for message delivery, and only use
    one subscriber
    '''

    data = {'put': dict(), 'get': dict()}
    c_a = 2
    c_b = 4
    cfg = ru.Config(
        cfg={
            'uid': 'test_queue',
            'channel': 'test',
            'kind': 'queue',
            'log_level': 'error',
            'path': '/tmp/',
            'sid': 'test_sid',
            'bulk_size': 0,
            'stall_hwm': 1,
        })

    def get_msg_a(msg):
        uid, _ = msg.split('.')
        if uid not in data['get']:
            data['get'][uid] = list()
        data['get'][uid].append(uid)

    b = ru.zmq.Queue(cfg)
    b.start()

    assert (b.addr_in != b.addr_out)
    assert (b.addr_in == b.addr_put)
    assert (b.addr_out == b.addr_get)

    ru.zmq.Getter(channel=cfg['channel'], url=str(b.addr_get), cb=get_msg_a)

    time.sleep(1.0)

    A = ru.zmq.Putter(channel=cfg['channel'], url=str(b.addr_put))
    B = ru.zmq.Putter(channel=cfg['channel'], url=str(b.addr_put))

    def work_put(putter, uid, n, delay):

        data['put'][uid] = list()
        idx = 0
        while idx < n:
            time.sleep(delay)
            msg = '%s.%d' % (uid, idx)
            putter.put(msg)
            idx += 1
            data['put'][uid].append(uid)

    t_a = mt.Thread(target=work_put, args=[A, 'A', c_a, 0.010])
    t_b = mt.Thread(target=work_put, args=[B, 'B', c_b, 0.005])

    t_a.daemon = True
    t_b.daemon = True

    t_a.start()
    t_b.start()

    time.sleep(1.0)
    b.stop()

    # import pprint
    # pprint.pprint(data)

    assert (data['put']['A'].count('A') == c_a)
    assert (data['put']['B'].count('B') == c_b)
    assert (len(data['put']['A']) == c_a)
    assert (len(data['put']['B']) == c_b)

    # print(data['get']['A'].count('A'))
    # print(data['get']['B'].count('B'))
    # print(c_a)
    # print(c_b)

    assert (data['get']['A'].count('A') + data['get']['B'].count('B') == c_a +
            c_b)
Example #27
    # resource specified as argument
    if len(sys.argv) == 7:
        cfg_file = sys.argv[1]
        cfg_ml1_file = sys.argv[2]
        cfg_wf1_file = sys.argv[3]
        cfg_wf2_file = sys.argv[4]
        cfg_wf3_cg_file = sys.argv[5]
        cfg_wf3_fg_file = sys.argv[6]

    else:
        reporter.exit(
            'Usage:\t%s [config.json] [config_ml1.json] [config_wf1.json] [config_wf2.json] [config_wf3_cg.json] [config_wf3_fg.json]\n\n'
            % sys.argv[0])

    try:
        cfg = ru.Config(cfg=ru.read_json(cfg_file))
        cfg_ml1 = ru.Config(cfg=ru.read_json(cfg_ml1_file))
        cfg_wf1 = ru.Config(cfg=ru.read_json(cfg_wf1_file))
        cfg_wf2 = ru.Config(cfg=ru.read_json(cfg_wf2_file))
        cfg_wf3_cg = ru.Config(cfg=ru.read_json(cfg_wf3_cg_file))
        cfg_wf3_fg = ru.Config(cfg=ru.read_json(cfg_wf3_fg_file))

        if not check_environment():
            raise RuntimeError("ERROR: Incorrect environment set up.")

        pdesc = {
            'resource': cfg['pdesc']['resource'],
            'queue': cfg['pdesc']['queue'],
            'schema': cfg['pdesc']['schema'],
            'walltime': cfg['pdesc']['walltime'],
            'cpus': cfg['pdesc']['cpus_node'] * 4 * cfg['pdesc']['nodes'],
Example #28
    def test_add_md_stage(self, mocked_generate_id, mocked_Logger):
        self.maxDiff = None
        pwd = os.path.dirname(os.path.abspath(__file__))
        wl_cfg = ru.read_json(pwd + '/test_case/workflow_gromacs.json')
        workload = ru.Config(cfg=wl_cfg)

        test_rep = Replica(workload=workload)

        test_rep.add_md_stage(sid='test_sid')

        self.assertEqual(len(test_rep.stages),
                         len(workload['md']['description']))
        task0 = list(test_rep.stages[0].tasks)[0]
        self.assertEqual(task0.name, 'test.0000.0000.md')
        self.assertEqual(task0.sandbox, 'test.0000.md')

        link_inputs = [
            'pilot:///inputs//mdin.mdp.test > task:///mdin.mdp',
            'pilot:///inputs//sys.top > task:///sys.top',
            'pilot:///inputs//sys.itp > task:///sys.itp',
            'pilot:///inputs//inp.ener > task:///inp.ener',
            'pilot:///inputs//martini_v2.2.itp > task:///martini_v2.2.itp',
            'pilot:///inputs//inpcrd.gro.test > task:///inpcrd.gro'
        ]
        self.assertEqual(task0.link_input_data, link_inputs)
        self.assertEqual(task0.arguments, [
            'grompp', '-f', 'mdin.mdp', '-c', 'inpcrd.gro', '-o', 'sys.tpr',
            '-p', 'sys.top'
        ])
        self.assertEqual(
            task0.cpu_reqs, {
                'cpu_process_type': 'MPI',
                'cpu_processes': 1,
                'cpu_thread_type': None,
                'cpu_threads': 1
            })
        self.assertEqual(task0.executable, 'gmx_mpi')
        self.assertEqual(
            task0.pre_exec,
            ["module load gromacs/2020.2-cpu", "export GMX_MAXBACKUP=-1"])

        task1 = list(test_rep.stages[1].tasks)[0]
        self.assertEqual(task1.name, 'test.0000.0001.md')
        self.assertEqual(task1.sandbox, 'test.0000.md')

        link_inputs = []
        self.assertEqual(task1.link_input_data, link_inputs)
        self.assertEqual(task1.arguments, [
            "mdrun", "-s", "sys.tpr", "-deffnm", "sys", "-c", "outcrd.gro",
            "-e", "sys.edr"
        ])
        self.assertEqual(
            task1.cpu_reqs, {
                'cpu_process_type': 'MPI',
                'cpu_processes': 4,
                'cpu_thread_type': None,
                'cpu_threads': 1
            })
        self.assertEqual(task1.executable, 'gmx_mpi')
        self.assertEqual(
            task1.pre_exec,
            ["module load gromacs/2020.2-cpu", "export GMX_MAXBACKUP=-1"])

        task2 = list(test_rep.stages[2].tasks)[0]
        self.assertEqual(task2.name, 'test.0000.0002.md')
        self.assertEqual(task2.sandbox, 'test.0000.md')

        link_inputs = []
        self.assertEqual(task2.link_input_data, link_inputs)
        download_output_data = [
            'task:///outcrd.gro > ' + 'client:///outputs//outcrd.gro.test.0000'
        ]
        self.assertEqual(task2.download_output_data, download_output_data)
        self.assertEqual(task2.arguments, [
            "energy", "-f", "sys.edr", "-b", 0.25, "<", "inp.ener", ">",
            "mdinfo"
        ])
        self.assertEqual(
            task2.cpu_reqs, {
                'cpu_process_type': 'MPI',
                'cpu_processes': 1,
                'cpu_thread_type': None,
                'cpu_threads': 1
            })
        self.assertEqual(task2.executable, 'gmx_mpi')
        self.assertEqual(
            task2.pre_exec,
            ["module load gromacs/2020.2-cpu", "export GMX_MAXBACKUP=-1"])

        # Inserting second MD cycle
        ex_0 = Task()
        ex_0.name = 'test.ex'
        ex_0.sandbox = 'ex.0'
        test_rep.add_md_stage(sid='test.sid', exchanged_from=ex_0)
        task0 = list(test_rep.stages[3].tasks)[0]

        self.assertEqual(task0.name, 'test.0001.0000.md')
        self.assertEqual(task0.sandbox, 'test.0001.md')

        link_inputs = [
            'pilot:///inputs//mdin.mdp.test > task:///mdin.mdp',
            'pilot:///inputs//sys.top > task:///sys.top',
            'pilot:///inputs//sys.itp > task:///sys.itp',
            'pilot:///inputs//inp.ener > task:///inp.ener',
            'pilot:///inputs//martini_v2.2.itp > task:///martini_v2.2.itp',
            'pilot:///ex.0/outcrd.gro.test > task:///inpcrd.gro'
        ]
        self.assertEqual(task0.link_input_data, link_inputs)
        self.assertEqual(task0.arguments, [
            'grompp', '-f', 'mdin.mdp', '-c', 'inpcrd.gro', '-o', 'sys.tpr',
            '-p', 'sys.top'
        ])
        self.assertEqual(
            task0.cpu_reqs, {
                'cpu_process_type': 'MPI',
                'cpu_processes': 1,
                'cpu_thread_type': None,
                'cpu_threads': 1
            })
        self.assertEqual(task0.executable, 'gmx_mpi')
        self.assertEqual(
            task0.pre_exec,
            ["module load gromacs/2020.2-cpu", "export GMX_MAXBACKUP=-1"])

        task1 = list(test_rep.stages[4].tasks)[0]
        self.assertEqual(task1.name, 'test.0001.0001.md')
        self.assertEqual(task1.sandbox, 'test.0001.md')

        link_inputs = []
        self.assertEqual(task1.link_input_data, link_inputs)
        self.assertEqual(task1.arguments, [
            "mdrun", "-s", "sys.tpr", "-deffnm", "sys", "-c", "outcrd.gro",
            "-e", "sys.edr"
        ])
        self.assertEqual(
            task1.cpu_reqs, {
                'cpu_process_type': 'MPI',
                'cpu_processes': 4,
                'cpu_thread_type': None,
                'cpu_threads': 1
            })
        self.assertEqual(task1.executable, 'gmx_mpi')
        self.assertEqual(
            task1.pre_exec,
            ["module load gromacs/2020.2-cpu", "export GMX_MAXBACKUP=-1"])

        task2 = list(test_rep.stages[5].tasks)[0]
        self.assertEqual(task2.name, 'test.0001.0002.md')
        self.assertEqual(task2.sandbox, 'test.0001.md')

        link_inputs = []
        self.assertEqual(task2.link_input_data, link_inputs)
        download_output_data = [
            'task:///outcrd.gro > ' + 'client:///outputs//outcrd.gro.test.0001'
        ]
        self.assertEqual(task2.download_output_data, download_output_data)
        self.assertEqual(task2.arguments, [
            "energy", "-f", "sys.edr", "-b", 0.25, "<", "inp.ener", ">",
            "mdinfo"
        ])
        self.assertEqual(
            task2.cpu_reqs, {
                'cpu_process_type': 'MPI',
                'cpu_processes': 1,
                'cpu_thread_type': None,
                'cpu_threads': 1
            })
        self.assertEqual(task2.executable, 'gmx_mpi')
        self.assertEqual(
            task2.pre_exec,
            ["module load gromacs/2020.2-cpu", "export GMX_MAXBACKUP=-1"])
Example #29
if __name__ == '__main__':

    reporter = ru.Reporter(name='radical.entk')
    reporter.title('COVID-19 - Workflow2')

    # resource specified as argument
    if len(sys.argv) == 2:
        cfg_file = sys.argv[1]
    elif sys.argv[0] == "molecules_adrp.py":
        cfg_file = "adrp_system.json"
    elif sys.argv[0] == "molecules_3clpro.py":
        cfg_file = "3clpro_system.json"
    else:
        reporter.exit('Usage:\t%s [config.json]\n\n' % sys.argv[0])

    cfg = ru.Config(cfg=ru.read_json(cfg_file))
    cfg['node_counts'] = max(1, cfg['md_counts'] // cfg['gpu_per_node'])

    res_dict = {
            'resource': cfg['resource'],
            'queue'   : cfg['queue'],
            'schema'  : cfg['schema'],
            'walltime': cfg['walltime'],
            'project' : cfg['project'],
            'cpus'    : 42 * 4 * cfg['node_counts'],
            'gpus'    : 6 * cfg['node_counts']
    }

    # Create Application Manager
    appman = AppManager(hostname=os.environ.get('RMQ_HOSTNAME'),
                        port=int(os.environ.get('RMQ_PORT')),
Example #30
    def _load_adaptors(self, inject_registry=None):
        """
        Try to load all adaptors that are registered in saga.engine.registry.py.
        This method is called from the constructor.  As Engine is a singleton,
        this method is called once after the module is first loaded in any
        python application.

        :param inject_registry: Inject a fake registry. *For unit tests only*.
        """

        self._logger.debug("listing  adaptor registry: %s" % self._registry)

        # check if some unit test wants to use a special registry.  If
        # so, we reset cpi infos from the earlier singleton creation.
        if inject_registry is not None:
            self._adaptor_registry = dict()
            self._registry = {'adaptor_registry': inject_registry}

        # attempt to load all registered modules
        for module_name in self._registry.get('adaptor_registry', []):

            self._logger.info("loading  adaptor %s" % module_name)

            # first, import the module
            adaptor_module = None
            try:
                adaptor_module = ru.import_module(module_name)

            except Exception:
                self._logger.warn("skip adaptor %s: import failed",
                                  module_name,
                                  exc_info=True)
                continue

            # we expect the module to have an 'Adaptor' class
            # implemented, which, on calling 'register()', returns
            # a info dict for all implemented adaptor classes.
            adaptor_instance = None
            adaptor_info = None

            try:
                adaptor_instance = adaptor_module.Adaptor()
                adaptor_info = adaptor_instance.register()

            except rse.SagaException:
                self._logger.warn("skip adaptor %s: failed to load",
                                  module_name,
                                  exc_info=True)
                continue

            except Exception:
                self._logger.warn("skip adaptor %s: init failed",
                                  module_name,
                                  exc_info=True)
                continue

            # the adaptor must also provide a sanity_check() method, which should
            # be used to confirm that the adaptor can function properly in the
            # current runtime environment (e.g., that all pre-requisites and
            # system dependencies are met).
            try:
                adaptor_instance.sanity_check()

            except Exception:
                self._logger.warn("skip adaptor %s: test failed",
                                  module_name,
                                  exc_info=True)
                continue

            # check if we have a valid adaptor_info
            if adaptor_info is None:
                self._logger.warn("skip adaptor %s: invalid adaptor data",
                                  module_name)
                continue


            if  'name'    not in adaptor_info or \
                'cpis'    not in adaptor_info or \
                'version' not in adaptor_info or \
                'schemas' not in adaptor_info    :
                self._logger.warn("skip adaptor %s: incomplete data",
                                  module_name)
                continue
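
            # (illustration of the expected adaptor_info layout -- module and
            #  class names are examples only:
            #  {'name'   : 'radical.saga.adaptors.shell',
            #   'version': 'v1.0',
            #   'schemas': ['ssh', 'local'],
            #   'cpis'   : [{'type' : 'radical.saga.job.Service',
            #                'class': 'ShellJobService'}, ...]})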

            adaptor_name    = adaptor_info['name']
            adaptor_version = adaptor_info['version']
            adaptor_schemas = adaptor_info['schemas']

            # disable adaptors in 'alpha' or 'beta' versions -- unless
            # the 'load_beta_adaptors' config option is set to True
            if not self._cfg['load_beta_adaptors']:

                if 'alpha' in adaptor_version.lower() or \
                   'beta'  in adaptor_version.lower()    :

                    self._logger.warn("skip beta adaptor %s (version %s)",
                                      module_name, adaptor_version)
                    continue

            # look up the 'enabled' option in the adaptor's config section --
            # the cpi base class (radical.saga.cpi.base) ensures that the
            # option exists if the adaptor class is initialized correctly.
            adaptor_config = None
            adaptor_enabled = False

            try:
                adaptor_config = ru.Config('radical.saga', name=adaptor_name)
                adaptor_enabled = adaptor_config.get('enabled', True)

            except rse.SagaException:
                self._logger.warn("skip adaptor %s: init failed",
                                  module_name,
                                  exc_info=True)
                continue

            except Exception:
                self._logger.warn("skip adaptor %s: init error",
                                  module_name,
                                  exc_info=True)
                continue

            # only load adaptor if it is not disabled via config files
            if not adaptor_enabled:
                self._logger.warn("skip adaptor %s: disabled", module_name)
                continue

            # check if the adaptor has anything to register
            if 0 == len(adaptor_info['cpis']):
                self._logger.warn("skip adaptor %s: adaptor has no cpis",
                                  module_name)
                continue

            # we got an enabled adaptor with valid info - yay!  We can
            # now register all adaptor classes (cpi implementations).
            for cpi_info in adaptor_info['cpis']:

                # check cpi information details for completeness
                if  'type'  not in cpi_info or \
                    'class' not in cpi_info    :
                    self._logger.warn("skip %s cpi: incomplete info detail",
                                      module_name)
                    continue

                # adaptor classes are registered for specific API types.
                cpi_type = cpi_info['type']
                cpi_cname = cpi_info['class']
                cpi_class = None

                try:
                    cpi_class = getattr(adaptor_module, cpi_cname)

                except Exception:
                    # this exception likely means that the adaptor does not call
                    # the radical.saga.adaptors.Base initializer (correctly)
                    self._logger.warn("skip adaptor %s: invalid %s",
                                      module_name,
                                      cpi_info['class'],
                                      exc_info=True)
                    continue

                # make sure the cpi class is a valid cpi for the given type.
                # We walk through the list of known modules, and try to find
                # a modules which could have that class.  We do the following
                # tests:
                #
                #   cpi_class: ShellJobService
                #   cpi_type:  radical.saga.job.Service
                #   modules:   radical.saga.adaptors.cpi.job
                #   modules:   radical.saga.adaptors.cpi.job.service
                #   classes:   radical.saga.adaptors.cpi.job.Service
                #   classes:   radical.saga.adaptors.cpi.job.service.Service
                #
                #   cpi_class: X509Context
                #   cpi_type:  radical.saga.Context
                #   modules:   radical.saga.adaptors.cpi.context
                #   classes:   radical.saga.adaptors.cpi.context.Context
                #
                # So, we add a 'adaptors.cpi' after the 'saga' namespace
                # element, then append the rest of the given namespace.  If that
                # gives a module which has the requested class, fine -- if not,
                # we add a lower cased version of the class name as last
                # namespace element, and check again.

                # ->   radical .  saga .  job .  Service
                # <- ['radical', 'saga', 'job', 'Service']
                cpi_type_nselems = cpi_type.split('.')

                if  len(cpi_type_nselems) < 3 or \
                    len(cpi_type_nselems) > 4    :
                    self._logger.warn("skip adaptor %s invalid cpi %s",
                                      module_name, cpi_type)
                    continue

                if  cpi_type_nselems[0] != 'radical' or \
                    cpi_type_nselems[1] != 'saga'      :
                    self._logger.warn("skip adaptor %s: invalid cpi ns %s",
                                      module_name, cpi_type)
                    continue

                # -> ['radical', 'saga',                    'job', 'Service']
                # <- ['radical', 'saga', 'adaptors', 'cpi', 'job', 'Service']
                cpi_type_nselems.insert(2, 'adaptors')
                cpi_type_nselems.insert(3, 'cpi')

                #  # -> ['radical', 'saga', 'adaptors', 'cpi', 'job',  'Service']
                #  # <- ['radical', 'saga', 'adaptors', 'cpi', 'job'], 'Service'
                #  cpi_type_cname = cpi_type_nselems.pop ()
                #
                #  # -> ['radical', 'saga', 'adaptors', 'cpi', 'job'], 'Service'
                #  # <-  'radical.saga.adaptors.cpi.job
                #  # <-  'radical.saga.adaptors.cpi.job.service
                #  cpi_type_modname_1 = '.'.join (cpi_type_nselems)
                #  cpi_type_modname_2 = '.'.join (cpi_type_nselems + \
                #                                 [cpi_type_cname.lower()])
                #
                #  # does either module exist?
                #  cpi_type_modname = None
                #
                #  if  cpi_type_modname_1 in sys.modules :
                #      cpi_type_modname = cpi_type_modname_1
                #
                #  if  cpi_type_modname_2 in sys.modules :
                #      cpi_type_modname = cpi_type_modname_2
                #
                #  if  not cpi_type_modname :
                #      self._logger.warn("skip adaptor %s: unknown cpi %s",
                #                        module_name, cpi_type, exc_info=True)
                #      sys.exit()
                #      continue
                #
                #  # so, make sure the given cpi is actually
                #  # implemented by the adaptor class
                #  cpi_ok = False
                #  for name, cpi_obj \
                #      in inspect.getmembers (sys.modules[cpi_type_modname]):
                #      if  name == cpi_type_cname      and \
                #          inspect.isclass (cpi_obj)       :
                #          if  issubclass (cpi_class, cpi_obj) :
                #              cpi_ok = True
                #
                #  if not cpi_ok :
                #      self._logger.warn("skip adaptor %s: no cpi %s (%s)",
                #                        module_name, cpi_class, cpi_type,
                #                       exc_info=True)
                #      continue

                # finally, register the cpi for all its schemas!
                registered_schemas = list()
                for adaptor_schema in adaptor_schemas:

                    adaptor_schema = adaptor_schema.lower()

                    # make sure we can register that cpi type
                    if cpi_type not in self._adaptor_registry:
                        self._adaptor_registry[cpi_type] = dict()

                    # make sure we can register that schema
                    if adaptor_schema not in self._adaptor_registry[cpi_type]:
                        self._adaptor_registry[cpi_type][adaptor_schema] = []
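
                    # resulting registry shape (illustration -- 'ssh' is an
                    # example schema):
                    #   _adaptor_registry['radical.saga.job.Service']['ssh']
                    #       -> [info_dict, info_dict, ...]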

                    # we register the cpi class, so that we can create
                    # instances as needed, and the adaptor instance,
                    # as that is passed to the cpi class c'tor later
                    # on (the adaptor instance is used to share state
                    # between cpi instances, amongst others)
                    info = {
                        'cpi_cname': cpi_cname,
                        'cpi_class': cpi_class,
                        'adaptor_name': adaptor_name,
                        'adaptor_instance': adaptor_instance
                    }

                    # make sure this info was not registered yet
                    if info in self._adaptor_registry[cpi_type][
                            adaptor_schema]:
                        self._logger.warn("skip adaptor %s: already "
                                          "registered %s: %s", module_name,
                                          cpi_class, adaptor_instance)
                        continue

                    self._adaptor_registry[cpi_type] \
                                          [adaptor_schema].append(info)
                    registered_schemas.append(str("%s://" % adaptor_schema))

                self._logger.info("register adaptor %s for %s API: %s",
                                  module_name, cpi_type, registered_schemas)
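
The namespace expansion described in the comments of _load_adaptors above
(mapping a cpi type such as 'radical.saga.job.Service' to candidate cpi module
names) can be illustrated with a small standalone sketch; the function name
below is invented for illustration and is not part of the engine:

def cpi_candidate_modules(cpi_type):
    # 'radical.saga.job.Service' -> ['radical', 'saga', 'job', 'Service']
    nselems = cpi_type.split('.')

    # split off the class name: ['radical', 'saga', 'job'] / 'Service'
    cname = nselems.pop()

    # insert 'adaptors.cpi' after the 'saga' element:
    # ['radical', 'saga', 'adaptors', 'cpi', 'job']
    nselems[2:2] = ['adaptors', 'cpi']

    # return the class name plus both candidate module names:
    #   'radical.saga.adaptors.cpi.job'
    #   'radical.saga.adaptors.cpi.job.service'
    base = '.'.join(nselems)
    return cname, [base, '%s.%s' % (base, cname.lower())]

# cpi_candidate_modules('radical.saga.job.Service')
#   -> ('Service', ['radical.saga.adaptors.cpi.job',
#                   'radical.saga.adaptors.cpi.job.service'])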