Example #1
    def initialize(self):

        self._pwd = os.getcwd()

        self.register_input(rps.AGENT_EXECUTING_PENDING,
                            rpc.AGENT_EXECUTING_QUEUE, self.work)

        self.register_output(rps.AGENT_STAGING_OUTPUT_PENDING,
                             rpc.AGENT_STAGING_OUTPUT_QUEUE)

        self.register_publisher (rpc.AGENT_UNSCHEDULE_PUBSUB)
        self.register_subscriber(rpc.CONTROL_PUBSUB, self.command_cb)

        self._cancel_lock    = ru.RLock()
        self._cus_to_cancel  = list()
        self._watch_queue    = queue.Queue ()

        self._pid = self._cfg['pid']

        self.task_map = {}
        self.task_map_lock = ru.Lock()

        # we need the LaunchMethod to construct commands.
        assert(self._cfg['task_launch_method'] ==
               self._cfg['mpi_launch_method' ] == "ORTE_LIB"), \
               "ORTE_LIB spawner only works with ORTE_LIB LaunchMethod."

        self._task_launcher = rp.agent.LaunchMethod.create(name="ORTE_LIB",
                                           cfg=self._cfg, session=self._session)
        self._orte_initialized = False
        self._cu_environment   = self._populate_cu_environment()

        self.gtod   = "%s/gtod" % self._pwd
        self.tmpdir = tempfile.gettempdir()
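
The `initialize` above wires the component into the agent's pipeline: it registers an input queue with a `work` callback, an output queue for the next state, and control/unschedule channels.  A minimal, purely illustrative stand-in for that input/work/output pattern, using stdlib queues instead of RP's communication layer (all names here are hypothetical):

import queue
import threading

class MiniComponent:

    def __init__(self):
        self._input  = queue.Queue()   # stands in for AGENT_EXECUTING_QUEUE
        self._output = queue.Queue()   # stands in for AGENT_STAGING_OUTPUT_QUEUE

    def work(self, task):
        # process the task, then advance it to the next state
        task['state'] = 'AGENT_STAGING_OUTPUT_PENDING'
        self._output.put(task)

    def run(self):
        # pull tasks from the input queue and hand them to `work`
        while True:
            task = self._input.get()
            if task is None:           # sentinel: shut down
                break
            self.work(task)

comp = MiniComponent()
t = threading.Thread(target=comp.run)
t.start()
comp._input.put({'uid': 'task.0000'})
comp._input.put(None)
t.join()
print(comp._output.get())              # -> task in its new state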
Example #2
    def test_configure(self, mocked_init):

        session, configs = self.setUp()

        component = Default(cfg=None, session=None)
        component._cfg        = mock.Mock()
        component._log        = ru.Logger('dummy')
        component._session    = session

        component._pmgr       = 'pmgr.0'
        component._prof       = ru.Config(cfg = {'enabled': False})
        component._cache_lock = ru.Lock()
        component._cache      = dict()
        component._sandboxes  = dict()

        component._mod_dir    = os.path.dirname(os.path.abspath(__file__))
        component._root_dir   = "%s/../../src/radical/pilot/" % component._mod_dir
        component._conf_dir   = "%s/configs/" % component._root_dir

        component._rp_version    = rp.version
        component._rp_sdist_name = rp.sdist_name
        component._rp_sdist_path = rp.sdist_path

        resource = 'local.localhost'
        rcfg     = configs.local.localhost

        pilot    = {
                        'uid'         : 'pilot.0000',
                        'description' : {'cores'          : 10,
                                         'gpus'           : 2,
                                         'queue'          : 'default',
                                         'project'        : 'foo',
                                         'job_name'       : None,
                                         'runtime'        : 10,
                                         'app_comm'       : 0,
                                         'cleanup'        : 0,
                                         'memory'         : 0,
                                         'candidate_hosts': None,
                                         }
                   }
        ret = component._prepare_pilot(resource, rcfg, pilot, {})
        assert(ret['jd'].name == 'pilot.0000')

        pilot    = {
                        'uid'         : 'pilot.0000',
                        'description' : {'cores'          : 10,
                                         'gpus'           : 2,
                                         'queue'          : 'default',
                                         'project'        : 'foo',
                                         'job_name'       : 'bar',
                                         'runtime'        : 10,
                                         'app_comm'       : 0,
                                         'cleanup'        : 0,
                                         'memory'         : 0,
                                         'candidate_hosts': None,
                                         }
                   }
        ret = component._prepare_pilot(resource, rcfg, pilot, {})
        assert(ret['jd'].name == 'bar')
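
The test above relies on `mocked_init` to bypass the component's real constructor, so that `Default(cfg=None, session=None)` yields a bare instance whose attributes the test then sets by hand.  A self-contained sketch of that pattern (the decorator placement is an assumption; `Default` here is a dummy stand-in):

from unittest import mock

class Default:
    def __init__(self, cfg, session):
        raise RuntimeError('expensive setup we want to skip in tests')

@mock.patch.object(Default, '__init__', return_value=None)
def test_configure(mocked_init):
    component = Default(cfg=None, session=None)   # patched __init__ is a no-op
    component._rp_version = '0.0'                 # set attributes by hand
    assert component._rp_version == '0.0'

test_configure()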
Example #3
def test_locks():
    '''
    Test debug lock wrappers
    '''

    os.environ['RADICAL_DEBUG'] = 'True'

    l  = ru.Lock()
    rl = ru.RLock(name='bar')

    assert(not l.waits)
    assert(not rl.waits)

    with l:
        with rl:
            assert(not l.waits)
            assert(not rl.waits)

    assert(l.name  in ru.debug._debug_helper.locks)                       # noqa
    assert(rl.name in ru.debug._debug_helper.rlocks)                      # noqa

    ru.debug._debug_helper.unregister_lock(l.name)                        # noqa
    ru.debug._debug_helper.unregister_rlock(rl.name)                      # noqa

    assert(l.name  not in ru.debug._debug_helper.locks)                   # noqa
    assert(rl.name not in ru.debug._debug_helper.rlocks)                  # noqa
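
What `ru.Lock` and `ru.RLock` appear to provide under `RADICAL_DEBUG` is a named-lock registry that tests can introspect.  A minimal sketch of that idea (this is not the radical.utils implementation):

import threading

class _DebugHelper:
    def __init__(self):
        self.locks = dict()                       # name -> lock instance
    def unregister_lock(self, name):
        del self.locks[name]

_debug_helper = _DebugHelper()

class Lock:
    def __init__(self, name=None):
        self.name  = name or 'lock.%d' % id(self)
        self.waits = False                        # True while blocked on acquire
        self._lock = threading.Lock()
        _debug_helper.locks[self.name] = self
    def __enter__(self):
        self.waits = True
        self._lock.acquire()
        self.waits = False
        return self
    def __exit__(self, *exc):
        self._lock.release()

with Lock(name='foo') as l:
    assert not l.waits
assert 'foo' in _debug_helper.locks
_debug_helper.unregister_lock('foo')
assert 'foo' not in _debug_helper.locks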
Example #4
    def initialize(self):

        self._sid        = self._cfg['sid']
        self._dburl      = self._cfg['dburl']

        # TODO: get db handle from a connected session
        _, db, _, _, _   = ru.mongodb_connect(self._dburl)
        self._mongo_db   = db
        self._coll       = self._mongo_db[self._sid]
        self._bulk       = self._coll.initialize_ordered_bulk_op()
        self._last       = time.time()        # time of last bulk push
        self._uids       = list()             # list of collected uids
        self._lock       = ru.Lock()          # protect _bulk

        self._bulk_time = self._cfg.bulk_time
        self._bulk_size = self._cfg.bulk_size

        self.register_subscriber(rpc.STATE_PUBSUB, self._state_cb)
        self.register_timed_cb(self._idle_cb, timer=self._bulk_time)
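
The component above implements a time- and size-bounded bulk push: documents accumulate until either `bulk_size` entries have collected or `bulk_time` seconds have passed since the last push.  The same policy in isolation, with a hypothetical `push_bulk` callable standing in for the MongoDB bulk execution:

import time
import threading

class BulkCollector:

    def __init__(self, push_bulk, bulk_time=1.0, bulk_size=100):
        self._push      = push_bulk
        self._bulk_time = bulk_time
        self._bulk_size = bulk_size
        self._docs      = list()
        self._last      = time.time()       # time of last bulk push
        self._lock      = threading.Lock()  # protect _docs

    def add(self, doc):
        with self._lock:
            self._docs.append(doc)
            self._flush_maybe()

    def idle_cb(self):
        # called periodically, mirrors `register_timed_cb` above
        with self._lock:
            self._flush_maybe(force_time=True)

    def _flush_maybe(self, force_time=False):
        too_old = (time.time() - self._last) > self._bulk_time
        too_big = len(self._docs) >= self._bulk_size
        if self._docs and (too_big or (force_time and too_old)):
            self._push(self._docs)
            self._docs = list()
            self._last = time.time()

collector = BulkCollector(push_bulk=print, bulk_time=0.5, bulk_size=3)
for i in range(5):
    collector.add({'uid': 'task.%04d' % i})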
Example #5
    def __init__(self, cfg):

        if isinstance(cfg, str): cfg = ru.Config(cfg=ru.read_json(cfg))
        else                   : cfg = ru.Config(cfg=cfg)

        self._n_cores = cfg.cores
        self._n_gpus  = cfg.gpus

        self._info    = ru.Config(cfg=cfg.get('info', {}))
        self._session = Session(cfg=cfg, uid=cfg.sid, _primary=False)

        rpu.Component.__init__(self, cfg, self._session)

        self._term    = mp.Event()          # set to terminate
        self._res_evt = mp.Event()          # set on free resources

        self._mlock   = ru.Lock(self._uid)  # lock `_modes` and `_mdata`
        self._modes   = dict()              # call modes (call, exec, eval, ...)
        self._mdata   = dict()              # call mode meta data

        # We need to make sure to run only up to `gpn` tasks using a GPU
        # within that pool, so we need a separate counter for that.
        self._resources = {'cores' : [0] * self._n_cores,
                           'gpus'  : [0] * self._n_gpus}

        # resources are initially all free
        self._res_evt.set()

      # # create a multiprocessing pool with `cpn` worker processors.  Set
      # # `maxtasksperchild` to `1` so that we get a fresh process for each
      # # task.  That will also allow us to run command lines via `exec`,
      # # effectively replacing the worker process in the pool for a specific
      # # task.
      # #
      # # We use a `fork` context to inherit log and profile handles.
      # #
      # # NOTE: The mp documentation is wrong; mp.Pool does *not* have a context
      # #       parameter.  Instead, the Pool has to be created within
      # #       a context.
      # ctx = mp.get_context('fork')
      # self._pool = ctx.Pool(processes=self._n_cores,
      #                       initializer=None,
      #                       maxtasksperchild=1)
      # NOTE: a multiprocessing pool won't work, as pickle is not able to
      #       serialize our worker object.  So we use our own process pool.
      #       It's not much of a loss since we want to respawn new processes for
      #       each task anyway (to improve isolation).
        self._pool  = dict()  # map task uid to process instance
        self._plock = ru.Lock('p' + self._uid)  # lock _pool

        # We also create a queue for communicating results back, and a thread to
        # watch that queue
        self._result_queue = mp.Queue()
        self._result_thread = mt.Thread(target=self._result_watcher)
        self._result_thread.daemon = True
        self._result_thread.start()

        # connect to master
        self.register_subscriber(rpc.CONTROL_PUBSUB, self._control_cb)
        self.register_publisher(rpc.CONTROL_PUBSUB)

        # run worker initialization *before* starting to work on requests.
        # the worker provides four builtin methods:
        #     eval:  evaluate a piece of python code
        #     exec:  execute  a command line (fork/exec)
        #     shell: execute  a shell command
        #     call:  execute  a method or function call
        self.register_mode('call',  self._call)
        self.register_mode('eval',  self._eval)
        self.register_mode('exec',  self._exec)
        self.register_mode('shell', self._shell)

        self.pre_exec()

        # connect to the request / response ZMQ queues
        self._res_put = ru.zmq.Putter('to_res', self._info.res_addr_put)
        self._req_get = ru.zmq.Getter('to_req', self._info.req_addr_get,
                                                cb=self._request_cb)

        # the worker can return custom information which will be made available
        # to the master.  This can be used to communicate, for example, worker
        # specific communication endpoints.

        # `info` is a placeholder for any additional meta data communicated to
        # the worker
        self.publish(rpc.CONTROL_PUBSUB, {'cmd': 'worker_register',
                                          'arg': {'uid' : self._uid,
                                                  'info': self._info}})
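
A central piece above is the result path: spawned processes put results onto a `mp.Queue`, and a daemon thread in the worker drains it.  That pattern in isolation (illustrative names only):

import multiprocessing as mp
import threading as mt

def child(result_queue, uid):
    result_queue.put({'uid': uid, 'out': uid.upper()})

def result_watcher(result_queue):
    while True:
        res = result_queue.get()
        if res is None:                    # sentinel: shut down
            break
        print('collected', res)

if __name__ == '__main__':
    rq = mp.Queue()
    watcher = mt.Thread(target=result_watcher, args=(rq,))
    watcher.daemon = True
    watcher.start()

    procs = [mp.Process(target=child, args=(rq, 'task.%d' % i))
             for i in range(3)]
    for p in procs: p.start()
    for p in procs: p.join()

    rq.put(None)
    watcher.join()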
Example #6
    def __init__(self, cfg=None, backend='zmq'):

        self._backend = backend  # FIXME: use

        self._lock     = ru.Lock('master')  # lock the request dict on updates
        self._workers  = dict()             # wid: worker
        self._requests = dict()             # bookkeeping of submitted requests

        cfg.sid = os.environ['RP_SESSION_ID']
        cfg.base = os.environ['RP_PILOT_SANDBOX']
        cfg.path = os.environ['RP_PILOT_SANDBOX']
        self._session = Session(cfg=cfg, uid=cfg.sid, _primary=False)
        cfg = self._get_config(cfg)

        rpu.Component.__init__(self, cfg, self._session)

        self.register_output(rps.AGENT_STAGING_INPUT_PENDING,
                             rpc.AGENT_STAGING_INPUT_QUEUE)
        self.register_subscriber(rpc.CONTROL_PUBSUB, self._control_cb)

        # set up RU ZMQ Queues for request distribution and result collection
        req_cfg = ru.Config(
            cfg={
                'channel': '%s.to_req' % self._uid,
                'type': 'queue',
                'uid': self._uid + '.req',
                'path': os.getcwd(),
                'stall_hwm': 0,
                'bulk_size': 56
            })

        res_cfg = ru.Config(
            cfg={
                'channel': '%s.to_res' % self._uid,
                'type': 'queue',
                'uid': self._uid + '.res',
                'path': os.getcwd(),
                'stall_hwm': 0,
                'bulk_size': 56
            })

        self._req_queue = ru.zmq.Queue(req_cfg)
        self._res_queue = ru.zmq.Queue(res_cfg)

        self._req_queue.start()
        self._res_queue.start()

        self._req_addr_put = str(self._req_queue.addr_put)
        self._req_addr_get = str(self._req_queue.addr_get)

        self._res_addr_put = str(self._res_queue.addr_put)
        self._res_addr_get = str(self._res_queue.addr_get)

        # this master will put requests onto the request queue, and will get
        # responses from the response queue.  Note that the responses will be
        # delivered via an async callback (`self._result_cb`).
        self._req_put = ru.zmq.Putter('%s.to_req' % self._uid,
                                      self._req_addr_put)
        self._res_get = ru.zmq.Getter('%s.to_res' % self._uid,
                                      self._res_addr_get,
                                      cb=self._result_cb)

        # for the workers it is the opposite: they will get requests from the
        # request queue, and will send responses to the response queue.
        self._info = {
            'req_addr_get': self._req_addr_get,
            'res_addr_put': self._res_addr_put
        }

        # make sure the channels are up before allowing to submit requests
        time.sleep(1)

        self._log.debug('startup complete')
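
The master's channel layout above is symmetric: it puts onto the request queue and receives results via an async callback, while workers do the opposite.  A stdlib-only sketch of that request/response flow, with `queue.Queue` standing in for the ZMQ queues:

import time
import queue
import threading

req_q = queue.Queue()   # master -> workers
res_q = queue.Queue()   # workers -> master

def worker():
    while True:
        req = req_q.get()
        if req is None:
            break
        res_q.put({'req': req, 'result': req['x'] * 2})

def result_cb():
    while True:
        res = res_q.get()
        if res is None:
            break
        print('result_cb:', res)

threading.Thread(target=worker,    daemon=True).start()
threading.Thread(target=result_cb, daemon=True).start()

for x in range(3):
    req_q.put({'x': x})

time.sleep(0.1)         # crude stand-in for "channels are up / drained"
req_q.put(None)
res_q.put(None)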
Example #7
    def _delayed_configure(self, cud):

        if self._configured:
            return

        self.chunk = {
            'cpu_processes': cud['cpu_processes'],
            'cpu_process_type': cud['cpu_process_type'],
            'cpu_threads': cud['cpu_threads'],
            'cpu_thread_type': cud['cpu_thread_type'],
            'gpu_processes': cud['gpu_processes'],
            'gpu_process_type': cud['gpu_process_type'],
            'gpu_threads': cud['gpu_threads'],
            'gpu_thread_type': cud['gpu_thread_type'],
        }

        self.cpn = self._rm_cores_per_node
        self.gpn = self._rm_gpus_per_node

        self.free = list()  # list of free chunks
        self.lock = ru.Lock()  # lock for the above list

        cores_needed = cud['cpu_processes'] * cud['cpu_threads']
        gpus_needed = cud['gpu_processes']

        # check if we need single or multi-node chunks
        if  cud['cpu_process_type'] != 'MPI' and \
            cud['gpu_process_type'] != 'MPI' :

            # single node task - check if it fits
            if cores_needed > self.cpn or \
               gpus_needed  > self.gpn:
                raise ValueError('unit does not fit on node')

        # ---------------------------------------------------------------------
        # create as many equal sized chunks from the available nodes as
        # possible, and put them into the `free` list.  The actual scheduling
        # algorithm will blindly pick chunks from that list whenever a new CUD
        # arrives.
        cblock = cud['cpu_threads']
        ncblocks = cud['cpu_processes']
        cblocks = list()
        cidx = 0

        while cidx + cblock <= self.cpn:
            cblocks.append(list(range(cidx, cidx + cblock)))
            cidx += cblock

        gblock = 1
        ngblocks = cud['gpu_processes']
        gblocks = list()
        gidx = 0
        while gidx + gblock <= self.gpn:
            gblocks.append(list(range(gidx, gidx + gblock)))
            gidx += gblock

        self._log.debug('core blocks %s', cblocks)
        self._log.debug('gpu  blocks %s', gblocks)

        for node in self.nodes:
            node['cblocks'] = copy.deepcopy(cblocks)
            node['gblocks'] = copy.deepcopy(gblocks)

        # ----------------------------------------------------------------------
        def next_slot(slot=None):
            if slot:
                del (slot['ncblocks'])
                del (slot['ngblocks'])
                self.free.append(slot)
            return {
                'nodes': list(),
                'cores_per_node': self.cpn,
                'gpus_per_node': self.gpn,
                'lm_info': self._rm_lm_info,
                'ncblocks': 0,
                'ngblocks': 0
            }

        # ---------------------------------------------------------------------
        nidx = 0
        nnodes = len(self.nodes)
        slot = next_slot()
        while nidx < nnodes:

            if  slot['ncblocks'] == ncblocks and \
                slot['ngblocks'] == ngblocks :
                slot = next_slot(slot)

            node = self.nodes[nidx]
            nuid = node['uid']
            nname = node['name']
            ok = True

            while slot['ncblocks'] < ncblocks:
                if node['cblocks']:
                    cblock = node['cblocks'].pop(0)
                    slot['nodes'].append({
                        'name': nname,
                        'uid': nuid,
                        'core_map': [cblock],
                        'gpu_map': []
                    })
                    slot['ncblocks'] += 1
                else:
                    ok = False
                    break

            while slot['ngblocks'] < ngblocks:
                if node['gblocks']:

                    # move the process onto core `0` (oversubscription enabled)
                    gblock = node['gblocks'].pop(0)
                    slot['nodes'].append({
                        'name': nname,
                        'uid': nuid,
                        'core_map': [[0]],
                        'gpu_map': [gblock]
                    })
                    slot['ngblocks'] += 1
                else:
                    ok = False
                    break

            if ok:
                self.free.append(slot)
                slot = next_slot()
                continue

            nidx += 1

        if  slot['ncblocks'] == ncblocks and \
            slot['ngblocks'] == ngblocks :
            self.free.append(slot)

        if not self.free:
            raise RuntimeError(
                'configuration cannot be used for this workload')

        # run this method only once
        self._configured = True
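
The heart of `_delayed_configure` is the block-cutting loop: each node's `cpn` cores are cut into as many disjoint blocks of `cpu_threads` cores as fit, and leftover cores that do not fill a block stay unused.  That step in isolation:

def make_blocks(per_node, block_size):
    # cut `per_node` resource indexes into disjoint blocks of `block_size`
    blocks = list()
    idx    = 0
    while idx + block_size <= per_node:
        blocks.append(list(range(idx, idx + block_size)))
        idx += block_size
    return blocks

print(make_blocks(per_node=8, block_size=3))
# -> [[0, 1, 2], [3, 4, 5]]  (cores 6 and 7 remain unused)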
Example #8
    def __init__(self, session, cfg='default'):
        '''
        Creates a new PilotManager and attaches it to the session.

        **Arguments:**
            * session [:class:`rp.Session`]:
              The session instance to use.
            * cfg (`dict` or `string`):
              The configuration or name of configuration to use.

        **Returns:**
            * A new `PilotManager` object [:class:`rp.PilotManager`].
        '''

        assert (session.primary), 'pmgr needs primary session'

        self._pilots = dict()
        self._pilots_lock = ru.RLock('pmgr.pilots_lock')
        self._callbacks = dict()
        self._pcb_lock = ru.RLock('pmgr.pcb_lock')
        self._terminate = mt.Event()
        self._closed = False
        self._rec_id = 0  # used for session recording
        self._uid = ru.generate_id('pmgr.%(item_counter)04d',
                                   ru.ID_CUSTOM,
                                   ns=session.uid)

        for m in rpc.PMGR_METRICS:
            self._callbacks[m] = dict()

        # NOTE: `name` and `cfg` are overloaded, the user cannot point to
        #       a predefined config and amend it at the same time.  This might
        #       be ok for the session, but introduces a minor API inconsistency.
        #
        name = None
        if isinstance(cfg, str):
            name = cfg
            cfg = None

        cfg = ru.Config('radical.pilot.pmgr', name=name, cfg=cfg)
        cfg.uid = self._uid
        cfg.owner = self._uid
        cfg.sid = session.uid
        cfg.base = session.base
        cfg.path = session.path
        cfg.dburl = session.dburl
        cfg.heartbeat = session.cfg.heartbeat

        rpu.Component.__init__(self, cfg, session=session)
        self.start()

        self._log.info('started pmgr %s', self._uid)
        self._rep.info('<<create pilot manager')

        # create pmgr bridges and components, use session cmgr for that
        self._cmgr = rpu.ComponentManager(self._cfg)
        self._cmgr.start_bridges()
        self._cmgr.start_components()

        # The output queue is used to forward submitted pilots to the
        # launching component.
        self.register_output(rps.PMGR_LAUNCHING_PENDING,
                             rpc.PMGR_LAUNCHING_QUEUE)

        # we also listen on the control pubsub, to learn about completed staging
        # directives
        self.register_subscriber(rpc.CONTROL_PUBSUB, self._staging_ack_cb)
        self._active_sds = dict()
        self._sds_lock = ru.Lock('pmgr_sds_lock')

        # register the state notification pull cb and hb pull cb
        # FIXME: we may want to have the frequency configurable
        # FIXME: this should be a tailing cursor in the update worker
        self.register_timed_cb(self._state_pull_cb,
                               timer=self._cfg['db_poll_sleeptime'])
        self.register_timed_cb(self._pilot_heartbeat_cb,
                               timer=self._cfg['db_poll_sleeptime'])

        # also listen to the state pubsub for pilot state changes
        self.register_subscriber(rpc.STATE_PUBSUB, self._state_sub_cb)

        # let session know we exist
        self._session._register_pmgr(self)

        self._prof.prof('setup_done', uid=self._uid)
        self._rep.ok('>>ok\n')
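
Typical use of this constructor, sketched under the assumption of a fully configured RP environment (including the MongoDB backend this RP generation requires); exact class names depend on the release, e.g. older versions use `ComputePilotDescription`:

import radical.pilot as rp

session = rp.Session()              # primary session, as the assert requires
try:
    pmgr = rp.PilotManager(session=session)

    pd = rp.PilotDescription()      # `ComputePilotDescription` in older releases
    pd.resource = 'local.localhost'
    pd.cores    = 2
    pd.runtime  = 10

    pilot = pmgr.submit_pilots(pd)
finally:
    session.close()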
Example #9
    def __init__(self, workload, resource, replicas=None):

        self._uid = ru.generate_id('rx')
        self._prof = ru.Profiler('radical.repex')
        self._prof.prof('create', uid=self._uid)

        self._workload = ru.Config(cfg=workload)
        self._resource = ru.Config(cfg=resource)
        self._replicas = replicas

        # the replicas need to be aware of the pre_exec directives
        self._workload.pre_exec = self._resource.pre_exec

        assert (self._workload.config.replicas or self._replicas)
        assert (self._workload.config.cycles)

        self._cycles = self._workload.config.cycles
        self._waitlist = list()

        if self._replicas:
            self._workload.config.replicas = len(self._replicas)
        else:
            self._replicas = [
                Replica(workload=self._workload)
                for _ in range(self._workload.config.replicas)
            ]

        self._pre_alg = prepare_algs.get(self._workload.prepare.algorithm)
        self._sel_alg = selection_algs.get(self._workload.selection.algorithm)
        self._exc_alg = exchange_algs.get(self._workload.exchange.algorithm)

        # if the configured algorithms are not known (not hard-coded in RX),
        # then assume they point to user specified files and load them
        if not self._pre_alg:
            filename, funcname = self._workload.prepare.algorithm.split(':')
            syms = ru.import_file(filename)
            self._pre_alg = syms['functions'][funcname]

        if not self._sel_alg:
            filename, funcname = self._workload.selection.algorithm.split(':')
            syms = ru.import_file(filename)
            self._sel_alg = syms['functions'][funcname]

        if not self._exc_alg:
            filename, funcname = self._workload.exchange.algorithm.split(':')
            syms = ru.import_file(filename)
            self._exc_alg = syms['functions'][funcname]

        assert (self._pre_alg), 'preparation algorithm missing'
        assert (self._sel_alg), 'selection algorithm missing'
        assert (self._exc_alg), 'exchange algorithm missing'

        rmq_host = str(self._resource.get('rmq_host', 'localhost'))
        rmq_port = int(self._resource.get('rmq_port', '5672'))
        rmq_user = str(self._resource.get('rmq_user', 'guest'))
        rmq_pass = str(self._resource.get('rmq_pass', 'guest'))
        re.AppManager.__init__(self,
                               autoterminate=True,
                               hostname=rmq_host,
                               port=rmq_port,
                               username=rmq_user,
                               password=rmq_pass)

        for r in self._replicas:
            r._initialize(check_ex=self._check_exchange,
                          check_res=self._check_resume,
                          sid=self.sid,
                          prof=self._prof)

        self._lock = ru.Lock(name='rx')

        rd = copy.deepcopy(self._resource)
        if 'rmq_host' in rd: del (rd['rmq_host'])
        if 'rmq_port' in rd: del (rd['rmq_port'])
        if 'pre_exec' in rd: del (rd['pre_exec'])

        self.resource_desc = rd

        self._log = ru.Logger('radical.repex')
        self._dout = open('dump.log', 'a')
        self._dump(msg='startup')

        # run the replica pipelines
        self.workflow = set(self._replicas)
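
The `filename:funcname` fallback above lets users supply their own algorithms as plain Python files.  A sketch of that plugin convention with stdlib importlib in place of `ru.import_file`:

import importlib.util

def load_plugin(spec_str):
    # 'my_algs.py:random_exchange' -> callable `random_exchange` from that file
    filename, funcname = spec_str.split(':')
    spec   = importlib.util.spec_from_file_location('user_plugin', filename)
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return getattr(module, funcname)

# exc_alg = load_plugin('my_algs.py:random_exchange')   # hypothetical file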