def initialize(self):

    self._pwd = os.getcwd()

    self.register_input(rps.AGENT_EXECUTING_PENDING,
                        rpc.AGENT_EXECUTING_QUEUE, self.work)

    self.register_output(rps.AGENT_STAGING_OUTPUT_PENDING,
                         rpc.AGENT_STAGING_OUTPUT_QUEUE)

    self.register_publisher (rpc.AGENT_UNSCHEDULE_PUBSUB)
    self.register_subscriber(rpc.CONTROL_PUBSUB, self.command_cb)

    self._cancel_lock   = ru.RLock()
    self._cus_to_cancel = list()
    self._watch_queue   = queue.Queue()

    self._pid = self._cfg['pid']

    self.task_map      = {}
    self.task_map_lock = ru.Lock()

    # we need the LaunchMethod to construct commands.
    assert(self._cfg['task_launch_method'] ==
           self._cfg['mpi_launch_method' ] == "ORTE_LIB"), \
           "ORTE_LIB spawner only works with ORTE_LIB LaunchMethod."

    self._task_launcher = rp.agent.LaunchMethod.create(
            name="ORTE_LIB", cfg=self._cfg, session=self._session)

    self._orte_initialized = False
    self._cu_environment   = self._populate_cu_environment()

    self.gtod   = "%s/gtod" % self._pwd
    self.tmpdir = tempfile.gettempdir()
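# A minimal sketch of the agent configuration this initializer expects
# (assumption: real agent configs carry many more keys; only the keys read
# above are shown, and the pilot id is a placeholder):
cfg = {
    'pid'               : 'pilot.0000',
    'task_launch_method': 'ORTE_LIB',   # both launch methods must be
    'mpi_launch_method' : 'ORTE_LIB',   # ORTE_LIB, or the assert fires
}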
def test_configure(self, mocked_init):

    session, configs = self.setUp()

    component = Default(cfg=None, session=None)
    component._cfg        = mock.Mock()
    component._log        = ru.Logger('dummy')
    component._rp_version = '0.0'
    component._session    = session
    component._pmgr       = 'pmgr.0'
    component._prof       = ru.Config(cfg={'enabled': False})
    component._cache_lock = ru.Lock()
    component._cache      = dict()
    component._sandboxes  = dict()

    component._mod_dir  = os.path.dirname(os.path.abspath(__file__))
    component._root_dir = "%s/../../src/radical/pilot/" % component._mod_dir
    component._conf_dir = "%s/configs/" % component._root_dir

    component._rp_version    = rp.version
    component._rp_sdist_name = rp.sdist_name
    component._rp_sdist_path = rp.sdist_path

    resource = 'local.localhost'
    rcfg     = configs.local.localhost

    # without an explicit job name, the job description is named after
    # the pilot uid
    pilot = {'uid'         : 'pilot.0000',
             'description' : {'cores'          : 10,
                              'gpus'           : 2,
                              'queue'          : 'default',
                              'project'        : 'foo',
                              'job_name'       : None,
                              'runtime'        : 10,
                              'app_comm'       : 0,
                              'cleanup'        : 0,
                              'memory'         : 0,
                              'candidate_hosts': None}}

    ret = component._prepare_pilot(resource, rcfg, pilot, {})
    assert(ret['jd'].name == 'pilot.0000')

    # an explicit job name takes precedence over the pilot uid
    pilot = {'uid'         : 'pilot.0000',
             'description' : {'cores'          : 10,
                              'gpus'           : 2,
                              'queue'          : 'default',
                              'project'        : 'foo',
                              'job_name'       : 'bar',
                              'runtime'        : 10,
                              'app_comm'       : 0,
                              'cleanup'        : 0,
                              'memory'         : 0,
                              'candidate_hosts': None}}

    ret = component._prepare_pilot(resource, rcfg, pilot, {})
    assert(ret['jd'].name == 'bar')
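# The naming fallback the test exercises, as a one-line sketch (assumption:
# the actual logic inside `_prepare_pilot` is more involved):
job_name = pilot['description'].get('job_name') or pilot['uid']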
def test_locks():
    '''
    Test debug lock wrappers
    '''

    os.environ['RADICAL_DEBUG'] = 'True'

    l  = ru.Lock()
    rl = ru.RLock(name='bar')

    assert(not l.waits)
    assert(not rl.waits)

    with l:
        with rl:
            assert(not l.waits)
            assert(not rl.waits)

    assert(l.name      in ru.debug._debug_helper.locks)   # noqa
    assert(rl.name     in ru.debug._debug_helper.rlocks)  # noqa

    ru.debug._debug_helper.unregister_lock(l.name)        # noqa
    ru.debug._debug_helper.unregister_rlock(rl.name)      # noqa

    assert(l.name  not in ru.debug._debug_helper.locks)   # noqa
    assert(rl.name not in ru.debug._debug_helper.rlocks)  # noqa
def initialize(self):

    self._sid   = self._cfg['sid']
    self._dburl = self._cfg['dburl']

    # TODO: get db handle from a connected session
    _, db, _, _, _ = ru.mongodb_connect(self._dburl)

    self._mongo_db = db
    self._coll     = self._mongo_db[self._sid]
    self._bulk     = self._coll.initialize_ordered_bulk_op()
    self._last     = time.time()  # time of last bulk push
    self._uids     = list()       # list of collected uids
    self._lock     = ru.Lock()    # protect _bulk

    self._bulk_time = self._cfg.bulk_time
    self._bulk_size = self._cfg.bulk_size

    self.register_subscriber(rpc.STATE_PUBSUB, self._state_cb)
    self.register_timed_cb(self._idle_cb, timer=self._bulk_time)
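# A sketch of what the registered `_idle_cb` could look like (assumption:
# the real implementation lives elsewhere in this class): flush the
# collected bulk once it grows past `bulk_size` or has not been pushed
# for `bulk_time` seconds.
def _idle_cb(self):

    with self._lock:
        now = time.time()
        if len(self._uids) >= self._bulk_size or \
           (self._uids and now - self._last >= self._bulk_time):
            self._bulk.execute()                               # push bulk
            self._bulk = self._coll.initialize_ordered_bulk_op()
            self._uids = list()
            self._last = now
    return True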
def __init__(self, cfg):

    if isinstance(cfg, str): cfg = ru.Config(cfg=ru.read_json(cfg))
    else                   : cfg = ru.Config(cfg=cfg)

    self._n_cores = cfg.cores
    self._n_gpus  = cfg.gpus

    self._info    = ru.Config(cfg=cfg.get('info', {}))
    self._session = Session(cfg=cfg, uid=cfg.sid, _primary=False)

    rpu.Component.__init__(self, cfg, self._session)

    self._term    = mp.Event()          # set to terminate
    self._res_evt = mp.Event()          # set on free resources

    self._mlock   = ru.Lock(self._uid)  # lock `_modes` and `_mdata`
    self._modes   = dict()              # call modes (call, exec, eval, ...)
    self._mdata   = dict()              # call mode meta data

    # We need to make sure to run only up to `gpn` tasks using a GPU
    # within that pool, so we need a separate counter for that.
    self._resources = {'cores': [0] * self._n_cores,
                       'gpus' : [0] * self._n_gpus}

    # resources are initially all free
    self._res_evt.set()

  # # create a multiprocessing pool with `cpn` worker processors.  Set
  # # `maxtasksperchild` to `1` so that we get a fresh process for each
  # # task.  That will also allow us to run command lines via `exec`,
  # # effectively replacing the worker process in the pool for a specific
  # # task.
  # #
  # # We use a `fork` context to inherit log and profile handles.
  # #
  # # NOTE: The mp documentation is wrong; mp.Pool does *not* have a
  # #       context parameter.  Instead, the Pool has to be created
  # #       within a context.
  # ctx = mp.get_context('fork')
  # self._pool = ctx.Pool(processes=self._n_cores,
  #                       initializer=None,
  #                       maxtasksperchild=1)

    # NOTE: a multiprocessing pool won't work, as pickle is not able to
    #       serialize our worker object.  So we use our own process pool.
    #       It's not much of a loss since we want to respawn new processes
    #       for each task anyway (to improve isolation).
    self._pool  = dict()                    # map task uid to process instance
    self._plock = ru.Lock('p' + self._uid)  # lock _pool

    # We also create a queue for communicating results back, and a thread
    # to watch that queue
    self._result_queue = mp.Queue()
    self._result_thead = mt.Thread(target=self._result_watcher)
    self._result_thead.daemon = True
    self._result_thead.start()

    # connect to master
    self.register_subscriber(rpc.CONTROL_PUBSUB, self._control_cb)
    self.register_publisher(rpc.CONTROL_PUBSUB)

    # run worker initialization *before* starting to work on requests.
    # the worker provides four builtin methods:
    #   eval : evaluate a piece of python code
    #   exec : execute a command line (fork/exec)
    #   shell: execute a shell command
    #   call : execute a method or function call
    self.register_mode('call',  self._call)
    self.register_mode('eval',  self._eval)
    self.register_mode('exec',  self._exec)
    self.register_mode('shell', self._shell)

    self.pre_exec()

    # connect to the request / response ZMQ queues
    self._res_put = ru.zmq.Putter('to_res', self._info.res_addr_put)
    self._req_get = ru.zmq.Getter('to_req', self._info.req_addr_get,
                                            cb=self._request_cb)

    # the worker can return custom information which will be made
    # available to the master.  This can be used to communicate, for
    # example, worker specific communication endpoints.
    # `info` is a placeholder for any additional meta data communicated
    # to the worker
    self.publish(rpc.CONTROL_PUBSUB, {'cmd': 'worker_register',
                                      'arg': {'uid' : self._uid,
                                              'info': self._info}})
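# A usage sketch for the mode registry (assumptions: `Worker` names the
# class above, `MyWorker`, `_hello` and the request layout are purely
# illustrative): custom modes can be registered next to the builtin
# `call` / `eval` / `exec` / `shell` modes, and a request then selects
# a mode by name.
class MyWorker(Worker):

    def pre_exec(self):
        # register a custom mode before requests are served
        self.register_mode('hello', self._hello)

    def _hello(self, data):
        # return convention is assumed, not taken from the source
        return 'hello %s' % data['name']

# a request addressed to that mode could then look like:
request = {'mode': 'hello', 'data': {'name': 'world'}}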
def __init__(self, cfg=None, backend='zmq'):

    self._backend = backend  # FIXME: use

    self._lock     = ru.Lock('master')
    self._workers  = dict()      # wid: worker
    self._requests = dict()      # bookkeeping of submitted requests
    self._lock     = mt.Lock()   # lock the request dist on updates

    cfg.sid  = os.environ['RP_SESSION_ID']
    cfg.base = os.environ['RP_PILOT_SANDBOX']
    cfg.path = os.environ['RP_PILOT_SANDBOX']

    self._session = Session(cfg=cfg, uid=cfg.sid, _primary=False)

    cfg = self._get_config(cfg)
    rpu.Component.__init__(self, cfg, self._session)

    self.register_output(rps.AGENT_STAGING_INPUT_PENDING,
                         rpc.AGENT_STAGING_INPUT_QUEUE)
    self.register_subscriber(rpc.CONTROL_PUBSUB, self._control_cb)

    # set up RU ZMQ Queues for request distribution and result collection
    req_cfg = ru.Config(cfg={'channel'  : '%s.to_req' % self._uid,
                             'type'     : 'queue',
                             'uid'      : self._uid + '.req',
                             'path'     : os.getcwd(),
                             'stall_hwm': 0,
                             'bulk_size': 56})

    res_cfg = ru.Config(cfg={'channel'  : '%s.to_res' % self._uid,
                             'type'     : 'queue',
                             'uid'      : self._uid + '.res',
                             'path'     : os.getcwd(),
                             'stall_hwm': 0,
                             'bulk_size': 56})

    self._req_queue = ru.zmq.Queue(req_cfg)
    self._res_queue = ru.zmq.Queue(res_cfg)

    self._req_queue.start()
    self._res_queue.start()

    self._req_addr_put = str(self._req_queue.addr_put)
    self._req_addr_get = str(self._req_queue.addr_get)

    self._res_addr_put = str(self._res_queue.addr_put)
    self._res_addr_get = str(self._res_queue.addr_get)

    # this master will put requests onto the request queue, and will get
    # responses from the response queue.  Note that the responses will
    # be delivered via an async callback (`self._result_cb`).
    self._req_put = ru.zmq.Putter('%s.to_req' % self._uid,
                                  self._req_addr_put)
    self._res_get = ru.zmq.Getter('%s.to_res' % self._uid,
                                  self._res_addr_get,
                                  cb=self._result_cb)

    # for the workers it is the opposite: they will get requests from the
    # request queue, and will send responses to the response queue.
    self._info = {'req_addr_get': self._req_addr_get,
                  'res_addr_put': self._res_addr_put}

    # make sure the channels are up before allowing to submit requests
    time.sleep(1)

    # connect to the local agent
    self._log.debug('startup complete')
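# How the two queues are wired, as a sketch (assumptions: the request uid
# and dict layout are illustrative, not the actual wire protocol):
#
#   master --Putter--> to_req --Getter--> worker  (requests)
#   master <--Getter-- to_res <--Putter-- worker  (responses)
#
# submitting a request from the master side could then look like this,
# with the response arriving asynchronously via `self._result_cb`:
self._requests['req.0000'] = {'state': 'NEW'}           # hypothetical
self._req_put.put({'uid': 'req.0000', 'mode': 'call'})  # hypothetical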
def _delayed_configure(self, cud):

    if self._configured:
        return

    self.chunk = {'cpu_processes'   : cud['cpu_processes'],
                  'cpu_process_type': cud['cpu_process_type'],
                  'cpu_threads'     : cud['cpu_threads'],
                  'cpu_thread_type' : cud['cpu_thread_type'],

                  'gpu_processes'   : cud['gpu_processes'],
                  'gpu_process_type': cud['gpu_process_type'],
                  'gpu_threads'     : cud['gpu_threads'],
                  'gpu_thread_type' : cud['gpu_thread_type'],
                  }

    self.cpn = self._rm_cores_per_node
    self.gpn = self._rm_gpus_per_node

    self.free = list()     # list of free chunks
    self.lock = ru.Lock()  # lock for the above list

    cores_needed = cud['cpu_processes'] * cud['cpu_threads']
    gpus_needed  = cud['gpu_processes']

    # check if we need single or multi-node chunks
    if cud['cpu_process_type'] != 'MPI' and \
       cud['gpu_process_type'] != 'MPI':

        # single node task - check if it fits
        if cores_needed > self.cpn or \
           gpus_needed  > self.gpn:
            raise ValueError('unit does not fit on node')

    # -----------------------------------------------------------------
    # create as many equal sized chunks from the available nodes as
    # possible, and put them into the `free` list.  The actual scheduling
    # algorithm will blindly pick chunks from that list whenever a new
    # CUD arrives.
    cblock   = cud['cpu_threads']
    ncblocks = cud['cpu_processes']
    cblocks  = list()
    cidx     = 0

    while cidx + cblock <= self.cpn:
        cblocks.append(list(range(cidx, cidx + cblock)))
        cidx += cblock

    gblock   = 1
    ngblocks = cud['gpu_processes']
    gblocks  = list()
    gidx     = 0

    while gidx + gblock <= self.gpn:
        gblocks.append(list(range(gidx, gidx + gblock)))
        gidx += gblock

    self._log.debug('core blocks %s', cblocks)
    self._log.debug('gpu  blocks %s', gblocks)

    for node in self.nodes:
        node['cblocks'] = copy.deepcopy(cblocks)
        node['gblocks'] = copy.deepcopy(gblocks)

    # -----------------------------------------------------------------
    def next_slot(slot=None):
        if slot:
            del slot['ncblocks']
            del slot['ngblocks']
            self.free.append(slot)
        return {'nodes'         : list(),
                'cores_per_node': self.cpn,
                'gpus_per_node' : self.gpn,
                'lm_info'       : self._rm_lm_info,
                'ncblocks'      : 0,
                'ngblocks'      : 0}
    # -----------------------------------------------------------------

    nidx   = 0
    nnodes = len(self.nodes)
    slot   = next_slot()

    while nidx < nnodes:

        if slot['ncblocks'] == ncblocks and \
           slot['ngblocks'] == ngblocks:
            slot = next_slot(slot)

        node  = self.nodes[nidx]
        nuid  = node['uid']
        nname = node['name']
        ok    = True

        while slot['ncblocks'] < ncblocks:
            if node['cblocks']:
                cblock = node['cblocks'].pop(0)
                slot['nodes'].append({'name'    : nname,
                                      'uid'     : nuid,
                                      'core_map': [cblock],
                                      'gpu_map' : []})
                slot['ncblocks'] += 1
            else:
                ok = False
                break

        while slot['ngblocks'] < ngblocks:
            if node['gblocks']:
                # move the process onto core `0` (oversubscription enabled)
                gblock = node['gblocks'].pop(0)
                slot['nodes'].append({'name'    : nname,
                                      'uid'     : nuid,
                                      'core_map': [[0]],
                                      'gpu_map' : [gblock]})
                slot['ngblocks'] += 1
            else:
                ok = False
                break

        if ok:
            self.free.append(slot)
            slot = next_slot()
            continue

        nidx += 1

    if slot['ncblocks'] == ncblocks and \
       slot['ngblocks'] == ngblocks:
        self.free.append(slot)

    if not self.free:
        raise RuntimeError('configuration cannot be used for this workload')

    # run this method only once
    self._configured = True
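# A sketch of how a scheduling step could consume the precomputed chunks
# (assumption: the method name is illustrative; the comment above only
# states that the scheduler "blindly picks chunks" from `self.free`):
def _allocate_slot(self):

    with self.lock:
        if self.free:
            return self.free.pop(0)  # grab any precomputed chunk
    return None                      # no free chunk: caller must wait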
def __init__(self, session, cfg='default'):
    '''
    Creates a new PilotManager and attaches it to the session.

    **Arguments:**
        * session [:class:`rp.Session`]:
          The session instance to use.
        * cfg (`dict` or `string`):
          The configuration or name of configuration to use.

    **Returns:**
        * A new `PilotManager` object [:class:`rp.PilotManager`].
    '''

    assert(session.primary), 'pmgr needs primary session'

    self._pilots      = dict()
    self._pilots_lock = ru.RLock('pmgr.pilots_lock')
    self._callbacks   = dict()
    self._pcb_lock    = ru.RLock('pmgr.pcb_lock')
    self._terminate   = mt.Event()
    self._closed      = False
    self._rec_id      = 0  # used for session recording
    self._uid         = ru.generate_id('pmgr.%(item_counter)04d',
                                       ru.ID_CUSTOM, ns=session.uid)

    for m in rpc.PMGR_METRICS:
        self._callbacks[m] = dict()

    # NOTE: `name` and `cfg` are overloaded, the user cannot point to
    #       a predefined config and amend it at the same time.  This
    #       might be ok for the session, but introduces a minor API
    #       inconsistency.
    name = None
    if isinstance(cfg, str):
        name = cfg
        cfg  = None

    cfg           = ru.Config('radical.pilot.pmgr', name=name, cfg=cfg)
    cfg.uid       = self._uid
    cfg.owner     = self._uid
    cfg.sid       = session.uid
    cfg.base      = session.base
    cfg.path      = session.path
    cfg.dburl     = session.dburl
    cfg.heartbeat = session.cfg.heartbeat

    rpu.Component.__init__(self, cfg, session=session)
    self.start()

    self._log.info('started pmgr %s', self._uid)
    self._rep.info('<<create pilot manager')

    # create pmgr bridges and components, use session cmgr for that
    self._cmgr = rpu.ComponentManager(self._cfg)
    self._cmgr.start_bridges()
    self._cmgr.start_components()

    # The output queue is used to forward submitted pilots to the
    # launching component.
    self.register_output(rps.PMGR_LAUNCHING_PENDING,
                         rpc.PMGR_LAUNCHING_QUEUE)

    # we also listen on the control pubsub, to learn about completed
    # staging directives
    self.register_subscriber(rpc.CONTROL_PUBSUB, self._staging_ack_cb)
    self._active_sds = dict()
    self._sds_lock   = ru.Lock('pmgr_sds_lock')

    # register the state notification pull cb and hb pull cb
    # FIXME: we may want to have the frequency configurable
    # FIXME: this should be a tailing cursor in the update worker
    self.register_timed_cb(self._state_pull_cb,
                           timer=self._cfg['db_poll_sleeptime'])
    self.register_timed_cb(self._pilot_heartbeat_cb,
                           timer=self._cfg['db_poll_sleeptime'])

    # also listen to the state pubsub for pilot state changes
    self.register_subscriber(rps.STATE_PUBSUB, self._state_sub_cb)

    # let session know we exist
    self._session._register_pmgr(self)

    self._prof.prof('setup_done', uid=self._uid)
    self._rep.ok('>>ok\n')
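# A minimal usage sketch for the public API (standard RADICAL-Pilot
# pattern; the pilot description values are placeholders, and depending
# on the RP release the description class is `ComputePilotDescription`
# or `PilotDescription`):
import radical.pilot as rp

session = rp.Session()
try:
    pmgr  = rp.PilotManager(session=session)
    pd    = rp.ComputePilotDescription({'resource': 'local.localhost',
                                        'cores'   : 2,
                                        'runtime' : 10})
    pilot = pmgr.submit_pilots(pd)
finally:
    session.close()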
def __init__(self, workload, resource, replicas=None):

    self._uid  = ru.generate_id('rx')
    self._prof = ru.Profiler('radical.repex')
    self._prof.prof('create', uid=self._uid)

    self._workload = ru.Config(cfg=workload)
    self._resource = ru.Config(cfg=resource)
    self._replicas = replicas

    # the replicas need to be aware about pre_exec directives
    self._workload.pre_exec = self._resource.pre_exec

    assert (self._workload.config.replicas or self._replicas)
    assert (self._workload.config.cycles)

    self._cycles   = self._workload.config.cycles
    self._waitlist = list()

    if self._replicas:
        self._workload.config.replicas = len(self._replicas)
    else:
        self._replicas = [Replica(workload=self._workload)
                          for _ in range(self._workload.config.replicas)]

    self._pre_alg = prepare_algs.get(self._workload.prepare.algorithm)
    self._sel_alg = selection_algs.get(self._workload.selection.algorithm)
    self._exc_alg = exchange_algs.get(self._workload.exchange.algorithm)

    # if the configured algorithms are not known (not hard-coded in RX),
    # then assume they point to user specified files and load them
    if not self._pre_alg:
        filename, funcname = self._workload.prepare.algorithm.split(':')
        syms = ru.import_file(filename)
        self._pre_alg = syms['functions'][funcname]

    if not self._sel_alg:
        filename, funcname = self._workload.selection.algorithm.split(':')
        syms = ru.import_file(filename)
        self._sel_alg = syms['functions'][funcname]

    if not self._exc_alg:
        filename, funcname = self._workload.exchange.algorithm.split(':')
        syms = ru.import_file(filename)
        self._exc_alg = syms['functions'][funcname]

    assert (self._pre_alg), 'preparation algorithm missing'
    assert (self._sel_alg), 'selection algorithm missing'
    assert (self._exc_alg), 'exchange algorithm missing'

    rmq_host = str(self._resource.get('rmq_host', 'localhost'))
    rmq_port = int(self._resource.get('rmq_port', '5672'))
    rmq_user = str(self._resource.get('rmq_user', 'guest'))
    rmq_pass = str(self._resource.get('rmq_pass', 'guest'))

    re.AppManager.__init__(self, autoterminate=True,
                                 hostname=rmq_host, port=rmq_port,
                                 username=rmq_user, password=rmq_pass)

    for r in self._replicas:
        r._initialize(check_ex=self._check_exchange,
                      check_res=self._check_resume,
                      sid=self.sid, prof=self._prof)

    self._lock = ru.Lock(name='rx')

    rd = copy.deepcopy(self._resource)
    if 'rmq_host' in rd: del rd['rmq_host']
    if 'rmq_port' in rd: del rd['rmq_port']
    if 'pre_exec' in rd: del rd['pre_exec']

    self.resource_desc = rd

    self._log  = ru.Logger('radical.repex')
    self._dout = open('dump.log', 'a')

    self._dump(msg='startup')

    # run the replica pipelines
    self.workflow = set(self._replicas)
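# A sketch of the workload and resource configurations this constructor
# reads (assumption: key names follow the attribute accesses above; the
# algorithm names and values are placeholders, not names registered in
# `prepare_algs` / `selection_algs` / `exchange_algs`):
workload = {
    'config'   : {'replicas': 16,                    # or pass `replicas=`
                  'cycles'  : 10},
    'prepare'  : {'algorithm': 'prep_default'},      # or 'file.py:funcname'
    'selection': {'algorithm': 'select_random'},
    'exchange' : {'algorithm': 'exchange_random'},
}
resource = {
    'rmq_host': 'localhost',
    'rmq_port': 5672,
    'pre_exec': ['module load gromacs'],             # placeholder
}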