def _assert_profiler(key, val, res):

    try:
        os.environ[key] = val

        pname = 'ru.%d' % os.getpid()
        fname = '/tmp/%s.prof' % pname
        prof  = ru.Profiler(name=pname, ns='radical.utils.test', path='/tmp/')

        prof.prof('foo')

        assert res == os.path.isfile(fname)
        assert res == _cmd('grep -e "^[0-9\\.]*,foo,%s," %s' % (pname, fname))

    finally:
        try   : del os.environ[key]
        except: pass
        try   : os.unlink(fname)
        except: pass
def test_profiler():
    '''
    create and check profile timestamps
    '''
    pname = 'ru.%d' % os.getpid()
    fname = '/tmp/%s.prof' % pname
    now   = time.time()

    try:
        os.environ['RADICAL_PROFILE'] = 'True'

        prof = ru.Profiler(name=pname, ns='radical.utils', path='/tmp/')
        prof.prof('foo')
        prof.prof('bar', uid='baz')
        prof.prof('buz', ts=now)

        assert os.path.isfile(fname)

        def _grep(pat):
            return _cmd('grep -e "%s" %s' % (pat, fname))

        assert _grep('^[0-9\\.]*,foo,%s,MainThread,,,$'  % pname)
        assert _grep('^[0-9\\.]*,bar,%s,MainThread,baz,,$' % pname)
        assert _grep('^%.7f,buz,%s,MainThread,,,$' % (now, pname))

    finally:
        try   : del os.environ['RADICAL_PROFILE']
        except: pass
        try   : os.unlink(fname)
        except: pass
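# A minimal usage sketch distilled from the two tests above: profiles are
# written only while RADICAL_PROFILE is set, they land in '<path>/<name>.prof',
# and each `prof()` call appends one CSV row starting with
# 'timestamp,event,name,thread,uid,...' (the grep patterns above match the
# trailing fields empty).  The 'my.app' name and '/tmp/' path are arbitrary
# illustration values, not part of the original code.

import os
import radical.utils as ru

os.environ['RADICAL_PROFILE'] = 'True'     # profiling is off unless this is set

prof = ru.Profiler(name='my.app', ns='radical.utils', path='/tmp/')
prof.prof('app_start')                     # plain event
prof.prof('task_done', uid='task.0001')    # event correlated to a uid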
def __init__(self, log=None, rep=None, prof=None):

    if log : self._log  = log
    else   : self._log  = ru.Logger('radical.nge')

    if rep : self._rep  = rep
    else   : self._rep  = ru.Reporter('radical.nge')

    if prof: self._prof = prof
    else   : self._prof = ru.Profiler('radical.nge')

    self._session = rp.Session()
    self._pmgr    = rp.PilotManager(self._session)
    self._umgr    = rp.UnitManager(self._session)

    self._pmgr.register_callback(self._pilot_state_cb)
    self._umgr.register_callback(self._unit_state_cb)

    # create a dir for data staging
    self._pwd  = os.getcwd()
    self._data = 'data.%s' % self._session.uid
    os.makedirs('%s/%s/' % (self._pwd, self._data))

    # track submitted tasks
    self._tcnt  = 0
    self._tasks = dict()
def __init__(self, number_of_replicas, gibbs_steps, thermodynamic_states,
             systems=None, workflow=None, cores=32, ligand=False,
             full=False):

    self.number_of_replicas = number_of_replicas
    self.n_gibbs_steps      = gibbs_steps
    self.thermo_state       = thermodynamic_states
    self.ligand             = '-ligands' if ligand else ''
    self.step_count         = _full_steps if full else _reduced_steps
    self.systems            = systems if systems is not None else list()
    self.cores              = cores
    self._id                = uuid.uuid1()  # generate id

    # self.workflow = workflow or ['gen_replicas', 'repex', 'rotation',
    #                              'translation', 'propagation']
    self.workflow = workflow or list(range(0, 5))  # null workflow

    # Profiler for TIES PoE
    self._uid    = ru.generate_id('radical.yank.yank-repex')
    self._logger = ru.get_logger('radical.yank.yank-repex')
    self._prof   = ru.Profiler(name=self._uid)
    self._prof.prof('create yank-repex instance', uid=self._uid)
def __init__(self, resource='local', comm_server=None):
    """The workhorse of high throughput binding affinity calculations.

    Manages an arbitrary number of protocols on any resource (including
    supercomputers).

    Parameters
    ----------
    resource : str
        The name of the resource where the protocols will be run.  This is
        usually the name of the supercomputer, or 'local' if the job will
        be executed locally.  (The default is to try to run locally.)
    comm_server : tuple(str, int)
        The communication server used by the execution system.  Specify
        a hostname and port number as a tuple.  If None, the dedicated
        server from the resource description is used, if present.
    """
    self.resource = yaml.load(
        resource_stream(__name__, 'resources.yaml'))[resource]

    if comm_server is None:
        comm_server = self.resource.get('dedicated_rabbitmq_server')

    self._protocols   = list()
    self._app_manager = AppManager(*comm_server)

    # Profiler for Runner
    self._uid    = ru.generate_id('radical.htbac.workflow_runner')
    self._logger = ru.get_logger('radical.htbac.workflow_runner')
    self._prof   = ru.Profiler(name=self._uid)
    self._prof.prof('create workflow_runner obj', uid=self._uid)

    self._root_directories = list()
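# A hedged construction sketch for the runner above.  The class name `Runner`
# and the 'bw_aprun' resource key are assumptions for illustration; the
# snippet only guarantees that `resource` indexes into resources.yaml and
# that `comm_server` is unpacked into `AppManager(*comm_server)`.
#
#   ht = Runner()                                   # run locally
#   ht = Runner(resource='bw_aprun',
#               comm_server=('my.rmq.host', 5672))  # explicit RabbitMQ endpoint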
def __init__(self, resource_desc, sid, rts, rts_config):

    if not isinstance(resource_desc, dict):
        raise TypeError(expected_type=dict, actual_type=type(resource_desc))

    self._resource_desc = resource_desc
    self._sid           = sid
    self._rts           = rts
    self._rts_config    = rts_config

    # Resource reservation related parameters
    self._resource      = None
    self._walltime      = None
    self._cpus          = 1
    self._gpus          = 0
    self._project       = None
    self._access_schema = None
    self._queue         = None
    self._validated     = False

    # Utility parameters
    self._uid  = ru.generate_id('resource_manager.%(counter)04d',
                                ru.ID_CUSTOM)
    self._path = os.getcwd() + '/' + self._sid

    name = 'radical.entk.%s' % self._uid
    self._logger = ru.Logger(name, path=self._path)
    self._prof   = ru.Profiler(name, path=self._path)

    self._shared_data = list()
    self._outputs     = None
def __init__(self, sid, workflow, pending_queue, completed_queue,
             resubmit_failed, rmq_conn_params):

    # Mandatory arguments
    self._sid             = sid
    self._pending_queue   = pending_queue
    self._completed_queue = completed_queue
    self._resubmit_failed = resubmit_failed
    self._rmq_conn_params = rmq_conn_params

    # Assign validated workflow
    self._workflow = workflow

    # Create logger and profiler at their specific locations using the sid
    self._path = os.getcwd() + '/' + self._sid
    self._uid  = ru.generate_id('wfprocessor.%(item_counter)04d',
                                ru.ID_CUSTOM, ns=self._sid)

    name = 'radical.entk.%s' % self._uid
    self._logger = ru.Logger(name, path=self._path)
    self._prof   = ru.Profiler(name, path=self._path)
    self._report = ru.Reporter(name)

    # Defaults
    self._wfp_process    = None
    self._enqueue_thread = None
    self._dequeue_thread = None

    self._rmq_ping_interval = int(os.getenv('RMQ_PING_INTERVAL', 10))

    self._logger.info('Created WFProcessor object: %s' % self._uid)
    self._prof.prof('create_wfp', uid=self._uid)
def __init__(self, config_path=None, hostname=None, port=None,
             reattempts=None, resubmit_failed=None, autoterminate=None,
             write_workflow=None, rts=None, rmq_cleanup=None,
             rts_config=None, name=None):

    # Create a session for each EnTK script execution
    if name:
        self._name = name
        self._sid  = name
    else:
        self._name = str()
        self._sid  = ru.generate_id('re.session', ru.ID_PRIVATE)

    self._read_config(config_path, hostname, port, reattempts,
                      resubmit_failed, autoterminate, write_workflow,
                      rts, rmq_cleanup, rts_config)

    # Create a uid + logger + profiler for the AppManager, under the sid
    # namespace
    path = os.getcwd() + '/' + self._sid
    self._uid = ru.generate_id('appmanager.%(item_counter)04d',
                               ru.ID_CUSTOM, namespace=self._sid)

    self._logger = ru.Logger('radical.entk.%s' % self._uid, path=path,
                             targets=['2', '.'])
    self._prof   = ru.Profiler(name='radical.entk.%s' % self._uid, path=path)
    self._report = ru.Reporter(name='radical.entk.%s' % self._uid)

    self._report.info('EnTK session: %s\n' % self._sid)
    self._prof.prof('create amgr obj', uid=self._uid)
    self._report.info('Creating AppManager')

    self._resource_manager = None

    # RabbitMQ Queues
    self._pending_queue   = list()
    self._completed_queue = list()

    # Global parameters with default values
    self._mqs_setup     = False
    self._resource_desc = None
    self._task_manager  = None
    self._workflow      = None
    self._cur_attempt   = 1
    self._shared_data   = list()

    self._rmq_ping_interval = int(os.getenv('RMQ_PING_INTERVAL', 10))

    self._logger.info('Application Manager initialized')
    self._prof.prof('amgr obj created', uid=self._uid)
    self._report.ok('>>ok\n')
def __init__(self, sid, pending_queue, completed_queue, rmgr,
             rmq_conn_params, rts):

    if not isinstance(sid, str):
        raise TypeError(expected_type=str, actual_type=type(sid))

    if not isinstance(pending_queue, list):
        raise TypeError(expected_type=list, actual_type=type(pending_queue))

    if not isinstance(completed_queue, list):
        raise TypeError(expected_type=list,
                        actual_type=type(completed_queue))

    if not isinstance(rmgr, Base_ResourceManager):
        raise TypeError(expected_type=Base_ResourceManager,
                        actual_type=type(rmgr))

    if not isinstance(rmq_conn_params, pika.connection.ConnectionParameters):
        raise TypeError(expected_type=pika.connection.ConnectionParameters,
                        actual_type=type(rmq_conn_params))

    self._sid             = sid
    self._pending_queue   = pending_queue
    self._completed_queue = completed_queue
    self._rmgr            = rmgr
    self._rts             = rts
    self._rmq_conn_params = rmq_conn_params

    # Utility parameters
    self._uid  = ru.generate_id('task_manager.%(counter)04d', ru.ID_CUSTOM)
    self._path = os.getcwd() + '/' + self._sid

    name = 'radical.entk.%s' % self._uid
    self._log  = ru.Logger(name, path=self._path)
    self._prof = ru.Profiler(name, path=self._path)
    self._dh   = ru.DebugHelper(name=name)

    # Threads should run until a terminate condition is encountered
    mq_connection = pika.BlockingConnection(rmq_conn_params)

    self._hb_request_q  = '%s-hb-request'  % self._sid
    self._hb_response_q = '%s-hb-response' % self._sid

    mq_channel = mq_connection.channel()

    # To respond to heartbeats - receive requests on the request queue,
    # answer on the response queue
    mq_channel.queue_delete(queue=self._hb_response_q)
    mq_channel.queue_declare(queue=self._hb_response_q)

    mq_channel.queue_delete(queue=self._hb_request_q)
    mq_channel.queue_declare(queue=self._hb_request_q)

    self._tmgr_process = None
    self._hb_thread    = None
    self._hb_interval  = int(os.getenv('ENTK_HB_INTERVAL', 30))

    mq_connection.close()
def __init__(self, cfg):

    self._cfg  = ru.Config('radical.pilot.cmgr', cfg=cfg)
    self._sid  = self._cfg.sid
    self._uid  = ru.generate_id('cmgr', ns=self._sid)
    self._uids = [self._uid]  # uids to track heartbeats for (incl. own)

    self._prof = ru.Profiler(self._uid, ns='radical.pilot',
                             path=self._cfg.path)
    self._log  = ru.Logger(self._uid, ns='radical.pilot',
                           path=self._cfg.path)

    self._prof.prof('init2', uid=self._uid, msg=self._cfg.path)

    # Every ComponentManager runs a HB pubsub bridge in a separate thread.
    # That HB channel should be used by all components and bridges created
    # under this CMGR.
    bcfg = ru.Config(cfg={'channel'  : 'heartbeat',
                          'type'     : 'pubsub',
                          'uid'      : self._uid + '.hb',
                          'stall_hwm': 1,
                          'bulk_size': 0,
                          'path'     : self._cfg.path})
    self._hb_bridge = ru.zmq.PubSub(bcfg)
    self._hb_bridge.start()

    self._cfg.heartbeat.addr_pub = str(self._hb_bridge.addr_pub)
    self._cfg.heartbeat.addr_sub = str(self._hb_bridge.addr_sub)

    # run a HB monitor on that channel
    self._hb = ru.Heartbeat(uid=self.uid,
                            timeout=self._cfg.heartbeat.timeout,
                            interval=self._cfg.heartbeat.interval,
                            beat_cb=self._hb_beat_cb,  # on every heartbeat
                            term_cb=self._hb_term_cb,  # on termination
                            log=self._log)

    self._hb_pub = ru.zmq.Publisher('heartbeat',
                                    self._cfg.heartbeat.addr_pub,
                                    log=self._log, prof=self._prof)
    self._hb_sub = ru.zmq.Subscriber('heartbeat',
                                     self._cfg.heartbeat.addr_sub,
                                     topic='heartbeat', cb=self._hb_sub_cb,
                                     log=self._log, prof=self._prof)

    # confirm the bridge is usable by listening to our own heartbeat
    self._hb.start()
    self._hb.wait_startup(self._uid, self._cfg.heartbeat.timeout)
    self._log.info('heartbeat system up')
def __init__(self, cfg, session):

    self._cfg         = cfg
    self._pid         = cfg.pid
    self._pmgr        = cfg.pmgr
    self._pwd         = cfg.pilot_sandbox
    self._session     = session
    self._log         = session._log
    self._starttime   = time.time()
    self._final_cause = None

    # this is the earliest point to sync bootstrap and agent profiles
    prof = ru.Profiler(ns='radical.pilot', name='agent.0')
    prof.prof('sync_rel', uid=cfg.pid, msg='agent.0')
    prof.prof('hostname', uid=cfg.pid, msg=ru.get_hostname())

    # connect to MongoDB for state push/pull
    self._connect_db()

    # configure the ResourceManager before component startup, as components
    # need ResourceManager information to function (scheduler, executor)
    self._configure_rm()

    # ensure that app communication channels are visible to the workload
    self._configure_app_comm()

    # expose the heartbeat channel to sub-agents, bridges and components,
    # and start those
    self._cmgr = rpu.ComponentManager(self._cfg)
    self._cfg.heartbeat = self._cmgr.cfg.heartbeat

    self._cmgr.start_bridges()
    self._cmgr.start_components()

    # create the sub-agent configs and start the sub-agents
    self._write_sa_configs()
    self._start_sub_agents()  # TODO: move to cmgr?

    # at this point the session is up and connected, and it should have
    # brought up all communication bridges and components.  We are ready
    # to rumble!
    rpu.Worker.__init__(self, self._cfg, session)

    # run our own slow-paced heartbeat monitor to watch pmgr heartbeats
    self._hb = ru.Heartbeat(uid=self._pid,
                            timeout=10.0,  # FIXME: configurable
                            interval=1.0,  # FIXME: configurable
                            beat_cb=self._hb_check,  # no own heartbeat (pmgr pulls)
                            term_cb=self._hb_term_cb,
                            log=self._log)
    self._hb.start()

    # register pmgr heartbeat
    self._log.info('hb init for %s', self._pmgr)
    self._hb.beat(uid=self._pmgr)
def test_wfp_workflow_incomplete():

    p = Pipeline()
    s = Stage()
    t = Task()
    t.executable = ['/bin/date']
    s.add_tasks(t)
    p.add_stages(s)

    amgr = Amgr(hostname=hostname, port=port)
    amgr._setup_mqs()

    wfp = WFprocessor(sid=amgr._sid,
                      workflow=[p],
                      pending_queue=amgr._pending_queue,
                      completed_queue=amgr._completed_queue,
                      mq_hostname=amgr._mq_hostname,
                      port=amgr._port,
                      resubmit_failed=False)
    wfp._initialize_workflow()

    assert wfp.workflow_incomplete()

    amgr.workflow = [p]
    profiler = ru.Profiler(name='radical.entk.temp')

    p.stages[0].state = states.SCHEDULING
    p.state           = states.SCHEDULED

    import json
    import pika

    for t in p.stages[0].tasks:
        t.state = states.COMPLETED

        task_as_dict  = json.dumps(t.to_dict())
        mq_connection = pika.BlockingConnection(
            pika.ConnectionParameters(host=amgr._mq_hostname,
                                      port=amgr._port))
        mq_channel = mq_connection.channel()

        mq_channel.basic_publish(exchange='',
                                 routing_key='%s-completedq-1' % amgr._sid,
                                 body=task_as_dict)

    amgr._terminate_sync = Event()
    sync_thread = Thread(target=amgr._synchronizer,
                         name='synchronizer-thread')
    sync_thread.start()

    proc = Process(target=func_for_dequeue_test, name='temp-proc',
                   args=(wfp,))
    proc.start()
    proc.join()

    amgr._terminate_sync.set()
    sync_thread.join()

    assert not wfp.workflow_incomplete()
def _get_profiler(self, name):
    """
    This is a thin wrapper around `ru.Profiler()` which makes sure that
    profiles end up in a separate directory named after `session.uid`.
    """
    prof = ru.Profiler(name=name, ns='radical.pilot', path=self._logdir)

    return prof
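# Usage sketch for the session-scoped factory above (the component names
# 'scheduler' and 'executor' are illustrative): every profiler obtained this
# way shares the session's log directory, so all profiles of one run stay
# together under the `session.uid` directory.
#
#   prof_sched = session._get_profiler('scheduler')  # -> <logdir>/scheduler.prof
#   prof_exec  = session._get_profiler('executor')   # -> <logdir>/executor.prof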
def __init__(self):

    self.book         = []  # bookkeeping: a record of all MD tasks carried out
    self.md_task_list = []
    self.ex_task_list = []

    self._uid    = ru.generate_id('radical.repex.syncex')
    self._logger = ru.get_logger('radical.repex.syncex')
    self._prof   = ru.Profiler(name=self._uid)
    self._prof.prof('Initinit', uid=self._uid)
def test_amgr_synchronizer():

    logger   = ru.get_logger('radical.entk.temp_logger')
    profiler = ru.Profiler(name='radical.entk.temp')
    amgr     = Amgr(hostname=hostname, port=port)

    mq_connection = pika.BlockingConnection(
        pika.ConnectionParameters(host=hostname, port=port))
    mq_channel = mq_connection.channel()

    amgr._setup_mqs()

    p = Pipeline()
    s = Stage()

    # Create and add 100 tasks to the stage
    for cnt in range(100):
        t = Task()
        t.executable = ['some-executable-%s' % cnt]
        s.add_tasks(t)

    p.add_stages(s)
    p._assign_uid(amgr._sid)
    p._validate()

    amgr.workflow = [p]

    for t in p.stages[0].tasks:
        assert t.state == states.INITIAL

    assert p.stages[0].state == states.INITIAL
    assert p.state == states.INITIAL

    # Start the synchronizer method in a thread
    amgr._terminate_sync = Event()
    sync_thread = Thread(target=amgr._synchronizer,
                         name='synchronizer-thread')
    sync_thread.start()

    # Run the synchronizer test function in a separate process
    proc = Process(target=func_for_synchronizer_test, name='temp-proc',
                   args=(amgr._sid, p, logger, profiler))
    proc.start()
    proc.join()

    for t in p.stages[0].tasks:
        assert t.state == states.SCHEDULING

    assert p.stages[0].state == states.SCHEDULING
    assert p.state == states.SCHEDULING

    amgr._terminate_sync.set()
    sync_thread.join()
def test_wfp_enqueue():

    p = Pipeline()
    s = Stage()
    t = Task()
    t.executable = ['/bin/date']
    s.add_tasks(t)
    p.add_stages(s)

    amgr = Amgr(hostname=hostname, port=port)
    amgr._setup_mqs()

    wfp = WFprocessor(sid=amgr._sid,
                      workflow=[p],
                      pending_queue=amgr._pending_queue,
                      completed_queue=amgr._completed_queue,
                      mq_hostname=amgr._mq_hostname,
                      port=amgr._port,
                      resubmit_failed=False)
    wfp._initialize_workflow()

    amgr.workflow = [p]
    profiler = ru.Profiler(name='radical.entk.temp')

    for t in p.stages[0].tasks:
        assert t.state == states.INITIAL

    assert p.stages[0].state == states.INITIAL
    assert p.state == states.INITIAL

    amgr._terminate_sync = Event()
    sync_thread = Thread(target=amgr._synchronizer,
                         name='synchronizer-thread')
    sync_thread.start()

    proc = Process(target=func_for_enqueue_test, name='temp-proc',
                   args=(wfp,))
    proc.start()
    proc.join()

    amgr._terminate_sync.set()
    sync_thread.join()

    for t in p.stages[0].tasks:
        assert t.state == states.SCHEDULED

    assert p.stages[0].state == states.SCHEDULED
    assert p.state == states.SCHEDULING
def __init__(self):

    self._cores         = 0
    self._protocols     = list()
    self._hostname      = None
    self._port          = None
    self.ids            = None
    self.app_manager    = None
    self.total_replicas = 0

    # Profiler for Runner
    self._uid    = ru.generate_id('radical.yank.workflow_runner')
    self._logger = ru.get_logger('radical.yank.workflow_runner')
    self._prof   = ru.Profiler(name=self._uid)
    self._prof.prof('create workflow_runner obj', uid=self._uid)

    self._root_directories = list()
    self.ids               = dict()
def __init__(self):
    '''
    initialize the service endpoint:

      - create logger, profiler and reporter
      - set up accounts
    '''

    self._log  = ru.Logger('radical.nge.service')
    self._rep  = ru.Reporter('radical.nge.service')
    self._prof = ru.Profiler('radical.nge.service')

    self._accounts = {'andre' : _Account('andre',  'erdna'),
                      'matteo': _Account('matteo', 'eottam'),
                      'daniel': _Account('daniel', 'leinad'),
                      'guest' : _Account('guest',  'guest')}

    self._rep.header('--- NGE (%s) ---' % rn.version)
def func_for_dequeue_test(wfp):

    wfp._dequeue_thread_terminate = Event()

    p        = wfp._workflow[0]
    profiler = ru.Profiler(name='radical.entk.temp')

    thread = Thread(target=wfp._dequeue, args=(profiler,))
    thread.start()

    flag = False
    while True:
        if p.state == states.DONE and p.stages[0].state == states.DONE:
            for t in p.stages[0].tasks:
                if t.state == states.DONE:
                    flag = True
        if flag:
            break

    wfp._dequeue_thread_terminate.set()
    thread.join()
def __init__(self, sid=None):

    self._workflows = list()  # a list of workflow IDs

    # This will be a hash table of workflows.  The table will include the
    # following:
    # 'workflowsID': {'state'     : the state of the workflow based on
    #                               the WMF,
    #                 'endpoint'  : process ID or object of the WMF for
    #                               the specific workflow,
    #                 'start_time': epoch when the workflow was submitted
    #                               to the WMF,
    #                 'end_time'  : epoch when the workflow finished}
    self._execution_status = dict()

    self._uid = ru.generate_id('enactor.%(counter)04d', mode=ru.ID_CUSTOM,
                               ns=sid)

    path = os.getcwd() + '/' + sid
    name = self._uid

    self._logger = ru.Logger(name=self._uid, path=path, level='DEBUG')
    self._prof   = ru.Profiler(name=name, path=path)
def __init__(self, url, log=None, rep=None, prof=None):

    if log : self._log  = log
    else   : self._log  = ru.Logger('radical.nge')

    if rep : self._rep  = rep
    else   : self._rep  = ru.Reporter('radical.nge')

    if prof: self._prof = prof
    else   : self._prof = ru.Profiler('radical.nge')

    self._cookies = list()
    self._url     = ru.Url(url)

    self._qbase = ru.Url(url)
  # self._qbase.username = None
  # self._qbase.password = None
    self._qbase = str(self._qbase).rstrip('/')

    if self._url.username and self._url.password:
        self.login(self._url.username, self._url.password)
def __init__(self, workload, properties=None):

    self._workload  = workload
    self._check_ex  = None
    self._check_res = None

    if not properties:
        properties = dict()

    self._rid = ru.generate_id('rep.%(counter)04d', ru.ID_CUSTOM)

    # this is inefficient at scale...
    self._prof = ru.Profiler('radical.entk')
    self._prof.prof('create', uid=self._rid)

    self._props   = properties
    self._cycle   = -1    # increased when adding an md stage
    self._ex_list = None  # list of replicas used in exchange step

    re.Pipeline.__init__(self)

    self.name = 'p.%s' % self.rid
    self._log = ru.Logger('radical.repex')
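# The "inefficient at scale" note above refers to each Replica opening its
# own `ru.Profiler` handle.  The Exchange class further below avoids this by
# handing one shared profiler to every replica via
# `r._initialize(..., prof=self._prof)`.  A hedged sketch of that sharing
# pattern; `n_replicas`, `sid` and the callback placeholders are illustrative:
#
#   shared_prof = ru.Profiler('radical.repex')
#   replicas    = [Replica(workload=workload) for _ in range(n_replicas)]
#   for r in replicas:
#       r._initialize(check_ex=..., check_res=..., sid=sid, prof=shared_prof)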
def master(obj, obj_type, new_state):

    hostname = os.environ.get('RMQ_HOSTNAME', 'localhost')
    port     = int(os.environ.get('RMQ_PORT', 5672))

    mq_connection = pika.BlockingConnection(
        pika.ConnectionParameters(host=hostname, port=port))
    mq_channel = mq_connection.channel()

    queue1 = 'test-1-2-3'  # expected queue name structure 'X-A-B-C'
    queue2 = 'test-3-2-1'  # expected queue name structure 'X-C-B-A'
    mq_channel.queue_declare(queue=queue1)
    mq_channel.queue_declare(queue=queue2)

    logger   = ru.Logger('radical.entk.test')
    profiler = ru.Profiler('radical.entk.test')

    thread1 = Thread(target=func,
                     args=(obj, obj_type, new_state, queue1, logger,
                           profiler))
    thread1.start()

    while True:

        method_frame, props, body = mq_channel.basic_get(queue=queue1)

        if body:
            msg = json.loads(body)
            assert msg['object']['state'] == new_state

            mq_channel.basic_publish(
                exchange='',
                routing_key=queue2,
                properties=pika.BasicProperties(
                    correlation_id=props.correlation_id),
                body='ack')
            mq_channel.basic_ack(delivery_tag=method_frame.delivery_tag)
            break

    mq_channel.queue_delete(queue=queue1)
    mq_channel.queue_delete(queue=queue2)
    mq_connection.close()

    thread1.join()
res_dict = {
    "resource"      : Resource,
    "walltime"      : 30,
    "cpus"          : Pilot_Cores,
    "gpus_per_node" : 0,
    "access_schema" : 'gsissh',
  # "queue"         : 'debug',
    "queue"         : 'workq',
    "project"       : 'TG-MCB090174',
  # "project"       : 'bamm',
}

uid1   = ru.generate_id('radical.repex.run')
logger = ru.get_logger('radical.repex.run')
prof   = ru.Profiler(name=uid1)

prof.prof('Create_Workflow_0', uid=uid1)

synchronousExchange = SynchronousExchange()

# Create the Application Manager
appman = AppManager(autoterminate=False, port=33215)

# Assign the resource description to the Application Manager
appman.resource_desc = res_dict

Exchange = synchronousExchange.InitCycle(Replicas, Replica_Cores,
                                         MD_Executable, ExchangeMethod,
                                         timesteps)

# Assign the workflow as a set of Pipelines to the Application Manager
appman.workflow = set([Exchange])
def __init__(self, sid, pending_queue, completed_queue, rmgr,
             mq_hostname, port, rts):

    if isinstance(sid, str):
        self._sid = sid
    else:
        raise TypeError(expected_type=str, actual_type=type(sid))

    if isinstance(pending_queue, list):
        self._pending_queue = pending_queue
    else:
        raise TypeError(expected_type=list, actual_type=type(pending_queue))

    if isinstance(completed_queue, list):
        self._completed_queue = completed_queue
    else:
        raise TypeError(expected_type=list,
                        actual_type=type(completed_queue))

    if isinstance(mq_hostname, str):
        self._mq_hostname = mq_hostname
    else:
        raise TypeError(expected_type=str, actual_type=type(mq_hostname))

    if isinstance(port, int):
        self._port = port
    else:
        raise TypeError(expected_type=int, actual_type=type(port))

    if isinstance(rmgr, Base_ResourceManager):
        self._rmgr = rmgr
    else:
        raise TypeError(expected_type=Base_ResourceManager,
                        actual_type=type(rmgr))

    self._rts = rts

    # Utility parameters
    self._uid  = ru.generate_id('task_manager.%(item_counter)04d',
                                ru.ID_CUSTOM, namespace=self._sid)
    self._path = os.getcwd() + '/' + self._sid

    self._logger = ru.Logger('radical.entk.%s' % self._uid,
                             path=self._path, targets=['2', '.'])
    self._prof   = ru.Profiler(name='radical.entk.%s' % self._uid + '-obj',
                               path=self._path)

    # Threads should run until a terminate condition is encountered
    mq_connection = pika.BlockingConnection(
        pika.ConnectionParameters(host=mq_hostname, port=port))

    self._hb_request_q  = '%s-hb-request'  % self._sid
    self._hb_response_q = '%s-hb-response' % self._sid

    mq_channel = mq_connection.channel()

    # To respond to heartbeats - receive requests on the request queue,
    # answer on the response queue
    mq_channel.queue_delete(queue=self._hb_response_q)
    mq_channel.queue_declare(queue=self._hb_response_q)

    mq_channel.queue_delete(queue=self._hb_request_q)
    mq_channel.queue_declare(queue=self._hb_request_q)

    self._tmgr_process = None
    self._hb_thread    = None
    self._hb_interval  = int(os.getenv('ENTK_HB_INTERVAL', 30))

    mq_connection.close()
def __init__(self, workload, resource, replicas=None):

    self._uid  = ru.generate_id('rx')
    self._prof = ru.Profiler('radical.repex')
    self._prof.prof('create', uid=self._uid)

    self._workload = ru.Config(cfg=workload)
    self._resource = ru.Config(cfg=resource)
    self._replicas = replicas

    # the replicas need to be aware of pre_exec directives
    self._workload.pre_exec = self._resource.pre_exec

    assert self._workload.config.replicas or self._replicas
    assert self._workload.config.cycles

    self._cycles   = self._workload.config.cycles
    self._waitlist = list()

    if self._replicas:
        self._workload.config.replicas = len(self._replicas)
    else:
        self._replicas = [Replica(workload=self._workload)
                          for _ in range(self._workload.config.replicas)]

    self._pre_alg = prepare_algs.get(self._workload.prepare.algorithm)
    self._sel_alg = selection_algs.get(self._workload.selection.algorithm)
    self._exc_alg = exchange_algs.get(self._workload.exchange.algorithm)

    # if the configured algorithms are not known (not hard-coded in RX),
    # then assume they point to user specified files and load them
    if not self._pre_alg:
        filename, funcname = self._workload.prepare.algorithm.split(':')
        syms = ru.import_file(filename)
        self._pre_alg = syms['functions'][funcname]

    if not self._sel_alg:
        filename, funcname = self._workload.selection.algorithm.split(':')
        syms = ru.import_file(filename)
        self._sel_alg = syms['functions'][funcname]

    if not self._exc_alg:
        filename, funcname = self._workload.exchange.algorithm.split(':')
        syms = ru.import_file(filename)
        self._exc_alg = syms['functions'][funcname]

    assert self._pre_alg, 'preparation algorithm missing'
    assert self._sel_alg, 'selection algorithm missing'
    assert self._exc_alg, 'exchange algorithm missing'

    rmq_host = str(self._resource.get('rmq_host', 'localhost'))
    rmq_port = int(self._resource.get('rmq_port', '5672'))
    rmq_user = str(self._resource.get('rmq_user', 'guest'))
    rmq_pass = str(self._resource.get('rmq_pass', 'guest'))

    re.AppManager.__init__(self, autoterminate=True,
                           hostname=rmq_host, port=rmq_port,
                           username=rmq_user, password=rmq_pass)

    for r in self._replicas:
        r._initialize(check_ex=self._check_exchange,
                      check_res=self._check_resume,
                      sid=self.sid, prof=self._prof)

    self._lock = ru.Lock(name='rx')

    rd = copy.deepcopy(self._resource)
    if 'rmq_host' in rd: del rd['rmq_host']
    if 'rmq_port' in rd: del rd['rmq_port']
    if 'pre_exec' in rd: del rd['pre_exec']

    self.resource_desc = rd

    self._log  = ru.Logger('radical.repex')
    self._dout = open('dump.log', 'a')

    self._dump(msg='startup')

    # run the replica pipelines
    self.workflow = set(self._replicas)
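# The `ru.import_file` hook above lets users plug in their own algorithms via
# 'filename:funcname' specs.  A hedged sketch of a matching plugin file; the
# file name 'my_algs.py' and the function signature are made up, while the
# 'functions' key is what the loader above indexes into:
#
#   # my_algs.py
#   def select_waiting(waitlist):
#       '''trivial selection: exchange whatever currently waits'''
#       return waitlist
#
# and in the workload configuration:
#
#   'selection': {'algorithm': 'my_algs.py:select_waiting'}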
def test_amgr_initialization():

    amgr_name = ru.generate_id('test.appmanager.%(item_counter)04d',
                               ru.ID_CUSTOM)
    amgr = Amgr(hostname=hostname, port=port, name=amgr_name)

    assert amgr._name.split('.') == amgr_name.split('.')
    assert amgr._sid.split('.')  == amgr_name.split('.')
    assert amgr._uid.split('.')  == ['appmanager', '0000']
    assert type(amgr._logger) == type(ru.get_logger('radical.tests'))
    assert type(amgr._prof)   == type(ru.Profiler('radical.tests'))
    assert type(amgr._report) == type(ru.Reporter('radical.tests'))
    assert isinstance(amgr.name, str)

    # RabbitMQ inits
    assert amgr._mq_hostname == hostname
    assert amgr._port        == port

    # RabbitMQ Queues
    assert amgr._num_pending_qs   == 1
    assert amgr._num_completed_qs == 1
    assert isinstance(amgr._pending_queue, list)
    assert isinstance(amgr._completed_queue, list)

    # Global parameters have default values
    assert amgr._mqs_setup       == False
    assert amgr._resource_desc   == None
    assert amgr._task_manager    == None
    assert amgr._workflow        == None
    assert amgr._resubmit_failed == False
    assert amgr._reattempts      == 3
    assert amgr._cur_attempt     == 1
    assert amgr._autoterminate   == True
    assert isinstance(amgr.shared_data, list)

    # the same defaults should hold for an unnamed AppManager
    amgr = Amgr(hostname=hostname, port=port)

    assert amgr._uid.split('.') == ['appmanager', '0000']
    assert type(amgr._logger) == type(ru.get_logger('radical.tests'))
    assert type(amgr._prof)   == type(ru.Profiler('radical.tests'))
    assert type(amgr._report) == type(ru.Reporter('radical.tests'))
    assert isinstance(amgr.name, str)

    # RabbitMQ inits
    assert amgr._mq_hostname == hostname
    assert amgr._port        == port

    # RabbitMQ Queues
    assert amgr._num_pending_qs   == 1
    assert amgr._num_completed_qs == 1
    assert isinstance(amgr._pending_queue, list)
    assert isinstance(amgr._completed_queue, list)

    # Global parameters have default values
    assert amgr._mqs_setup       == False
    assert amgr._resource_desc   == None
    assert amgr._task_manager    == None
    assert amgr._workflow        == None
    assert amgr._resubmit_failed == False
    assert amgr._reattempts      == 3
    assert amgr._cur_attempt     == 1
    assert amgr._autoterminate   == True
    assert isinstance(amgr.shared_data, list)
def __init__(self, dburl=None, uid=None, cfg=None, _connect=True):
    """
    Creates a new session.  A new Session instance is created and
    stored in the database.

    **Arguments:**
        * **dburl** (`string`): The MongoDB URL.  If none is given,
          RP uses the environment variable RADICAL_PILOT_DBURL.  If that
          is not set, an error will be raised.

        * **uid** (`string`): Create a session with this UID.
          *Only use this when you know what you are doing!*

    **Returns:**
        * A new Session instance.

    **Raises:**
        * :class:`radical.pilot.DatabaseError`
    """

    if os.uname()[0] == 'Darwin':
        # on MacOS, we are running out of file descriptors soon.  The code
        # below attempts to increase the limit of open files - but any
        # error is silently ignored, so this is a best-effort, no
        # guarantee.  We leave responsibility for system limits with the
        # user.
        try:
            import resource
            limits    = list(resource.getrlimit(resource.RLIMIT_NOFILE))
            limits[0] = 512
            resource.setrlimit(resource.RLIMIT_NOFILE, limits)
        except:
            pass

    self._dh         = ru.DebugHelper()
    self._valid      = True
    self._closed     = False
    self._valid_iter = 0  # detect recursive calls of `is_valid()`

    # class state
    self._dbs         = None
    self._uid         = None
    self._dburl       = None
    self._reconnected = False

    self._cache      = dict()  # cache sandboxes etc.
    self._cache_lock = threading.RLock()

    self._cache['resource_sandbox'] = dict()
    self._cache['session_sandbox']  = dict()
    self._cache['pilot_sandbox']    = dict()

    # before doing anything else, set up the debug helper for the lifetime
    # of the session.
    self._debug_helper = ru.DebugHelper()

    # Dictionaries holding all manager objects created during the session.
    # NOTE: should this also include agents?
    self._pmgrs      = dict()
    self._umgrs      = dict()
    self._bridges    = list()
    self._components = list()

    # FIXME: we work around some garbage collection issues we don't yet
    #        understand: instead of relying on the GC to eventually
    #        collect some stuff, we actively free those on
    #        `session.close()`, at least for the current process.
    #        Usually, all resources get nicely collected on process
    #        termination - but not when we create many sessions (one after
    #        the other) in the same application instance (ie. the same
    #        process).  This workaround takes care of that use case.
    #        The clean solution would be to ensure a clean termination
    #        sequence, something which I seem to be unable to implement...
    #        :/
    self._to_close   = list()
    self._to_stop    = list()
    self._to_destroy = list()

    # cache the client sandbox
    # FIXME: this needs to be overwritten if configured differently in the
    #        session config, as should be the case for any agent side
    #        session instance.
    self._client_sandbox = os.getcwd()

    # The resource configuration dictionary associated with the session.
    self._resource_configs = {}

    # if a config is given, use its values
    if cfg:
        self._cfg = copy.deepcopy(cfg)
    else:
        # otherwise we need a config
        self._cfg = ru.read_json("%s/configs/session_%s.json"
                   % (os.path.dirname(__file__),
                      os.environ.get('RADICAL_PILOT_SESSION_CFG',
                                     'default')))

    # fall back to config data where possible
    # sanity check on parameters
    if not uid:
        uid = self._cfg.get('session_id')

    if uid:
        self._uid         = uid
        self._reconnected = True
    else:
        # generate new uid, reset all other ID counters
        # FIXME: this will screw up counters for *concurrent* sessions,
        #        as the ID generation is managed in a process singleton.
        self._uid = ru.generate_id('rp.session', mode=ru.ID_PRIVATE)
        ru.reset_id_counters(prefix='rp.session', reset_all_others=True)

    if not self._cfg.get('session_id'): self._cfg['session_id'] = self._uid
    if not self._cfg.get('owner')     : self._cfg['owner']      = self._uid
    if not self._cfg.get('debug')     : self._cfg['debug']      = 'DEBUG'
    if not self._cfg.get('logdir')    : self._cfg['logdir']     = '%s/%s' \
                                            % (os.getcwd(), self._uid)

    self._logdir = self._cfg['logdir']
    self._log    = self._get_logger(self._cfg['owner'],
                                    self._cfg.get('debug'))

    if _connect:
        # we need a dburl to connect to.
        if not dburl: dburl = os.environ.get("RADICAL_PILOT_DBURL")
        if not dburl: dburl = self._cfg.get('default_dburl')
        if not dburl: dburl = self._cfg.get('dburl')
        if not dburl:
            # we forgive a missing dburl on reconnect, but not otherwise
            raise RuntimeError("no database URL (set RADICAL_PILOT_DBURL)")

        self._dburl        = ru.Url(dburl)
        self._cfg['dburl'] = str(self._dburl)

    # now we have config and uid - initialize base class (saga session)
    rs.Session.__init__(self, uid=self._uid)

    # ------------------------------------------------------------------
    # create new session
    if _connect:
        self._log.info("using database %s" % self._dburl)

        # if the database url contains a path element, we interpret that
        # as database name (without the leading slash)
        if not self._dburl.path       or \
           self._dburl.path[0] != '/' or \
           len(self._dburl.path) <= 1:
            if not uid:
                # we fake reconnect if no DB is available -- but otherwise
                # we really, really need a db connection...
                raise ValueError("incomplete DBURL '%s' no db name!"
                                % self._dburl)

    # initialize profiling, but make sure the profile ends up in our logdir
    self._prof = ru.Profiler(self._cfg['owner'], path=self._logdir)

    if not self._reconnected:
        self._prof.prof('session_start', uid=self._uid)
        self._log.report.info ('<<new session: ')
        self._log.report.plain('[%s]' % self._uid)
        self._log.report.info ('<<database   : ')
        self._log.report.plain('[%s]' % self._dburl)

    self._load_resource_configs()

    self._rec = os.environ.get('RADICAL_PILOT_RECORD_SESSION')
    if self._rec:
        # NOTE: session recording cannot handle reconnected sessions, yet.
        #       We thus turn it off here, with a warning.
        if self._reconnected:
            self._log.warn("no session recording on reconnected session")
        else:
            # append session ID to the recording path
            self._rec = "%s/%s" % (self._rec, self._uid)

            # create recording path and record session
            os.system('mkdir -p %s' % self._rec)
            ru.write_json({'dburl': str(self.dburl)},
                          "%s/session.json" % self._rec)
            self._log.info("recording session in %s" % self._rec)

    # create/connect database handle
    try:
        self._dbs = DBSession(sid=self.uid, dburl=str(self._dburl),
                              cfg=self._cfg, logger=self._log,
                              connect=_connect)

        # from here on we should be able to close the session again
        self._log.info("New Session created: %s." % self.uid)

    except Exception as ex:
        self._log.report.error(">>err\n")
        self._log.exception('session create failed')
        raise RuntimeError("Couldn't create new session (database URL "
                           "'%s' incorrect?): %s" % (dburl, ex))
def test_mpi_unit_with_tagging(mocked_init, mocked_method, mocked_profiler,
                               mocked_raise_on):

    cfg, session = setUp()

    component = Continuous(cfg=dict(), session=session)
    component._lrms_info           = cfg['lrms_info']
    component._lrms_lm_info        = cfg['lrms_info']['lm_info']
    component._lrms_node_list      = cfg['lrms_info']['node_list']
    component._lrms_cores_per_node = cfg['lrms_info']['cores_per_node']
    component._lrms_gpus_per_node  = cfg['lrms_info']['gpus_per_node']
    component._lrms_lfs_per_node   = cfg['lrms_info']['lfs_per_node']
    component._slot_lock           = threading.RLock()
    component._scattered           = True
    component._log                 = ru.Logger('test.component')
    component._prof                = ru.Profiler('test')
    component._tag_history         = dict()

    component.nodes = []
    for node, node_uid in component._lrms_node_list:
        component.nodes.append(copy.deepcopy({
            'name' : node,
            'uid'  : node_uid,
            'cores': [rpc.FREE] * component._lrms_cores_per_node,
            'gpus' : [rpc.FREE] * component._lrms_gpus_per_node,
            'lfs'  : component._lrms_lfs_per_node}))

    # Allocate first CUD -- should land on first node
    cu = mpi()
    cu['uid'] = 'unit.000000'
    cu['description']['cpu_processes']   = 2
    cu['description']['cpu_threads']     = 1
    cu['description']['lfs_per_process'] = 1024
    component._try_allocation(cu)
    slot1 = cu['slots']

    assert component._tag_history == {'unit.000000': [1]}
    assert slot1 == {'cores_per_node': 2,
                     'lfs_per_node'  : component._lrms_lfs_per_node,
                     'nodes'         : [{'lfs'     : {'size': 2048,
                                                      'path': 'abc'},
                                         'core_map': [[0], [1]],
                                         'name'    : 'a',
                                         'gpu_map' : [],
                                         'uid'     : 1}],
                     'lm_info'       : 'INFO',
                     'gpus_per_node' : 1}

    # Assert resulting node list values after first CUD
    assert component.nodes == [
        {'lfs': {'size': 3072, 'path': 'abc'}, 'cores': [1, 1],
         'name': 'a', 'gpus': [0], 'uid': 1},
        {'lfs': {'size': 5120, 'path': 'abc'}, 'cores': [0, 0],
         'name': 'b', 'gpus': [0], 'uid': 2},
        {'lfs': {'size': 5120, 'path': 'abc'}, 'cores': [0, 0],
         'name': 'c', 'gpus': [0], 'uid': 3},
        {'lfs': {'size': 5120, 'path': 'abc'}, 'cores': [0, 0],
         'name': 'd', 'gpus': [0], 'uid': 4},
        {'lfs': {'size': 5120, 'path': 'abc'}, 'cores': [0, 0],
         'name': 'e', 'gpus': [0], 'uid': 5}]

    # Allocate second CUD -- should return None as the first node is
    # not yet released
    cu = mpi()
    cu['uid'] = 'unit.000001'
    cu['description']['tag'] = 'unit.000000'
    component._try_allocation(cu)
    slot2 = cu['slots']

    assert slot2 == None
    assert component._tag_history == {'unit.000000': [1]}

    # Allocate third CUD -- should land on second and third node
    cu = mpi()
    cu['uid'] = 'unit.000002'
    cu['description']['cpu_processes'] = 2
    cu['description']['cpu_threads']   = 2
    component._try_allocation(cu)
    slot3 = cu['slots']

    assert slot3 == {'cores_per_node': 2,
                     'lfs_per_node'  : component._lrms_lfs_per_node,
                     'nodes'         : [{'lfs'     : {'size': 1024,
                                                      'path': 'abc'},
                                         'core_map': [[0, 1]],
                                         'name'    : 'b',
                                         'gpu_map' : [],
                                         'uid'     : 2},
                                        {'lfs'     : {'size': 1024,
                                                      'path': 'abc'},
                                         'core_map': [[0, 1]],
                                         'name'    : 'c',
                                         'gpu_map' : [],
                                         'uid'     : 3}],
                     'lm_info'       : 'INFO',
                     'gpus_per_node' : 1}
    assert component._tag_history == {'unit.000000': [1],
                                      'unit.000002': [2, 3]}

    # Assert resulting node list values after second CUD slot release
    assert component.nodes == [
        {'lfs': {'size': 3072, 'path': 'abc'}, 'cores': [1, 1],
         'name': 'a', 'gpus': [0], 'uid': 1},
        {'lfs': {'size': 4096, 'path': 'abc'}, 'cores': [1, 1],
         'name': 'b', 'gpus': [0], 'uid': 2},
        {'lfs': {'size': 4096, 'path': 'abc'}, 'cores': [1, 1],
         'name': 'c', 'gpus': [0], 'uid': 3},
        {'lfs': {'size': 5120, 'path': 'abc'}, 'cores': [0, 0],
         'name': 'd', 'gpus': [0], 'uid': 4},
        {'lfs': {'size': 5120, 'path': 'abc'}, 'cores': [0, 0],
         'name': 'e', 'gpus': [0], 'uid': 5}]

    # Allocate fourth CUD -- should return None as the second node is not
    # yet released
    cu = mpi()
    cu['uid'] = 'unit.000003'
    cu['description']['cpu_threads'] = 2
    cu['description']['tag']         = 'unit.000002'
    component._try_allocation(cu)
    slot4 = cu['slots']

    assert slot4 == None
    assert component._tag_history == {'unit.000000': [1],
                                      'unit.000002': [2, 3]}

    # Release first node and allocate second CUD again
    component._release_slot(slot1)
    assert component.nodes == [
        {'lfs': {'size': 5120, 'path': 'abc'}, 'cores': [0, 0],
         'name': 'a', 'gpus': [0], 'uid': 1},
        {'lfs': {'size': 4096, 'path': 'abc'}, 'cores': [1, 1],
         'name': 'b', 'gpus': [0], 'uid': 2},
        {'lfs': {'size': 4096, 'path': 'abc'}, 'cores': [1, 1],
         'name': 'c', 'gpus': [0], 'uid': 3},
        {'lfs': {'size': 5120, 'path': 'abc'}, 'cores': [0, 0],
         'name': 'd', 'gpus': [0], 'uid': 4},
        {'lfs': {'size': 5120, 'path': 'abc'}, 'cores': [0, 0],
         'name': 'e', 'gpus': [0], 'uid': 5}]

    cu = mpi()
    cu['uid'] = 'unit.000001'
    cu['description']['tag'] = 'unit.000000'
    component._try_allocation(cu)
    slot2 = cu['slots']

    assert slot2 == {'cores_per_node': 2,
                     'lfs_per_node'  : component._lrms_lfs_per_node,
                     'nodes'         : [{'lfs'     : {'size': 1024,
                                                      'path': 'abc'},
                                         'core_map': [[0]],
                                         'name'    : 'a',
                                         'gpu_map' : [],
                                         'uid'     : 1}],
                     'lm_info'       : 'INFO',
                     'gpus_per_node' : 1}
    assert component._tag_history == {'unit.000000': [1],
                                      'unit.000001': [1],
                                      'unit.000002': [2, 3]}

    # Release second and third nodes and allocate fourth CUD again
    component._release_slot(slot3)
    assert component.nodes == [
        {'lfs': {'size': 4096, 'path': 'abc'}, 'cores': [1, 0],
         'name': 'a', 'gpus': [0], 'uid': 1},
        {'lfs': {'size': 5120, 'path': 'abc'}, 'cores': [0, 0],
         'name': 'b', 'gpus': [0], 'uid': 2},
        {'lfs': {'size': 5120, 'path': 'abc'}, 'cores': [0, 0],
         'name': 'c', 'gpus': [0], 'uid': 3},
        {'lfs': {'size': 5120, 'path': 'abc'}, 'cores': [0, 0],
         'name': 'd', 'gpus': [0], 'uid': 4},
        {'lfs': {'size': 5120, 'path': 'abc'}, 'cores': [0, 0],
         'name': 'e', 'gpus': [0], 'uid': 5}]

    cu = mpi()
    cu['uid'] = 'unit.000003'
    cu['description']['tag']         = 'unit.000002'
    cu['description']['cpu_threads'] = 2
    component._try_allocation(cu)
    slot4 = cu['slots']

    assert slot4 == {'cores_per_node': 2,
                     'lfs_per_node'  : component._lrms_lfs_per_node,
                     'nodes'         : [{'lfs'     : {'size': 1024,
                                                      'path': 'abc'},
                                         'core_map': [[0, 1]],
                                         'name'    : 'b',
                                         'gpu_map' : [],
                                         'uid'     : 2}],
                     'lm_info'       : 'INFO',
                     'gpus_per_node' : 1}
    assert component._tag_history == {'unit.000000': [1],
                                      'unit.000001': [1],
                                      'unit.000002': [2, 3],
                                      'unit.000003': [2]}

    assert component.nodes == [
        {'lfs': {'size': 4096, 'path': 'abc'}, 'cores': [1, 0],
         'name': 'a', 'gpus': [0], 'uid': 1},
        {'lfs': {'size': 4096, 'path': 'abc'}, 'cores': [1, 1],
         'name': 'b', 'gpus': [0], 'uid': 2},
        {'lfs': {'size': 5120, 'path': 'abc'}, 'cores': [0, 0],
         'name': 'c', 'gpus': [0], 'uid': 3},
        {'lfs': {'size': 5120, 'path': 'abc'}, 'cores': [0, 0],
         'name': 'd', 'gpus': [0], 'uid': 4},
        {'lfs': {'size': 5120, 'path': 'abc'}, 'cores': [0, 0],
         'name': 'e', 'gpus': [0], 'uid': 5}]

    tearDown()
def __init__(self, campaign, resources, objective=None, planner='random',
             sid=None):

    self._campaign = {'campaign': campaign, 'state': st.NEW}

    if sid:
        self._sid = sid
    else:
        self._sid = ru.generate_id('rcm.session', mode=ru.ID_PRIVATE)

    self._uid = ru.generate_id('bookkeper.%(counter)04d',
                               mode=ru.ID_CUSTOM, ns=self._sid)

    self._resources         = resources
    self._checkpoints       = None
    self._plan              = None
    self._objective         = objective
    self._unavail_resources = []
    self._workflows_state   = dict()

    self._exec_state_lock = ru.RLock('workflows_state_lock')
    self._monitor_lock    = ru.RLock('monitor_list_lock')

    self._time = 0  # the time in the campaign's world

    self._workflows_to_monitor = list()
    self._est_end_times        = dict()
    self._env                  = Environment()
    self._enactor              = SimulatedEnactor(env=self._env,
                                                  sid=self._sid)
    self._enactor.register_state_cb(self.state_update_cb)

    # Create threads to execute the monitoring and work methods.
    # One flag for both threads may be enough to monitor and check.
    self._terminate_event   = mt.Event()  # thread event to terminate
    self._work_thread       = None        # will hold the work thread
    self._monitoring_thread = None        # will hold the monitoring thread
    self._cont              = False
    self._hold              = False

    path = os.getcwd() + '/' + self._sid
    self._logger = ru.Logger(name=self._uid, path=path, level='DEBUG')
    self._prof   = ru.Profiler(name=self._uid, path=path)

    num_oper = [workflow['num_oper']
                for workflow in self._campaign['campaign']]

    if planner.lower() == 'random':
        self._planner = RandomPlanner(campaign=self._campaign['campaign'],
                                      resources=self._resources,
                                      num_oper=num_oper, sid=self._sid)
    elif planner.lower() == 'heft':
        self._planner = HeftPlanner(campaign=self._campaign['campaign'],
                                    resources=self._resources,
                                    num_oper=num_oper, sid=self._sid)
    else:
        self._logger.warning('Planner %s is not implemented. Falling back '
                             'to a random planner', planner)
        self._planner = RandomPlanner(campaign=self._campaign['campaign'],
                                      resources=self._resources,
                                      num_oper=num_oper, sid=self._sid)