Esempi in Python per Profiler, esempi in Python per radical.utils.Profiler

Esempio n. 1

0

Mostra file

    def _assert_profiler(key, val, res):
        '''
        Set the environment variable `key` to `val`, record one profile
        event, and compare `res` against (a) the existence of the profile
        file and (b) the result of grepping that file for the event.

        The environment variable and the profile file are removed again
        afterwards, whether or not the assertions held.
        '''

        # compute the names up front so that the cleanup code can always
        # refer to `fname`, even if setting the environment variable fails
        pname = 'ru.%d' % os.getpid()
        fname = '/tmp/%s.prof' % pname

        try:
            os.environ[key] = val

            prof = ru.Profiler(name=pname,
                               ns='radical.utils.test',
                               path='/tmp/')
            prof.prof('foo')

            assert (res == os.path.isfile(fname))
            assert (res == _cmd('grep -e "^[0-9\\.]*,foo,%s," %s' %
                                (pname, fname)))

        finally:
            # best-effort cleanup: tolerate only the "nothing to clean up"
            # cases instead of silently swallowing every exception
            os.environ.pop(key, None)
            try:
                os.unlink(fname)
            except OSError:
                pass

Esempio n. 2

0

Mostra file

def test_profiler():
    '''
    create and check profile timestamps

    Enables profiling via `RADICAL_PROFILE`, records three events (plain,
    with a uid, and with an explicit timestamp) and greps the resulting
    profile file for the expected lines.  The environment variable and the
    profile file are removed afterwards.
    '''

    pname = 'ru.%d' % os.getpid()
    fname = '/tmp/%s.prof' % pname
    now = time.time()

    try:
        os.environ['RADICAL_PROFILE'] = 'True'
        prof = ru.Profiler(name=pname, ns='radical.utils', path='/tmp/')

        prof.prof('foo')
        prof.prof('bar', uid='baz')
        prof.prof('buz', ts=now)

        assert (os.path.isfile(fname))

        def _grep(pat):
            # run grep for `pat` on the profile file
            return _cmd('grep -e "%s" %s' % (pat, fname))

        assert (_grep('^[0-9\\.]*,foo,%s,MainThread,,,$' % pname))
        assert (_grep('^[0-9\\.]*,bar,%s,MainThread,baz,,$' % pname))
        assert (_grep('^%.7f,buz,%s,MainThread,,,$' % (now, pname)))

    finally:
        # best-effort cleanup: tolerate only the "nothing to clean up"
        # cases instead of silently swallowing every exception
        os.environ.pop('RADICAL_PROFILE', None)
        try:
            os.unlink(fname)
        except OSError:
            pass

Esempio n. 3

0

Mostra file

    def __init__(self, log=None, rep=None, prof=None):
        '''
        Set up logging/reporting/profiling helpers, a pilot session with its
        pilot and unit managers, and a per-session data staging directory.

        log  : logger to use   (default: a new `ru.Logger('radical.nge')`)
        rep  : reporter to use (default: a new `ru.Reporter('radical.nge')`)
        prof : profiler to use (default: a new `ru.Profiler('radical.nge')`)
        '''

        self._log = log if log else ru.Logger('radical.nge')

        # use the reporter which was passed in (the logger was stored here
        # before, by mistake)
        self._rep = rep if rep else ru.Reporter('radical.nge')

        self._prof = prof if prof else ru.Profiler('radical.nge')

        self._session = rp.Session()
        self._pmgr = rp.PilotManager(self._session)
        self._umgr = rp.UnitManager(self._session)

        self._pmgr.register_callback(self._pilot_state_cb)
        self._umgr.register_callback(self._unit_state_cb)

        # create a dir for data staging
        self._pwd = os.getcwd()
        self._data = 'data.%s' % self._session.uid
        os.makedirs('%s/%s/' % (self._pwd, self._data))

        # track submitted tasks
        self._tcnt = 0
        self._tasks = dict()

Esempio n. 4

0

Mostra file

File: yank-repex.py Progetto: jdakka/yank-entk

    def __init__(self, number_of_replicas,
                 systems=None,
                 workflow=None,
                 cores=32,
                 ligand=False,
                 full=False,
                 gibbs_steps=None,
                 thermodynamic_states=None):
        """
        Store the replica exchange settings and create the profiler.

        number_of_replicas   : stored as `self.number_of_replicas`
        systems              : list of systems (default: a new empty list)
        workflow             : workflow steps (default: `[0, 1, 2, 3, 4]`)
        cores                : stored as `self.cores`
        ligand               : if true, `self.ligand` is '-ligands', else ''
        full                 : selects `_full_steps` over `_reduced_steps`
        gibbs_steps          : required, stored as `self.n_gibbs_steps`
        thermodynamic_states : required, stored as `self.thermo_state`

        Raises TypeError if `gibbs_steps` or `thermodynamic_states` is
        missing.  Both follow parameters with defaults and thus need a
        default themselves to form a valid signature; the explicit check
        keeps them mandatory.
        """

        if gibbs_steps is None or thermodynamic_states is None:
            raise TypeError("'gibbs_steps' and 'thermodynamic_states' "
                            "are required arguments")

        self.number_of_replicas = number_of_replicas
        self.n_gibbs_steps = gibbs_steps
        self.thermo_state = thermodynamic_states
        self.ligand = '-ligands' if ligand else ''
        self.step_count = _full_steps if full else _reduced_steps

        # a fresh list per instance: a list created in the signature would
        # be shared by all instances which rely on the default
        self.systems = list() if systems is None else systems

        self.cores = cores
        self._id = uuid.uuid1()  # generate id

        # self.workflow = workflow or ['gen_replicas', 'repex', 'rotation', 'translation', 'propagation']

        # null workflow
        self.workflow = workflow or list(range(0, 5))

        # Profiler for TIES PoE
        self._uid = ru.generate_id('radical.yank.yank-repex')
        self._logger = ru.get_logger('radical.yank.yank-repex')
        self._prof = ru.Profiler(name=self._uid)
        self._prof.prof('create yank-repex instance', uid=self._uid)

Esempio n. 5

0

Mostra file

File: htbac.py Progetto: SrinivasMushnoori/htbac

    def __init__(self, resource='local', comm_server=None):
        """The workhorse of high throughput binding affinity calculations.

        Manages arbitrary number of protocols on any resource (including supercomputers).

        Parameters
        ----------
        resource: str
            The name of the resource where the protocols will be run. This is usually the name of the supercomputer
            or 'local' if the job will be executed locally. (the default is to try to run locally).
        comm_server: tuple(str, int)
            The communication server used by the execution system. Specify a hostname and port number as a tuple. If
            None, then the dedicated server might be used from the resource description if present.
        """

        # `safe_load` only builds plain Python objects; `yaml.load` without an
        # explicit Loader is unsafe and is rejected by current PyYAML releases.
        self.resource = yaml.safe_load(resource_stream(__name__, 'resources.yaml'))[resource]

        if comm_server is None:
            comm_server = self.resource.get('dedicated_rabbitmq_server')

        self._protocols = list()
        # NOTE(review): if the resource has no 'dedicated_rabbitmq_server'
        # entry, `comm_server` is still None here and the unpacking below
        # raises a TypeError - confirm whether a fallback is wanted.
        self._app_manager = AppManager(*comm_server)

        # Profiler for Runner
        self._uid = ru.generate_id('radical.htbac.workflow_runner')
        self._logger = ru.get_logger('radical.htbac.workflow_runner')
        self._prof = ru.Profiler(name=self._uid)
        self._prof.prof('create workflow_runner obj', uid=self._uid)
        self._root_directories = list()

Esempio n. 6

0

Mostra file

    def __init__(self, resource_desc, sid, rts, rts_config):
        """
        Store the resource description and session settings, initialise the
        reservation parameters to their defaults, and create a logger and a
        profiler below `<cwd>/<sid>`.

        Raises TypeError if `resource_desc` is not a dict.
        """

        # NOTE(review): TypeError is called with keyword arguments, so it is
        # presumably a project-specific exception class - confirm.
        if not isinstance(resource_desc, dict):
            raise TypeError(expected_type=dict,
                            actual_type=type(resource_desc))

        # settings handed in by the caller
        self._resource_desc = resource_desc
        self._sid = sid
        self._rts = rts
        self._rts_config = rts_config

        # resource reservation parameters, all at their defaults
        self._resource = None
        self._walltime = None
        self._cpus = 1
        self._gpus = 0
        self._project = None
        self._access_schema = None
        self._queue = None
        self._validated = False

        # uid and working directory of this resource manager
        self._uid = ru.generate_id('resource_manager.%(counter)04d',
                                   ru.ID_CUSTOM)
        self._path = '/'.join([os.getcwd(), self._sid])

        # logger and profiler share one name and one target directory
        helper_name = 'radical.entk.%s' % self._uid
        self._logger = ru.Logger(helper_name, path=self._path)
        self._prof = ru.Profiler(helper_name, path=self._path)

        self._shared_data = []
        self._outputs = None

Esempio n. 7

0

Mostra file

    def __init__(self, sid, workflow, pending_queue, completed_queue,
                 resubmit_failed, rmq_conn_params):
        """
        Store the workflow and queue settings and create the uid, logger,
        profiler and reporter of this workflow processor.

        sid             : session id; also names the directory (below the
                          current working directory) passed to the logger
                          and the profiler
        workflow        : the (already validated) workflow
        pending_queue   : stored as `self._pending_queue`
        completed_queue : stored as `self._completed_queue`
        resubmit_failed : stored as `self._resubmit_failed`
        rmq_conn_params : stored as `self._rmq_conn_params`
        """

        # Mandatory arguments
        self._sid = sid
        self._pending_queue = pending_queue
        self._completed_queue = completed_queue
        self._resubmit_failed = resubmit_failed
        self._rmq_conn_params = rmq_conn_params

        # Assign validated workflow
        self._workflow = workflow

        # Create logger and profiler at their specific locations using the sid
        self._path = os.getcwd() + '/' + self._sid
        self._uid = ru.generate_id('wfprocessor.%(item_counter)04d',
                                   ru.ID_CUSTOM,
                                   ns=self._sid)

        name = 'radical.entk.%s' % self._uid
        self._logger = ru.Logger(name, path=self._path)
        self._prof = ru.Profiler(name, path=self._path)
        self._report = ru.Reporter(name)

        # Defaults
        self._wfp_process = None
        self._enqueue_thread = None
        self._dequeue_thread = None

        # environment values are strings: convert, so that the interval is
        # an int no matter whether it comes from the environment or from the
        # default
        self._rmq_ping_interval = int(os.getenv('RMQ_PING_INTERVAL', 10))

        self._logger.info('Created WFProcessor object: %s' % self._uid)
        self._prof.prof('create_wfp', uid=self._uid)

Esempio n. 8

0

Mostra file

File: appmanager.py Progetto: lee212/radical.entk

    def __init__(self,
                 config_path=None,
                 hostname=None,
                 port=None,
                 reattempts=None,
                 resubmit_failed=None,
                 autoterminate=None,
                 write_workflow=None,
                 rts=None,
                 rmq_cleanup=None,
                 rts_config=None,
                 name=None):
        """
        Create the session id, read the configuration and set up the uid,
        logger, profiler and reporter of the application manager.

        All configuration arguments are forwarded to `self._read_config()`.
        If `name` is given it is used both as name and as session id;
        otherwise the name is empty and a session id is generated.
        """

        # Create a session for each EnTK script execution
        if name:
            self._name = name
            self._sid = name
        else:
            self._name = str()
            self._sid = ru.generate_id('re.session', ru.ID_PRIVATE)

        self._read_config(config_path, hostname, port, reattempts,
                          resubmit_failed, autoterminate, write_workflow, rts,
                          rmq_cleanup, rts_config)

        # Create an uid + logger + profiles for AppManager, under the sid
        # namespace
        path = os.getcwd() + '/' + self._sid
        self._uid = ru.generate_id('appmanager.%(item_counter)04d',
                                   ru.ID_CUSTOM,
                                   namespace=self._sid)
        self._logger = ru.Logger('radical.entk.%s' % self._uid,
                                 path=path,
                                 targets=['2', '.'])
        self._prof = ru.Profiler(name='radical.entk.%s' % self._uid, path=path)
        self._report = ru.Reporter(name='radical.entk.%s' % self._uid)

        self._report.info('EnTK session: %s\n' % self._sid)
        self._prof.prof('create amgr obj', uid=self._uid)
        self._report.info('Creating AppManager')

        self._resource_manager = None
        # RabbitMQ Queues
        self._pending_queue = list()
        self._completed_queue = list()

        # Global parameters to have default values
        self._mqs_setup = False
        self._resource_desc = None
        self._task_manager = None
        self._workflow = None
        self._cur_attempt = 1
        self._shared_data = list()

        # environment values are strings: convert, so that the interval is
        # an int no matter whether it comes from the environment or from the
        # default
        self._rmq_ping_interval = int(os.getenv('RMQ_PING_INTERVAL', 10))

        self._logger.info('Application Manager initialized')
        self._prof.prof('amgr obj created', uid=self._uid)
        self._report.ok('>>ok\n')

Esempio n. 9

0

Mostra file

File: task_manager.py Progetto: karahbit/radical.entk

    def __init__(self, sid, pending_queue, completed_queue, rmgr,
                 rmq_conn_params, rts):
        """
        Validate and store the arguments, create the uid, logger, profiler
        and debug helper, and (re)create the two heartbeat queues.

        sid             : session id (str)
        pending_queue   : list, stored as `self._pending_queue`
        completed_queue : list, stored as `self._completed_queue`
        rmgr            : a `Base_ResourceManager` instance
        rmq_conn_params : `pika.connection.ConnectionParameters` used to
                          connect to the message broker
        rts             : stored as `self._rts`

        Raises TypeError if an argument does not have the expected type.
        """

        if not isinstance(sid, str):
            raise TypeError(expected_type=str, actual_type=type(sid))

        # both queues must be lists - report that type in the error, too
        if not isinstance(pending_queue, list):
            raise TypeError(expected_type=list,
                            actual_type=type(pending_queue))

        if not isinstance(completed_queue, list):
            raise TypeError(expected_type=list,
                            actual_type=type(completed_queue))

        if not isinstance(rmgr, Base_ResourceManager):
            raise TypeError(expected_type=Base_ResourceManager,
                            actual_type=type(rmgr))

        if not isinstance(rmq_conn_params,
                          pika.connection.ConnectionParameters):
            raise TypeError(expected_type=pika.connection.ConnectionParameters,
                            actual_type=type(rmq_conn_params))

        self._sid = sid
        self._pending_queue = pending_queue
        self._completed_queue = completed_queue
        self._rmgr = rmgr
        self._rts = rts
        self._rmq_conn_params = rmq_conn_params

        # Utility parameters
        self._uid = ru.generate_id('task_manager.%(counter)04d', ru.ID_CUSTOM)
        self._path = os.getcwd() + '/' + self._sid

        name = 'radical.entk.%s' % self._uid
        self._log = ru.Logger(name, path=self._path)
        self._prof = ru.Profiler(name, path=self._path)
        self._dh = ru.DebugHelper(name=name)

        self._hb_request_q = '%s-hb-request' % self._sid
        self._hb_response_q = '%s-hb-response' % self._sid

        # the connection is only needed to reset the heartbeat queues; make
        # sure it does not stay open if that fails
        mq_connection = pika.BlockingConnection(rmq_conn_params)
        try:
            mq_channel = mq_connection.channel()

            # start from a fresh heartbeat response queue
            mq_channel.queue_delete(queue=self._hb_response_q)
            mq_channel.queue_declare(queue=self._hb_response_q)

            # start from a fresh heartbeat request queue
            mq_channel.queue_delete(queue=self._hb_request_q)
            mq_channel.queue_declare(queue=self._hb_request_q)

        finally:
            # a failed channel operation may already have closed the
            # connection; closing it again would hide the original error
            if mq_connection.is_open:
                mq_connection.close()

        self._tmgr_process = None
        self._hb_thread = None
        self._hb_interval = int(os.getenv('ENTK_HB_INTERVAL', 30))

Esempio n. 10

0

Mostra file

File: component.py Progetto: karahbit/radical.pilot

    def __init__(self, cfg):
        '''
        Set up the component manager from `cfg`: create uid, profiler and
        logger, start a heartbeat pubsub bridge, create the heartbeat
        monitor plus a publisher and a subscriber on that bridge, and
        finally start the heartbeat and wait for its startup.

        The bridge addresses are written back to `self._cfg.heartbeat`.
        '''

        self._cfg = ru.Config('radical.pilot.cmgr', cfg=cfg)
        self._sid = self._cfg.sid
        self._uid = ru.generate_id('cmgr', ns=self._sid)
        self._uids = [self._uid]  # uids to track heartbeats for (incl. own)

        self._prof = ru.Profiler(self._uid,
                                 ns='radical.pilot',
                                 path=self._cfg.path)
        self._log = ru.Logger(self._uid,
                              ns='radical.pilot',
                              path=self._cfg.path)

        self._prof.prof('init2', uid=self._uid, msg=self._cfg.path)

        # Every ComponentManager runs a HB pubsub bridge in a separate thread.
        # That HB channel should be used by all components and bridges created
        # under this CMGR.
        bcfg = ru.Config(
            cfg={
                'channel': 'heartbeat',
                'type': 'pubsub',
                'uid': self._uid + '.hb',
                'stall_hwm': 1,
                'bulk_size': 0,
                'path': self._cfg.path
            })
        self._hb_bridge = ru.zmq.PubSub(bcfg)
        self._hb_bridge.start()

        # publish the bridge endpoints via the config
        self._cfg.heartbeat.addr_pub = str(self._hb_bridge.addr_pub)
        self._cfg.heartbeat.addr_sub = str(self._hb_bridge.addr_sub)

        # runs a HB monitor on that channel
        # NOTE(review): this reads `self.uid`, not `self._uid` - presumably
        # a property defined elsewhere in the class; confirm.
        self._hb = ru.Heartbeat(
            uid=self.uid,
            timeout=self._cfg.heartbeat.timeout,
            interval=self._cfg.heartbeat.interval,
            beat_cb=self._hb_beat_cb,  # on every heartbeat
            term_cb=self._hb_term_cb,  # on termination
            log=self._log)

        self._hb_pub = ru.zmq.Publisher('heartbeat',
                                        self._cfg.heartbeat.addr_pub,
                                        log=self._log,
                                        prof=self._prof)
        self._hb_sub = ru.zmq.Subscriber('heartbeat',
                                         self._cfg.heartbeat.addr_sub,
                                         topic='heartbeat',
                                         cb=self._hb_sub_cb,
                                         log=self._log,
                                         prof=self._prof)

        # confirm the bridge being usable by listening to our own heartbeat
        self._hb.start()
        self._hb.wait_startup(self._uid, self._cfg.heartbeat.timeout)
        self._log.info('heartbeat system up')

Esempio n. 11

0

Mostra file

    def __init__(self, cfg, session):
        '''
        Bootstrap the agent: store the settings from `cfg` and `session`,
        record the first profile events, connect to the database, configure
        the resource manager and the application communication channels,
        start bridges, components and sub-agents, initialise the
        `rpu.Worker` base class and start a heartbeat monitor.
        '''

        self._cfg = cfg
        self._pid = cfg.pid
        self._pmgr = cfg.pmgr
        self._pwd = cfg.pilot_sandbox
        self._session = session
        self._log = session._log

        self._starttime = time.time()
        self._final_cause = None

        # this is the earliest point to sync bootstrap and agent profiles
        prof = ru.Profiler(ns='radical.pilot', name='agent.0')
        prof.prof('sync_rel', uid=cfg.pid, msg='agent.0')
        prof.prof('hostname', uid=cfg.pid, msg=ru.get_hostname())

        # connect to MongoDB for state push/pull
        self._connect_db()

        # configure ResourceManager before component startup, as components need
        # ResourceManager information for function (scheduler, executor)
        self._configure_rm()

        # ensure that app communication channels are visible to workload
        self._configure_app_comm()

        # expose heartbeat channel to sub-agents, bridges and components,
        # and start those
        self._cmgr = rpu.ComponentManager(self._cfg)
        self._cfg.heartbeat = self._cmgr.cfg.heartbeat

        self._cmgr.start_bridges()
        self._cmgr.start_components()

        # create the sub-agent configs and start the sub agents
        self._write_sa_configs()
        self._start_sub_agents()  # TODO: move to cmgr?

        # at this point the session is up and connected, and it should have
        # brought up all communication bridges and components.  We are
        # ready to rumble!
        rpu.Worker.__init__(self, self._cfg, session)

        # run our own slow-paced heartbeat monitor to watch pmgr heartbeats
        self._hb = ru.Heartbeat(
            uid=self._pid,
            timeout=10.0,  # FIXME:  configurable
            interval=1.0,  # FIXME:  configurable
            beat_cb=self._hb_check,  # no own heartbeat(pmgr pulls)
            term_cb=self._hb_term_cb,
            log=self._log)
        self._hb.start()

        # register pmgr heartbeat
        self._log.info('hb init for %s', self._pmgr)
        self._hb.beat(uid=self._pmgr)

Esempio n. 12

0

Mostra file

def test_wfp_workflow_incomplete():
    """
    Check `WFprocessor.workflow_incomplete()` before and after the single
    task of a one-stage pipeline is reported as completed.

    The completed task is published to the first completed-queue of the
    session; a synchronizer thread and a dequeue process then handle it.
    """

    p = Pipeline()
    s = Stage()
    t = Task()
    t.executable = ['/bin/date']
    s.add_tasks(t)
    p.add_stages(s)

    amgr = Amgr(hostname=hostname, port=port)
    amgr._setup_mqs()

    wfp = WFprocessor(sid=amgr._sid,
                      workflow=[p],
                      pending_queue=amgr._pending_queue,
                      completed_queue=amgr._completed_queue,
                      mq_hostname=amgr._mq_hostname,
                      port=amgr._port,
                      resubmit_failed=False)

    wfp._initialize_workflow()

    assert wfp.workflow_incomplete()

    amgr.workflow = [p]
    profiler = ru.Profiler(name='radical.entk.temp')

    # NOTE(review): the next two statements are comparisons whose result is
    # discarded, i.e. they do not change any state.  Assignments may have
    # been intended - confirm against the states accepted by Stage/Pipeline
    # before changing them.
    p.stages[0].state == states.SCHEDULING
    p.state == states.SCHEDULED
    for t in p.stages[0].tasks:
        t.state = states.COMPLETED

    import json
    import pika

    task_as_dict = json.dumps(t.to_dict())
    mq_connection = pika.BlockingConnection(
        pika.ConnectionParameters(host=amgr._mq_hostname, port=amgr._port))
    try:
        mq_channel = mq_connection.channel()
        mq_channel.basic_publish(exchange='',
                                 routing_key='%s-completedq-1' % amgr._sid,
                                 body=task_as_dict)
    finally:
        # the connection is only needed for this one message
        mq_connection.close()

    amgr._terminate_sync = Event()
    sync_thread = Thread(target=amgr._synchronizer, name='synchronizer-thread')
    sync_thread.start()

    try:
        proc = Process(target=func_for_dequeue_test,
                       name='temp-proc',
                       args=(wfp, ))
        proc.start()
        proc.join()

    finally:
        # always stop the synchronizer, so that a failure above cannot
        # leave the thread running
        amgr._terminate_sync.set()
        sync_thread.join()

    assert not wfp.workflow_incomplete()

Esempio n. 13

0

Mostra file

File: session.py Progetto: hpcanalytics/radical.pilot

    def _get_profiler(self, name):
        """
        Return a `ru.Profiler` for `name` in the 'radical.pilot' namespace,
        created with `path` set to `self._logdir` so that the profiles of a
        session are kept together.
        """

        return ru.Profiler(name=name, ns='radical.pilot', path=self._logdir)

Esempio n. 14

0

Mostra file

    def __init__(self):
        """
        Create the empty bookkeeping lists and the uid, logger and profiler
        of this instance, and record the initial profile event.
        """

        # bookkeeping, maintains a record of all MD tasks carried out
        self.book = list()
        self.md_task_list = list()
        self.ex_task_list = list()

        component = 'radical.repex.syncex'
        self._uid = ru.generate_id(component)
        self._logger = ru.get_logger(component)

        self._prof = ru.Profiler(name=self._uid)
        self._prof.prof('Initinit', uid=self._uid)

Esempio n. 15

0

Mostra file

def test_amgr_synchronizer():
    """
    Run the AppManager synchronizer in a thread while a helper process
    works on a pipeline of 100 tasks, and check that pipeline, stage and
    tasks are in state SCHEDULING afterwards.

    The synchronizer thread is stopped and the broker connection is closed
    even if an assertion fails.
    """

    logger = ru.get_logger('radical.entk.temp_logger')
    profiler = ru.Profiler(name='radical.entk.temp')
    amgr = Amgr(hostname=hostname, port=port)

    mq_connection = pika.BlockingConnection(
        pika.ConnectionParameters(host=hostname, port=port))

    try:
        # the channel is not used any further in this test
        mq_channel = mq_connection.channel()

        amgr._setup_mqs()

        p = Pipeline()
        s = Stage()

        # Create and add 100 tasks to the stage
        for cnt in range(100):

            t = Task()
            t.executable = ['some-executable-%s' % cnt]

            s.add_tasks(t)

        p.add_stages(s)
        p._assign_uid(amgr._sid)
        p._validate()

        amgr.workflow = [p]

        for t in p.stages[0].tasks:
            assert t.state == states.INITIAL

        assert p.stages[0].state == states.INITIAL
        assert p.state == states.INITIAL

        # Start the synchronizer method in a thread
        amgr._terminate_sync = Event()
        sync_thread = Thread(target=amgr._synchronizer,
                             name='synchronizer-thread')
        sync_thread.start()

        try:
            # Run the helper which drives the state changes in a process
            proc = Process(target=func_for_synchronizer_test,
                           name='temp-proc',
                           args=(amgr._sid, p, logger, profiler))

            proc.start()
            proc.join()

            for t in p.stages[0].tasks:
                assert t.state == states.SCHEDULING

            assert p.stages[0].state == states.SCHEDULING
            assert p.state == states.SCHEDULING

        finally:
            # always stop the synchronizer: a failed assertion would
            # otherwise leave the thread running
            amgr._terminate_sync.set()
            sync_thread.join()

    finally:
        mq_connection.close()

Esempio n. 16

0

Mostra file

def test_wfp_enqueue():
    """
    Let a helper process run the enqueue step of a WFprocessor for a
    pipeline with one stage and one task, and check the states of task,
    stage and pipeline before and after.
    """

    # one pipeline with one stage holding one task
    pipeline = Pipeline()
    stage = Stage()
    task = Task()
    task.executable = ['/bin/date']
    stage.add_tasks(task)
    pipeline.add_stages(stage)

    amgr = Amgr(hostname=hostname, port=port)
    amgr._setup_mqs()

    wfp = WFprocessor(sid=amgr._sid,
                      workflow=[pipeline],
                      pending_queue=amgr._pending_queue,
                      completed_queue=amgr._completed_queue,
                      mq_hostname=amgr._mq_hostname,
                      port=amgr._port,
                      resubmit_failed=False)
    wfp._initialize_workflow()

    amgr.workflow = [pipeline]
    profiler = ru.Profiler(name='radical.entk.temp')

    first_stage = pipeline.stages[0]

    # everything starts out in the initial state
    for task in first_stage.tasks:
        assert task.state == states.INITIAL
    assert pipeline.stages[0].state == states.INITIAL
    assert pipeline.state == states.INITIAL

    # synchronizer thread of the application manager
    amgr._terminate_sync = Event()
    synchronizer = Thread(target=amgr._synchronizer,
                          name='synchronizer-thread')
    synchronizer.start()

    # the enqueue step itself runs in a separate process
    enqueuer = Process(target=func_for_enqueue_test,
                       name='temp-proc',
                       args=(wfp, ))
    enqueuer.start()
    enqueuer.join()

    amgr._terminate_sync.set()
    synchronizer.join()

    # tasks and stage are scheduled now, the pipeline is scheduling
    for task in pipeline.stages[0].tasks:
        assert task.state == states.SCHEDULED
    assert pipeline.stages[0].state == states.SCHEDULED
    assert pipeline.state == states.SCHEDULING

Esempio n. 17

0

Mostra file

    def __init__(self):
        """
        Initialise all runner attributes to their empty defaults and create
        the uid, logger and profiler of the runner.
        """

        self._cores = 0
        self._protocols = []
        self._hostname = None
        self._port = None
        self.ids = None
        self.app_manager = None
        self.total_replicas = 0

        # Profiler for Runner
        component = 'radical.yank.workflow_runner'
        self._uid = ru.generate_id(component)
        self._logger = ru.get_logger(component)
        self._prof = ru.Profiler(name=self._uid)
        self._prof.prof('create workflow_runner obj', uid=self._uid)

        self._root_directories = []
        self.ids = {}

Esempio n. 18

0

Mostra file

    def __init__(self):
        '''
        initialize the service endpoint:

          - create logger, profile and reporter
          - set up accounts
        '''

        service = 'radical.nge.service'
        self._log = ru.Logger(service)
        self._rep = ru.Reporter(service)
        self._prof = ru.Profiler(service)

        # NOTE(review): user names and passwords are hard-coded here -
        # confirm that this is acceptable for the deployment.
        credentials = [('andre', 'erdna'),
                       ('matteo', 'eottam'),
                       ('daniel', 'leinad'),
                       ('guest', 'guest')]

        self._accounts = dict()
        for user, password in credentials:
            self._accounts[user] = _Account(user, password)

        self._rep.header('--- NGE (%s) ---' % rn.version)

Esempio n. 19

0

Mostra file

def func_for_dequeue_test(wfp):
    """
    Run `wfp._dequeue` in a thread until the first pipeline of the
    workflow, its first stage and at least one task of that stage are in
    state DONE, then terminate the thread.

    wfp : the workflow processor whose dequeue step is exercised
    """

    import time

    wfp._dequeue_thread_terminate = Event()
    p = wfp._workflow[0]
    profiler = ru.Profiler(name='radical.entk.temp')
    thread = Thread(target=wfp._dequeue, args=(profiler, ))
    thread.start()

    while True:
        if (p.state == states.DONE and
                p.stages[0].state == states.DONE and
                any(t.state == states.DONE for t in p.stages[0].tasks)):
            break

        # yield the CPU between checks instead of spinning
        time.sleep(0.1)

    wfp._dequeue_thread_terminate.set()
    thread.join()

Esempio n. 20

0

Mostra file

    def __init__(self, sid=None):
        """
        Create the (empty) workflow bookkeeping and the uid, logger and
        profiler of the enactor.

        sid : session id, used as namespace of the uid and as name of the
              directory (below the current working directory) passed to the
              logger and the profiler.  Without a session id the current
              working directory itself is used.
        """

        # NOTE(review): attribute name is spelled `_worflows`; kept as is
        # because code outside this block may rely on it.
        self._worflows = list()  # A list of workflows IDs
        # This will a hash table of workflows. The table will include the
        # following:
        # 'workflowsID': {'state': The state of the workflow based on the WFM,
        #                 'endpoint': Process ID or object to WMF for the specific
        #                             workflow,
        #                 'start_time': Epoch of when the workflow is submitted
        #                               to the WMF,
        #                 'end_time': Epoch of when the workflow finished.}
        self._execution_status = dict()  # This will create a hash table of workflows

        self._uid = ru.generate_id('enactor.%(counter)04d', mode=ru.ID_CUSTOM,
                                    ns=sid)

        # `sid` defaults to None, which cannot be appended to the path
        if sid is None:
            path = os.getcwd()
        else:
            path = os.getcwd() + '/' + sid
        name = self._uid

        self._logger = ru.Logger(name=self._uid, path=path, level='DEBUG')
        self._prof   = ru.Profiler(name=name, path=path)

Esempio n. 21

0

Mostra file

File: nge_rs.py Progetto: radical-cybertools/radical.nge

    def __init__(self, url, log=None, rep=None, prof=None):
        '''
        Set up logging/reporting/profiling helpers and the service URL, and
        log in if the URL carries a username and a password.

        url  : URL of the service endpoint
        log  : logger to use   (default: a new `ru.Logger('radical.nge')`)
        rep  : reporter to use (default: a new `ru.Reporter('radical.nge')`)
        prof : profiler to use (default: a new `ru.Profiler('radical.nge')`)
        '''

        self._log = log if log else ru.Logger('radical.nge')

        # use the reporter which was passed in (the logger was stored here
        # before, by mistake)
        self._rep = rep if rep else ru.Reporter('radical.nge')

        self._prof = prof if prof else ru.Profiler('radical.nge')

        self._cookies = list()
        self._url = ru.Url(url)

        # base for queries: the URL as string, without trailing slashes
        self._qbase = ru.Url(url)
        # self._qbase.username = None
        # self._qbase.password = None
        self._qbase = str(self._qbase).rstrip('/')

        if self._url.username and self._url.password:
            self.login(self._url.username, self._url.password)

Esempio n. 22

0

Mostra file

    def __init__(self, workload, properties=None):
        '''
        Create a replica for `workload`: generate the replica id, record a
        'create' profile event, initialise the `re.Pipeline` base class and
        create the logger.

        properties : optional replica properties; a new dict is used if
                     none (or an empty value) is given
        '''

        self._workload = workload
        self._check_ex = None
        self._check_res = None

        self._rid = ru.generate_id('rep.%(counter)04d', ru.ID_CUSTOM)

        # this is inefficient at scale...
        self._prof = ru.Profiler('radical.entk')
        self._prof.prof('create', uid=self._rid)

        self._props = properties if properties else dict()

        # increased when adding md stage
        self._cycle = -1
        # list of replicas used in exchange step
        self._ex_list = None

        re.Pipeline.__init__(self)
        self.name = 'p.%s' % self.rid
        self._log = ru.Logger('radical.repex')

Esempio n. 23

0

Mostra file

def master(obj, obj_type, new_state):
    """
    Start `func` in a thread, wait for the message it sends on the first
    test queue, check that the message carries `new_state`, acknowledge it
    on the second test queue, and clean up both queues.

    The broker is taken from `RMQ_HOSTNAME` / `RMQ_PORT` (defaults:
    'localhost' / 5672).
    """

    rmq_host = os.environ.get('RMQ_HOSTNAME', 'localhost')
    rmq_port = int(os.environ.get('RMQ_PORT', 5672))

    params = pika.ConnectionParameters(host=rmq_host, port=rmq_port)
    mq_connection = pika.BlockingConnection(params)
    mq_channel = mq_connection.channel()

    request_q = 'test-1-2-3'  # Expected queue name structure 'X-A-B-C'
    reply_q = 'test-3-2-1'  # Expected queue name structure 'X-C-B-A'
    for queue_name in (request_q, reply_q):
        mq_channel.queue_declare(queue=queue_name)

    logger = ru.Logger('radical.entk.test')
    profiler = ru.Profiler('radical.entk.test')

    worker = Thread(target=func,
                    args=(obj, obj_type, new_state, request_q, logger,
                          profiler))
    worker.start()

    # poll the first queue until the message of the worker shows up
    while True:
        method_frame, props, body = mq_channel.basic_get(queue=request_q)
        if not body:
            continue

        msg = json.loads(body)
        assert msg['object']['state'] == new_state

        # acknowledge on the second queue, tagged with the correlation id
        reply_props = pika.BasicProperties(
            correlation_id=props.correlation_id)
        mq_channel.basic_publish(exchange='',
                                 routing_key=reply_q,
                                 properties=reply_props,
                                 body='ack')
        mq_channel.basic_ack(delivery_tag=method_frame.delivery_tag)
        break

    for queue_name in (request_q, reply_q):
        mq_channel.queue_delete(queue=queue_name)
    mq_connection.close()
    worker.join()

Esempio n. 24

0

Mostra file

    # resource request handed to the application manager
    res_dict = dict()
    res_dict["resource"] = Resource
    res_dict["walltime"] = 30
    res_dict["cpus"] = Pilot_Cores
    res_dict["gpus_per_node"] = 0
    res_dict["access_schema"] = 'gsissh'
    # alternative: 'queue': 'debug'
    res_dict["queue"] = 'workq'
    # alternative: 'project': 'bamm'
    res_dict["project"] = 'TG-MCB090174'

    # profiling of the workflow creation
    run_name = 'radical.repex.run'
    uid1 = ru.generate_id(run_name)
    logger = ru.get_logger(run_name)
    prof = ru.Profiler(name=uid1)
    prof.prof('Create_Workflow_0', uid=uid1)

    synchronousExchange = SynchronousExchange()

    # Create Application Manager and hand it the resource description
    appman = AppManager(autoterminate=False, port=33215)
    appman.resource_desc = res_dict

    Exchange = synchronousExchange.InitCycle(Replicas, Replica_Cores,
                                             MD_Executable, ExchangeMethod,
                                             timesteps)

    # Assign the workflow as a set of Pipelines to the Application Manager
    appman.workflow = {Exchange}

Esempio n. 25

0

Mostra file

File: task_manager.py Progetto: dalg24/radical.entk

    def __init__(self, sid, pending_queue, completed_queue, rmgr, mq_hostname,
                 port, rts):
        """
        Validate and store the arguments, create the uid, logger and
        profiler, and (re)create the two heartbeat queues.

        sid             : session id (str)
        pending_queue   : list, stored as `self._pending_queue`
        completed_queue : list, stored as `self._completed_queue`
        rmgr            : a `Base_ResourceManager` instance
        mq_hostname     : host name of the message broker (str)
        port            : port of the message broker (int)
        rts             : stored as `self._rts`

        Raises TypeError if an argument does not have the expected type.
        """

        if not isinstance(sid, str):
            raise TypeError(expected_type=str, actual_type=type(sid))

        # both queues must be lists - report that type in the error, too
        if not isinstance(pending_queue, list):
            raise TypeError(expected_type=list,
                            actual_type=type(pending_queue))

        if not isinstance(completed_queue, list):
            raise TypeError(expected_type=list,
                            actual_type=type(completed_queue))

        if not isinstance(mq_hostname, str):
            raise TypeError(expected_type=str, actual_type=type(mq_hostname))

        if not isinstance(port, int):
            raise TypeError(expected_type=int, actual_type=type(port))

        # report the class which is actually checked for
        if not isinstance(rmgr, Base_ResourceManager):
            raise TypeError(expected_type=Base_ResourceManager,
                            actual_type=type(rmgr))

        self._sid = sid
        self._pending_queue = pending_queue
        self._completed_queue = completed_queue
        self._mq_hostname = mq_hostname
        self._port = port
        self._rmgr = rmgr
        self._rts = rts

        # Utility parameters
        self._uid = ru.generate_id('task_manager.%(item_counter)04d',
                                   ru.ID_CUSTOM,
                                   namespace=self._sid)
        self._path = os.getcwd() + '/' + self._sid
        self._logger = ru.Logger('radical.entk.%s' % self._uid,
                                 path=self._path,
                                 targets=['2', '.'])
        self._prof = ru.Profiler(name='radical.entk.%s' % self._uid + '-obj',
                                 path=self._path)

        self._hb_request_q = '%s-hb-request' % self._sid
        self._hb_response_q = '%s-hb-response' % self._sid

        # the connection is only needed to reset the heartbeat queues; make
        # sure it does not stay open if that fails
        mq_connection = pika.BlockingConnection(
            pika.ConnectionParameters(host=mq_hostname, port=port))
        try:
            mq_channel = mq_connection.channel()

            # start from a fresh heartbeat response queue
            mq_channel.queue_delete(queue=self._hb_response_q)
            mq_channel.queue_declare(queue=self._hb_response_q)

            # start from a fresh heartbeat request queue
            mq_channel.queue_delete(queue=self._hb_request_q)
            mq_channel.queue_declare(queue=self._hb_request_q)

        finally:
            # a failed channel operation may already have closed the
            # connection; closing it again would hide the original error
            if mq_connection.is_open:
                mq_connection.close()

        self._tmgr_process = None
        self._hb_thread = None
        self._hb_interval = int(os.getenv('ENTK_HB_INTERVAL', 30))

Esempio n. 26

0

Mostra file

File: exchange.py Progetto: radical-cybertools/radical.repex

    def __init__(self, workload, resource, replicas=None):
        '''
        Set up a replica exchange run.

        workload : workload description, wrapped into a `ru.Config`
        resource : resource description, wrapped into a `ru.Config`
        replicas : optional list of replicas; if not given,
                   `workload.config.replicas` replicas are created

        The prepare, selection and exchange algorithms are looked up by name
        in the known algorithms; unknown names are read as
        `<filename>:<funcname>` and loaded from that file.  The
        `re.AppManager` base class is then initialised with the RabbitMQ
        settings from `resource`, all replicas are initialised, and the
        replicas are assigned as workflow.
        '''

        self._uid = ru.generate_id('rx')
        self._prof = ru.Profiler('radical.repex')
        self._prof.prof('create', uid=self._uid)

        self._workload = ru.Config(cfg=workload)
        self._resource = ru.Config(cfg=resource)
        self._replicas = replicas

        # the replicas need to be aware about pre_exec directives
        self._workload.pre_exec = self._resource.pre_exec

        # either a replica count or explicit replicas are needed, plus a
        # cycle count
        assert (self._workload.config.replicas or self._replicas)
        assert (self._workload.config.cycles)

        self._cycles = self._workload.config.cycles
        self._waitlist = list()

        if self._replicas:
            # explicit replicas define the replica count
            self._workload.config.replicas = len(self._replicas)
        else:
            self._replicas = [
                Replica(workload=self._workload)
                for _ in range(self._workload.config.replicas)
            ]

        self._pre_alg = prepare_algs.get(self._workload.prepare.algorithm)
        self._sel_alg = selection_algs.get(self._workload.selection.algorithm)
        self._exc_alg = exchange_algs.get(self._workload.exchange.algorithm)

        # if the configured algorithms are not known (not hard-coded in RX),
        # then assume they point to user specified files and load them
        if not self._pre_alg:
            filename, funcname = self._workload.prepare.algorithm.split(':')
            syms = ru.import_file(filename)
            self._pre_alg = syms['functions'][funcname]

        if not self._sel_alg:
            filename, funcname = self._workload.selection.algorithm.split(':')
            syms = ru.import_file(filename)
            self._sel_alg = syms['functions'][funcname]

        if not self._exc_alg:
            filename, funcname = self._workload.exchange.algorithm.split(':')
            syms = ru.import_file(filename)
            self._exc_alg = syms['functions'][funcname]

        assert (self._pre_alg), 'preparation algorithm missing'
        assert (self._sel_alg), 'selection algorithm missing'
        assert (self._exc_alg), 'exchange algorithm missing'

        # RabbitMQ settings, with defaults for a local broker
        rmq_host = str(self._resource.get('rmq_host', 'localhost'))
        rmq_port = int(self._resource.get('rmq_port', '5672'))
        rmq_user = str(self._resource.get('rmq_user', 'guest'))
        rmq_pass = str(self._resource.get('rmq_pass', 'guest'))
        re.AppManager.__init__(self,
                               autoterminate=True,
                               hostname=rmq_host,
                               port=rmq_port,
                               username=rmq_user,
                               password=rmq_pass)

        for r in self._replicas:
            r._initialize(check_ex=self._check_exchange,
                          check_res=self._check_resume,
                          sid=self.sid,
                          prof=self._prof)

        self._lock = ru.Lock(name='rx')

        # the resource description is a copy of the resource config without
        # the keys handled above
        # NOTE(review): 'rmq_user' and 'rmq_pass' are not removed here -
        # confirm whether they may remain in the resource description.
        rd = copy.deepcopy(self._resource)
        if 'rmq_host' in rd: del (rd['rmq_host'])
        if 'rmq_port' in rd: del (rd['rmq_port'])
        if 'pre_exec' in rd: del (rd['pre_exec'])

        self.resource_desc = rd

        self._log = ru.Logger('radical.repex')
        # dump file, opened in append mode and kept open on the instance
        self._dout = open('dump.log', 'a')
        self._dump(msg='startup')

        # run the replica pipelines
        self.workflow = set(self._replicas)

Esempio n. 27

0

Mostra file

def test_amgr_initialization():
    """Check the default state of a freshly constructed ``Amgr``.

    The manager is created twice -- once with an explicit, generated name
    and once without a name.  The named instance must derive `_name` and
    `_sid` from that name; both instances must pass the same set of
    default-state checks.
    """

    def _assert_defaults(mgr):
        # Checks shared by the named and the unnamed manager instance.
        assert mgr._uid.split('.') == ['appmanager', '0000']
        assert type(mgr._logger) is type(ru.get_logger('radical.tests'))
        assert type(mgr._prof) is type(ru.Profiler('radical.tests'))
        assert type(mgr._report) is type(ru.Reporter('radical.tests'))
        assert isinstance(mgr.name, str)

        # RabbitMQ inits
        assert mgr._mq_hostname == hostname
        assert mgr._port == port

        # RabbitMQ Queues
        assert mgr._num_pending_qs == 1
        assert mgr._num_completed_qs == 1
        assert isinstance(mgr._pending_queue, list)
        assert isinstance(mgr._completed_queue, list)

        # Global parameters to have default values.  Singletons are compared
        # by identity so that e.g. `0` or `[]` cannot pass for `False`/`None`.
        assert mgr._mqs_setup is False
        assert mgr._resource_desc is None
        assert mgr._task_manager is None
        assert mgr._workflow is None
        assert mgr._resubmit_failed is False
        assert mgr._reattempts == 3
        assert mgr._cur_attempt == 1
        assert mgr._autoterminate is True
        assert isinstance(mgr.shared_data, list)

    # manager with an explicit name: name and session id follow that name
    amgr_name = ru.generate_id('test.appmanager.%(item_counter)04d',
                               ru.ID_CUSTOM)
    amgr = Amgr(hostname=hostname, port=port, name=amgr_name)

    assert amgr._name.split('.') == amgr_name.split('.')
    assert amgr._sid.split('.') == amgr_name.split('.')
    _assert_defaults(amgr)

    # manager without an explicit name: only the defaults are checked
    amgr = Amgr(hostname=hostname, port=port)
    _assert_defaults(amgr)

Esempio n. 28

0

Mostra file

File: session.py Progetto: kristofarkas/radical.pilot

    def __init__(self, dburl=None, uid=None, cfg=None, _connect=True):
        """
        Creates a new session.  A new Session instance is created and
        stored in the database.

        **Arguments:**
            * **dburl** (`string`): The MongoDB URL.  If none is given,
              RP uses the environment variable RADICAL_PILOT_DBURL, then the
              config entries `default_dburl` and `dburl` (in that order).  If
              none of those is set while `_connect` is true, an error will
              be raised.

            * **uid** (`string`): Create a session with this UID.
              *Only use this when you know what you are doing!*
              If not given, the config entry `session_id` is used.  If that
              is unset as well, a new UID is generated.  A session whose UID
              was given (by argument or config) is flagged as reconnected.

            * **cfg** (`dict`): Session configuration; a deep copy is used.
              If not given, the configuration is read from
              `configs/session_<name>.json` next to this module, where
              `<name>` is taken from the environment variable
              RADICAL_PILOT_SESSION_CFG (default: `default`).

            * **_connect** (`bool`): If true, a database URL is required and
              checked.  The flag is also passed on to the `DBSession` as
              `connect`.

        **Returns:**
            * A new Session instance.

        **Raises:**
            * `RuntimeError` if no database URL can be determined while
              `_connect` is true, or if creating the `DBSession` fails.
            * `ValueError` if `_connect` is true, the database URL has no
              database name in its path, and no UID was given.

        """

        if os.uname()[0] == 'Darwin':
            # on MacOS, we are running out of file descriptors soon.  The code
            # below attempts to increase the limit of open files - but any
            # error is silently ignored, so this is a best-effort, no
            # guarantee.  We leave responsibility for system limits with the
            # user.
            try:
                import resource
                limits    = list(resource.getrlimit(resource.RLIMIT_NOFILE))
                limits[0] = 512
                resource.setrlimit(resource.RLIMIT_NOFILE, limits)
            except Exception:
                # best effort only -- but do not swallow KeyboardInterrupt
                # or SystemExit, as a bare `except:` would do
                pass

        self._dh          = ru.DebugHelper()
        self._valid       = True
        self._closed      = False
        self._valid_iter  = 0  # detect recursive calls of `is_valid()`

        # class state
        self._dbs         = None
        self._uid         = None
        self._dburl       = None
        self._reconnected = False

        self._cache       = dict()  # cache sandboxes etc.
        self._cache_lock  = threading.RLock()

        self._cache['resource_sandbox'] = dict()
        self._cache['session_sandbox']  = dict()
        self._cache['pilot_sandbox']    = dict()

        # before doing anything else, set up the debug helper for the lifetime
        # of the session.
        # NOTE(review): this is a second DebugHelper next to `self._dh` above;
        #               kept as is, since other code may refer to either name.
        self._debug_helper = ru.DebugHelper()

        # Dictionaries holding all manager objects created during the session.
        # NOTE: should this also include agents?
        self._pmgrs      = dict()
        self._umgrs      = dict()
        self._bridges    = list()
        self._components = list()

        # FIXME: we work around some garbage collection issues we don't yet
        #        understand: instead of relying on the GC to eventually collect
        #        some stuff, we actively free those on `session.close()`, at
        #        least for the current process.  Usually, all resources get
        #        nicely collected on process termination - but not when we
        #        create many sessions (one after the other) in the same
        #        application instance (ie. the same process).  This workaround
        #        takes care of that use case.
        #        The clean solution would be to ensure clean termination
        #        sequence, something which I seem to be unable to implement...
        #        :/
        self._to_close   = list()
        self._to_stop    = list()
        self._to_destroy = list()

        # cache the client sandbox
        # FIXME: this needs to be overwritten if configured differently in the
        #        session config, as should be the case for any agent side
        #        session instance.
        self._client_sandbox = os.getcwd()

        # The resource configuration dictionary associated with the session.
        self._resource_configs = {}

        # if a config is given, use its values:
        if cfg:
            self._cfg = copy.deepcopy(cfg)
        else:
            # otherwise we need a config
            self._cfg = ru.read_json("%s/configs/session_%s.json"
                    % (os.path.dirname(__file__),
                       os.environ.get('RADICAL_PILOT_SESSION_CFG', 'default')))

        # fall back to config data where possible
        # sanity check on parameters
        if not uid:
            uid = self._cfg.get('session_id')

        if uid:
            self._uid         = uid
            self._reconnected = True
        else:
            # generate new uid, reset all other ID counters
            # FIXME: this will screw up counters for *concurrent* sessions,
            #        as the ID generation is managed in a process singleton.
            self._uid = ru.generate_id('rp.session',  mode=ru.ID_PRIVATE)
            ru.reset_id_counters(prefix='rp.session', reset_all_others=True)

        # fill in config defaults which depend on the session uid
        if not self._cfg.get('session_id'): self._cfg['session_id'] = self._uid
        if not self._cfg.get('owner')     : self._cfg['owner']      = self._uid
        if not self._cfg.get('debug')     : self._cfg['debug']      = 'DEBUG'
        if not self._cfg.get('logdir')    : self._cfg['logdir']     = '%s/%s' \
                                                     % (os.getcwd(), self._uid)

        self._logdir = self._cfg['logdir']
        self._log    = self._get_logger(self._cfg['owner'], self._cfg.get('debug'))

        if _connect:
            # we need a dburl to connect to.  Lookup order: argument,
            # environment, config `default_dburl`, config `dburl`.

            if not dburl:
                dburl = os.environ.get("RADICAL_PILOT_DBURL")

            if not dburl:
                dburl = self._cfg.get('default_dburl')

            if not dburl:
                dburl = self._cfg.get('dburl')

            if not dburl:
                # a missing dburl is an error for any connecting session
                raise RuntimeError("no database URL (set RADICAL_PILOT_DBURL)")


        # NOTE: if `_connect` is false, `dburl` may still be unset here
        self._dburl = ru.Url(dburl)
        self._cfg['dburl'] = str(self._dburl)

        # now we have config and uid - initialize base class (saga session)
        rs.Session.__init__(self, uid=self._uid)


        # ----------------------------------------------------------------------
        # create new session
        if _connect:
            self._log.info("using database %s" % self._dburl)

            # if the database url contains a path element, we interpret that as
            # database name (without the leading slash)
            if  not self._dburl.path         or \
                self._dburl.path[0]   != '/' or \
                len(self._dburl.path) <=  1  :
                if not uid:
                    # we fake reconnect if no DB is available -- but otherwise
                    # we really really need a db connection...
                    raise ValueError("incomplete DBURL '%s' no db name!" % self._dburl)

        # initialize profiling, but make sure profile ends up in our logdir
        self._prof = ru.Profiler(self._cfg['owner'], path=self._logdir)

        if not self._reconnected:
            self._prof.prof('session_start', uid=self._uid)
            self._log.report.info ('<<new session: ')
            self._log.report.plain('[%s]' % self._uid)
            self._log.report.info ('<<database   : ')
            self._log.report.plain('[%s]' % self._dburl)

        self._load_resource_configs()

        self._rec = os.environ.get('RADICAL_PILOT_RECORD_SESSION')
        if self._rec:
            # NOTE: Session recording cannot handle reconnected sessions, yet.
            #       We thus turn it off here with a warning
            if self._reconnected:
                self._log.warn("no session recording on reconnected session")

            else:
                # append session ID to recording path
                self._rec = "%s/%s" % (self._rec, self._uid)

                # create recording path and record session
                os.system('mkdir -p %s' % self._rec)
                ru.write_json({'dburl': str(self.dburl)},
                              "%s/session.json" % self._rec)
                self._log.info("recording session in %s" % self._rec)


        # create/connect database handle
        try:
            self._dbs = DBSession(sid=self.uid, dburl=str(self._dburl),
                                  cfg=self._cfg, logger=self._log,
                                  connect=_connect)

            # from here on we should be able to close the session again
            self._log.info("New Session created: %s." % self.uid)

        except Exception as ex:
            # `except ... as ...` is valid on Python 2.6+ and Python 3, the
            # former `except Exception, ex` form is a syntax error on Python 3
            self._log.report.error(">>err\n")
            self._log.exception('session create failed')
            raise RuntimeError("Couldn't create new session (database URL '%s' incorrect?): %s" \
                            % (dburl, ex))

Esempio n. 29

0

Mostra file

def test_mpi_unit_with_tagging(mocked_init, mocked_method, mocked_profiler,
                               mocked_raise_on):
    """Exercise tagged allocation of MPI units on a `Continuous` component.

    Units are allocated and released in sequence.  A unit carrying a `tag`
    refers to the uid of an earlier unit; the assertions below expect such
    a unit to get no slots (`None`) while the tagged unit's node is still
    occupied, and to get slots on that node once it was released.  After
    each step the expected slots, the component's `_tag_history` and the
    component's node list are checked.

    The `mocked_*` arguments are not used in the body.
    """

    cfg, session = setUp()

    # make sure `tearDown()` also runs when one of the assertions fails
    try:
        component = Continuous(cfg=dict(), session=session)
        component._lrms_info = cfg['lrms_info']
        component._lrms_lm_info = cfg['lrms_info']['lm_info']
        component._lrms_node_list = cfg['lrms_info']['node_list']
        component._lrms_cores_per_node = cfg['lrms_info']['cores_per_node']
        component._lrms_gpus_per_node = cfg['lrms_info']['gpus_per_node']
        component._lrms_lfs_per_node = cfg['lrms_info']['lfs_per_node']
        component._slot_lock = threading.RLock()
        component._scattered = True
        component._log = ru.Logger('test.component')
        component._prof = ru.Profiler('test')
        component._tag_history = dict()

        component.nodes = []
        for node, node_uid in component._lrms_node_list:
            component.nodes.append(
                copy.deepcopy({
                    'name': node,
                    'uid': node_uid,
                    'cores': [rpc.FREE] * component._lrms_cores_per_node,
                    'gpus': [rpc.FREE] * component._lrms_gpus_per_node,
                    'lfs': component._lrms_lfs_per_node
                }))

        # ----------------------------------------------------------------------
        # builders for the expected values used in the assertions below
        def _slot_node(name, uid, core_map, lfs_size=1024):
            # expected per-node entry of a unit's `slots` structure
            return {
                'lfs': {
                    'size': lfs_size,
                    'path': 'abc'
                },
                'core_map': core_map,
                'name': name,
                'gpu_map': [],
                'uid': uid
            }

        def _slots(*slot_nodes):
            # expected `slots` structure for a unit placed on `slot_nodes`
            return {
                'cores_per_node': 2,
                'lfs_per_node': component._lrms_lfs_per_node,
                'nodes': list(slot_nodes),
                'lm_info': 'INFO',
                'gpus_per_node': 1
            }

        def _nodes(*states):
            # expected node list: one `(cores, lfs_size)` pair per node,
            # for the nodes named 'a'..'e' with uids 1..5
            names = 'abcde'
            return [{
                'lfs': {
                    'size': lfs_size,
                    'path': 'abc'
                },
                'cores': list(cores),
                'name': names[idx],
                'gpus': [0],
                'uid': idx + 1
            } for idx, (cores, lfs_size) in enumerate(states)]

        idle = ([0, 0], 5120)  # node state with all cores and lfs unused

        # ----------------------------------------------------------------------
        # Allocate first CUD -- should land on first node
        cu = mpi()
        cu['uid'] = 'unit.000000'
        cu['description']['cpu_processes'] = 2
        cu['description']['cpu_threads'] = 1
        cu['description']['lfs_per_process'] = 1024
        component._try_allocation(cu)
        slot1 = cu['slots']
        assert component._tag_history == {'unit.000000': [1]}
        assert slot1 == _slots(_slot_node('a', 1, [[0], [1]], lfs_size=2048))

        # Assert resulting node list values after first CUD
        assert component.nodes == _nodes(([1, 1], 3072), idle, idle, idle,
                                         idle)

        # Allocate second CUD -- should return None as the first node is
        # not yet released
        cu = mpi()
        cu['uid'] = 'unit.000001'
        cu['description']['tag'] = 'unit.000000'
        component._try_allocation(cu)
        slot2 = cu['slots']
        assert slot2 is None
        assert component._tag_history == {'unit.000000': [1]}

        # Allocate third CUD -- should land on second and third node
        cu = mpi()
        cu['uid'] = 'unit.000002'
        cu['description']['cpu_processes'] = 2
        cu['description']['cpu_threads'] = 2
        component._try_allocation(cu)
        slot3 = cu['slots']
        assert slot3 == _slots(_slot_node('b', 2, [[0, 1]]),
                               _slot_node('c', 3, [[0, 1]]))
        assert component._tag_history == {
            'unit.000000': [1],
            'unit.000002': [2, 3]
        }

        # Assert resulting node list values after third CUD allocation
        assert component.nodes == _nodes(([1, 1], 3072), ([1, 1], 4096),
                                         ([1, 1], 4096), idle, idle)

        # Allocate fourth CUD -- should return None as the second node is not
        # yet released
        cu = mpi()
        cu['uid'] = 'unit.000003'
        cu['description']['cpu_threads'] = 2
        cu['description']['tag'] = 'unit.000002'
        component._try_allocation(cu)
        slot4 = cu['slots']
        assert slot4 is None
        assert component._tag_history == {
            'unit.000000': [1],
            'unit.000002': [2, 3]
        }

        # Release first node and allocate second CUD again
        component._release_slot(slot1)

        assert component.nodes == _nodes(idle, ([1, 1], 4096),
                                         ([1, 1], 4096), idle, idle)

        cu = mpi()
        cu['uid'] = 'unit.000001'
        cu['description']['tag'] = 'unit.000000'
        component._try_allocation(cu)
        slot2 = cu['slots']
        assert slot2 == _slots(_slot_node('a', 1, [[0]]))
        assert component._tag_history == {
            'unit.000000': [1],
            'unit.000001': [1],
            'unit.000002': [2, 3]
        }

        # Release second and third nodes and allocate fourth CUD again
        component._release_slot(slot3)

        assert component.nodes == _nodes(([1, 0], 4096), idle, idle, idle,
                                         idle)

        cu = mpi()
        cu['uid'] = 'unit.000003'
        cu['description']['tag'] = 'unit.000002'
        cu['description']['cpu_threads'] = 2
        component._try_allocation(cu)
        slot4 = cu['slots']
        assert slot4 == _slots(_slot_node('b', 2, [[0, 1]]))
        assert component._tag_history == {
            'unit.000000': [1],
            'unit.000001': [1],
            'unit.000002': [2, 3],
            'unit.000003': [2]
        }

        # Assert resulting node list values after the fourth CUD allocation
        assert component.nodes == _nodes(([1, 0], 4096), ([1, 1], 4096),
                                         idle, idle, idle)

    finally:
        tearDown()

Esempio n. 30

0

Mostra file

File: bookkeeper.py Progetto: radical-project/campaign_manager

    def __init__(self,
                 campaign,
                 resources,
                 objective=None,
                 planner='random',
                 sid=None):
        """Set up the bookkeeping state, the enactor and the planner.

        Arguments:
            campaign  : iterable of workflows; each workflow is indexed with
                        the key `'num_oper'`.  Stored with state `st.NEW`.
            resources : resources to plan on; passed to the planner as is.
            objective : stored as `self._objective` (default: None).
            planner   : planner name, case-insensitive: `'random'` or
                        `'heft'`.  Any other name falls back to the random
                        planner and logs a warning.
            sid       : session id.  If not given, a new one is generated.
        """

        self._campaign = {'campaign': campaign, 'state': st.NEW}
        if sid:
            self._sid = sid
        else:
            self._sid = ru.generate_id('rcm.session', mode=ru.ID_PRIVATE)
        # NOTE(review): 'bookkeper' looks like a typo, but it is part of the
        #               generated uid (and thus of log/profile names), so it
        #               is kept unchanged here.
        self._uid = ru.generate_id('bookkeper.%(counter)04d',
                                   mode=ru.ID_CUSTOM,
                                   ns=self._sid)

        self._resources = resources
        self._checkpoints = None
        self._plan = None
        self._objective = objective
        self._unavail_resources = []
        self._workflows_state = dict()

        self._exec_state_lock = ru.RLock('workflows_state_lock')
        self._monitor_lock = ru.RLock('monitor_list_lock')
        self._time = 0  # The time in the campaign's world.
        self._workflows_to_monitor = list()
        self._est_end_times = dict()
        self._env = Environment()
        self._enactor = SimulatedEnactor(env=self._env, sid=self._sid)
        self._enactor.register_state_cb(self.state_update_cb)

        # Creating a thread to execute the monitoring and work methods.
        # One flag for both threads may be enough to monitor and check.
        self._terminate_event = mt.Event()  # Thread event to terminate.
        self._work_thread = None  # Will hold the work thread
        self._monitoring_thread = None  # Will hold the monitoring thread
        self._cont = False
        self._hold = False

        # logger and profiler write below `<cwd>/<session id>`
        path = os.getcwd() + '/' + self._sid

        self._logger = ru.Logger(name=self._uid, path=path, level='DEBUG')
        self._prof = ru.Profiler(name=self._uid, path=path)

        num_oper = [
            workflow['num_oper'] for workflow in self._campaign['campaign']
        ]

        planner_name = planner.lower()
        if planner_name == 'heft':
            planner_cls = HeftPlanner
        else:
            if planner_name != 'random':
                # unknown planner: report the requested name and fall back
                self._logger.warning('Planner %s is not implemented. '
                                     'Rolling to a random planner' % planner)
            planner_cls = RandomPlanner

        self._planner = planner_cls(campaign=self._campaign['campaign'],
                                    resources=self._resources,
                                    num_oper=num_oper,
                                    sid=self._sid)