Example #1
0
    def __init__(self, command, logger=None):
        """
        The class constructor, which runs (execvpe) command in a separately
        forked process.  The new process will inherit the environment of the
        application process.

        :type  command: string or list of strings
        :param command: The command to run as a child, which is fed/drained
        via pty pipes.  A string is split into a list of strings with
        :func:`shlex.split`.

        :type  logger:  :class:`radical.utils.logger.Logger` instance
        :param logger:  logger stream to send status messages to.  If none is
        given, a 'radical.saga.pty' logger is created.

        :raises: `se.BadParameter` if command is neither a string nor a list,
        or if it is empty.  Any exception raised by `self.initialize()` is
        passed through `ptye.translate_exception()` and re-raised.
        """

        self.logger = logger or ru.Logger('radical.saga.pty')
        self.logger.debug("PTYProcess init %s" % self)

        # normalize a command string into an argument list
        if isinstance(command, basestring):
            command = shlex.split(command)

        if not isinstance(command, list):
            raise se.BadParameter("PTYProcess expects string or list command")

        if not command:
            raise se.BadParameter("PTYProcess expects non-empty command")

        self.rlock   = ru.RLock("pty process %s" % command)
        self.command = command      # list of strings to run()

        # data buffers
        self.cache = ""             # data cache
        self.tail  = ""             # tail of data cache for error messages

        # process handles
        self.child = None           # the process as created by subprocess.Popen
        self.ptyio = None           # the process' io channel, from pty.fork()

        # exit state
        self.exit_code   = None     # child died with code (may be revived)
        self.exit_signal = None     # child kill by signal (may be revived)

        # TODO: make recover_max a configure option.  This does not apply for
        #       recovers triggered by gc_timeout!
        self.recover_max      = 3
        self.recover_attempts = 0

        try:
            self.initialize()

        except Exception as e:
            raise ptye.translate_exception(e, "pty or process creation failed")
Example #2
0
def test_construct_command(mocked_init, mocked_configure):
    """
    Check that `Srun.construct_command` returns the expected command and hop
    for all 'srun' test cases which are not flagged as "RuntimeError".
    """

    component = Srun(name=None, cfg=None, session=None)

    component._log = ru.Logger('dummy')
    component._cfg = {}

    component.name = 'srun'
    component.launch_command = '/bin/srun'

    for unit, result in setUp('lm', 'srun'):

        # cases flagged as "RuntimeError" are not exercised by this test
        if result == "RuntimeError":
            continue

        command, hop = component.construct_command(unit, None)
        assert [command, hop] == result, unit['uid']
Example #3
0
    def __init__(self):
        """
        Set up the adaptor registry, load the engine, pty and registry
        configs, create the logger and finally load all adaptors.
        """

        # the engine manages cpis from adaptors
        self._adaptor_registry = {}

        # get engine, pty and registry configs
        self._cfg      = ru.Config('radical.saga.engine')
        self._pty_cfg  = ru.Config('radical.saga.pty')
        self._registry = ru.Config('radical.saga.registry')

        # initialize the logging, and log version (this is a singleton!)
        self._logger = ru.Logger('radical.saga')
        self._logger.info('radical.saga         version: %s' % version_detail)

        # adaptors are loaded last, once configs and logger are in place
        self._load_adaptors()
Example #4
0
    def test_unschedule_unit(self, mocked_init):

        # unschedule the unit of the second test case and compare the cores
        # and gpus of the first node against [0]
        component = Continuous(cfg=None, session=None)
        _, cfg = self.setUp()
        case = cfg[1]

        unit = {'description': case['unit']['description'],
                'slots': case['setup']['lm']['slots']}

        component.nodes = case['setup']['lm']['slots']['nodes']
        component._log = ru.Logger('dummy')
        component.unschedule_unit(unit)

        # NOTE(review): the except clause below re-raises inside
        #               `pytest.raises(AssertionError)`, so a failing
        #               assertion is swallowed and does not fail this test.
        try:
            self.assertEqual(component.nodes[0]['cores'], [0])
            self.assertEqual(component.nodes[0]['gpus'], [0])
        except:
            with pytest.raises(AssertionError):
                raise
Example #5
0
def test_construct_command(mocked_init, mocked_configure, mocked_raise_on):
    """
    Check `Yarn.construct_command` against the 'yarn' test cases: cases
    flagged as "RuntimeError" must raise, all others must return the expected
    command and hop.
    """

    test_cases = setUp('lm', 'yarn')
    component = Yarn(cfg=None, session=None, name=None)

    component._log = ru.Logger('dummy')
    component.launch_command = 'yarn'
    component.name = "YARN"

    for unit, result in test_cases:

        if result != "RuntimeError":
            command, hop = component.construct_command(unit, None)
            assert [command, hop] == result

        else:
            with pytest.raises(RuntimeError):
                command, hop = component.construct_command(unit, None)
Example #6
0
    def test_schedule_unit(self, mocked_init, mocked_configure,
                           mocked_find_resources):

        # schedule the unit of the second test case and compare the returned
        # slots against a hand-written reference
        _, cfg = self.setUp()
        component = Continuous(cfg=None, session=None)
        case = cfg[1]

        unit = {'uid': case['unit']['uid'],
                'description': case['unit']['description']}
        component.nodes = case['setup']['lm']['slots']['nodes']

        component._rm_cores_per_node = 32
        component._rm_gpus_per_node = 2
        component._rm_lfs_per_node = {"size": 0, "path": "/dev/null"}
        component._rm_mem_per_node = 1024
        component._rm_lm_info = 'INFO'
        component._log = ru.Logger('dummy')
        component._node_offset = 0

        expected_node = {'core_map': [[0]],
                         'gpu_map': [[0]],
                         'lfs': {'path': '/dev/null', 'size': 1234},
                         'mem': 128,
                         'name': 'a',
                         'uid': 1}

        test_slot = {'cores_per_node': 32,
                     'gpus_per_node': 2,
                     'lfs_per_node': {'path': '/dev/null', 'size': 0},
                     'lm_info': 'INFO',
                     'mem_per_node': 1024,
                     'nodes': [expected_node]}

        # NOTE(review): the except clause below re-raises inside
        #               `pytest.raises(AssertionError)`, so a failing
        #               assertion is swallowed and does not fail this test.
        try:
            self.assertEqual(component.schedule_unit(unit), test_slot)
        except:
            with pytest.raises(AssertionError):
                raise
Example #7
0
def test_construct_command(mocked_init, mocked_get_mpi_info, mocked_raise_on):
    """
    Check that `MPIRun.construct_command` returns the expected command and
    hop for every 'mpirun' test case.
    """

    test_cases = setUp('lm', 'mpirun')

    component = MPIRun(name=None, cfg=None, session=None)
    component.name = 'MPIRun'
    component._configure()

    # the attributes below are set after _configure(), so these values are
    # the ones in effect when the commands are constructed
    component._log = ru.Logger('dummy')
    component.mpi_flavor = None
    component.launch_command = 'mpirun'
    component._ccmrun = ''
    component._dplace = ''

    for unit, expected in test_cases:
        command, hop = component.construct_command(unit, None)
        assert [command, hop] == expected, unit['uid']
Example #8
0
    def __init__(self, adaptor_info, adaptor_options=None, expand_env=True):
        """
        Store the adaptor info, create lock and logger, and load the adaptor
        config.

        :param adaptor_info:    dict with (at least) the keys 'name' and
                                'schemas'
        :param adaptor_options: not used by this constructor
        :param expand_env:      passed as `expand` to `ru.Config`
        """

        # FIXME: engine is loading cfg already, here we load again...

        self._info    = adaptor_info
        self._name    = adaptor_info['name']
        self._schemas = adaptor_info['schemas']

        self._lock    = mt.RLock()
        self._logger  = ru.Logger('radical.saga.api')

        self._cfg     = ru.Config(module='radical.saga.adaptors',
                                  name=self._name, expand=expand_env)

        # adaptors are enabled unless the config says otherwise
        if 'enabled' not in self._cfg:
            self._cfg['enabled'] = True
Example #9
0
    def __init__(self, ensemble_size, exchange_size, window_size, md_cycles):
        """
        Store the exchange parameters, initialize the EnTK app manager, set
        the resource description and create `ensemble_size` replicas.

        The file 'dump.log' is opened in append mode and kept in
        `self._dout`.
        """

        self._en_size     = ensemble_size
        self._ex_size     = exchange_size
        self._cycles      = md_cycles
        self._window_size = window_size
        self._lock        = mt.Lock()
        self._log         = ru.Logger('radical.repex.exc')
        self._dout        = open('dump.log', 'a')

        re.AppManager.__init__(self, autoterminate=False, port=32769)

        self.resource_desc = {"resource"     : "xsede.bridges",
                              "walltime"     : 60,
                              "cpus"         : 28,
                              "gpus_per_node": 0,
                              "access_schema": "gsissh",
                              "queue"        : "RM",
                              "project"      : "mr560ip"}

        # alternative resource descriptions:
        #
        # self.resource_desc = {"resource"     : "xsede.comet_ssh",
        #                       "walltime"     : 30,
        #                       "cpus"         : 24,
        #                       "gpus_per_node": 0,
        #                       "access_schema": "gsissh",
        #                       "queue"        : "debug",
        #                       "project"      : "rut129"}
        #
        # self.resource_desc = {"resource" : 'local.localhost',
        #                       "walltime" : 30,
        #                       "cpus"     : 64}

        self._replicas = []
        self._waitlist = []

        # create the required number of replicas
        for rid in range(self._en_size):
            self._replicas.append(Replica(check_ex=self._check_exchange,
                                          check_res=self._check_resume,
                                          rid=rid))

        self._dump(msg='startup')
Example #10
0
    def __init__(self, uid=None):
        """
        Set up the api type, logger and id of this object.

        :param uid: id to use for this object.  If it is not given (or
                    empty), an id is generated from the api type via
                    `ru.generate_id` in `ru.ID_SIMPLE` mode.
        """

        # The attribute lookup itself may raise; any such error is treated as
        # "api type not set".  Only `Exception` is caught, so that
        # KeyboardInterrupt and SystemExit are not swallowed here.
        try:
            has_apitype = hasattr(self, '_apitype')

        except Exception:
            has_apitype = False

        if not has_apitype:
            self._apitype = self._get_apitype()

        self._logger = ru.Logger('radical.saga')

        if uid:
            self._id = uid
        else:
            self._id = ru.generate_id(self._apitype, mode=ru.ID_SIMPLE)
Example #11
0
    def __init__(self, adaptor_info, adaptor_options=None, expand_env=True):
        """
        Store the adaptor info, create a named lock and the logger, and load
        the adaptor config.

        :param adaptor_info:    dict with (at least) the keys 'name' and
                                'schemas'
        :param adaptor_options: not used by this constructor
        :param expand_env:      passed as `expand` to `ru.Config`
        """

        # FIXME: engine is loading cfg already, here we load again...

        self._info    = adaptor_info
        self._name    = adaptor_info['name']
        self._schemas = adaptor_info['schemas']

        self._lock    = ru.RLock(self._name)
        self._logger  = ru.Logger('radical.saga.api')

        # we need to expand later once we got env from the remote resource
        self._cfg     = ru.Config(module='radical.saga', name=self._name,
                                  expand=expand_env)

        # adaptors are enabled unless the config says otherwise
        if 'enabled' not in self._cfg:
            self._cfg['enabled'] = True
Example #12
0
    def __init__(self, check_ex, rid, sbox, cores, exe):
        """
        Store the replica settings, initialize the EnTK pipeline, name it and
        add the initial md stage.

        :param check_ex: is called when checking for exchange
        :param rid:      replica id
        :param sbox:     stored as `self._sbox`
        :param cores:    stored as `self._cores`
        :param exe:      stored as `self._exe`
        """

        self._check_ex = check_ex   # is called when checking for exchange
        self._rid      = rid
        self._sbox     = sbox
        self._cores    = cores
        self._exe      = exe
        self._cycle    = 0          # initial cycle

        # NOTE: there is no `check_res` callback (called when an exchange is
        #       done) in this version of the constructor

        self._log = ru.Logger('radical.repex.rep')

        # entk pipeline initialization
        re.Pipeline.__init__(self)
        self.name = 'p_%s' % self.rid

        # add an initial md stage
        self.add_md_stage()
Example #13
0
    def __init__(self, workload, properties):
        """
        Create a replica for `workload`.

        :param workload:   stored as `self._workload`
        :param properties: dict of replica properties.  If it contains the key
                           'rid', that value is used as replica id, otherwise
                           an id is generated via `ru.generate_id`.
        """

        self._workload  = workload
        self._check_ex  = None
        self._check_res = None

        if 'rid' not in properties:
            self._rid = ru.generate_id('replica..%(counter)06d', ru.ID_CUSTOM)
        else:
            self._rid = properties['rid']

        self._props   = properties
        self._cycle   = 0       # initial cycle
        self._ex_list = None    # list of replicas used in exchange step

        # entk pipeline initialization
        re.Pipeline.__init__(self)
        self.name = 'p_%s' % self.rid
        self._log = ru.Logger('radical.repex')
Example #14
0
    def __init__(self, check_ex, check_res, rid, sbox, cores, exe):
        """
        Store the replica settings and callbacks, initialize the EnTK
        pipeline, name it, create the logger and add the initial md stage.
        """

        self._check_ex  = check_ex
        self._check_res = check_res
        self._rid       = rid
        self._sbox      = sbox
        self._cores     = cores
        self._exe       = exe

        self._cycle     = 0       # initial cycle
        self._ex_list   = None    # list of replicas used in exchange step

        # entk pipeline initialization
        re.Pipeline.__init__(self)
        self.name = 'p_%s' % self.rid
        self._log = ru.Logger('radical.repex.rep')

        # add an initial md stage
        self.add_md_stage()
def test_plan4(mocked_init, mocked_raise_on):
    """
    Plan a single-workflow campaign over three resources with `HeftPlanner`
    and compare the result against the expected plan.
    """

    resources = [{'id': 1, 'performance': 523},
                 {'id': 2, 'performance': 487},
                 {'id': 3, 'performance': 96}]

    actual_plan = [('W1', {'id': 1, 'performance': 523}, 0, 102.5793499043977)]

    planner = HeftPlanner(None, None, None)
    planner._campaign = ['W1']
    planner._resources = resources
    planner._num_oper = [53649]
    planner._logger = ru.Logger('dummy')

    est_plan = planner.plan()

    assert est_plan == actual_plan
    def __init__(self, size, exchange_size, window_size, min_cycles, min_temp,
                 max_temp, timesteps, basename, executable, cores):
        """
        Store the exchange parameters, initialize the EnTK app manager, set
        the resource description and create `size` replicas.
        """

        self._size          = size
        self._exchange_size = exchange_size
        self._window_size   = window_size
        self._min_cycles    = min_cycles
        self._min_temp      = min_temp
        self._max_temp      = max_temp
        self._timesteps     = timesteps
        self._basename      = basename
        self._executable    = executable
        self._cores         = cores

        self._log = ru.Logger('radical.repex.exc')

        # initialize the entk app manager
        re.AppManager.__init__(self, autoterminate=False, port=RMQ_PORT)
        self.resource_desc = {"resource": 'local.localhost',
                              "walltime": 30,
                              "cpus"    : 4}

        # this is ugly
        self._sbox = '$Pipeline_untarPipe_Stage_untarStg_Task_untarTsk'

        self._cnt           = 0    # count exchanges
        self._replicas      = []
        self._waitlist      = []
        self._exchange_list = []   # sublist of self._waitlist that performs
                                   # an exchange

        # create the required number of replicas
        for rid in range(self._size):
            self._replicas.append(Replica(check_ex=self._check_exchange,
                                          check_res=self._check_resume,
                                          rid=rid,
                                          sbox=self._sbox,
                                          cores=self._cores,
                                          exe=self._executable))
Example #17
0
def test_construct_command(mocked_init, mocked_configure, mocked_raise_on):
    """
    Check `RSH.construct_command` against the 'rsh' test cases: cases flagged
    as "ValueError" or "RuntimeError" must raise the respective exception, all
    others must return the expected command and hop.
    """

    test_cases = setUp('lm', 'rsh')

    component = RSH(name=None, cfg=None, session=None)
    component._log = ru.Logger('dummy')
    component.name = 'RSH'
    component.mpi_flavor = None
    component.launch_command = 'rsh'

    for unit, result in test_cases:

        if result == "ValueError":
            # the ValueError cases are run without a launch script hop
            with pytest.raises(ValueError):
                command, hop = component.construct_command(unit, None)
            continue

        if result == "RuntimeError":
            with pytest.raises(RuntimeError):
                command, hop = component.construct_command(unit, 1)
            continue

        command, hop = component.construct_command(unit, 1)
        assert [command, hop] == result
Example #18
0
    def test_handle_cuda(self, mocked_init):

        # run `_handle_cuda` for each (setup, unit, result) triple of the
        # 'handle_cuda' test data: a 'ValueError' result must raise, any
        # other result is compared against CUDA_VISIBLE_DEVICES in the
        # unit's environment
        tests   = self.setUp()
        setups  = tests['handle_cuda']['setup']
        units   = tests['handle_cuda']['unit']
        results = tests['handle_cuda']['results']

        component = AgentSchedulingComponent()
        component._log = ru.Logger('dummy')

        for setup, unit, result in zip(setups, units, results):

            component._cfg = setup

            if result == 'ValueError':
                with pytest.raises(ValueError):
                    component._handle_cuda(unit)
                continue

            component._handle_cuda(unit)
            devices = unit['description']['environment']['CUDA_VISIBLE_DEVICES']
            self.assertEqual(devices, result)
Example #19
0
def translate_exception(e, msg=None):
    """
    In many cases, we should be able to roughly infer the exception cause
    from the error message -- this is centrally done in this method.  If
    possible, it will return a new exception with a more concise error
    message and appropriate exception type.

    :param e:   the exception to translate
    :param msg: optional context, appended in parentheses to the message of
                the translated exception
    :return:    `e` itself if it is not a `se.NoSuccess` (or not a saga
                exception at all), or if no known pattern is found in its
                message; otherwise a new saga exception.

    The patterns are matched against the lower-cased message, in the order
    listed below -- the first match wins.
    """

    if not isinstance(e, se.SagaException):
        # we do not touch non-saga exceptions
        return e

    if not isinstance(e, se.NoSuccess):
        # this seems to have a specific cause already, leave it alone
        return e

    ru.Logger('radical.saga.pty').debug(traceback.format_exc())

    cmsg = e._plain_message

    if msg:
        cmsg = "%s (%s)" % (cmsg, msg)

    # all patterns below must be lower case, as they are matched against the
    # lower-cased message
    lmsg = cmsg.lower()

    if 'could not resolve hostname' in lmsg:
        e = se.BadParameter(cmsg)
    elif 'connection timed out' in lmsg:
        e = se.BadParameter(cmsg)
    elif 'connection refused' in lmsg:
        e = se.BadParameter(cmsg)
    elif 'auth' in lmsg:
        e = se.AuthorizationFailed(cmsg)
    elif 'pass' in lmsg:
        e = se.AuthenticationFailed(cmsg)
    elif 'denied' in lmsg:
        e = se.PermissionDenied(cmsg)
    elif 'man-in-the-middle' in lmsg:
        e = se.AuthenticationFailed("ssh key mismatch: %s" % cmsg)
    elif 'ssh_exchange_identification' in lmsg:
        e = se.AuthenticationFailed("too many login attempts: %s" % cmsg)
    elif 'shared connection' in lmsg:
        e = se.NoSuccess("Insufficient system resources: %s" % cmsg)
    elif 'pty allocation' in lmsg:
        e = se.NoSuccess("Insufficient system resources: %s" % cmsg)
    elif 'connection to master closed' in lmsg:
        # this pattern used to contain an upper case 'C' and thus never
        # matched the lower-cased message
        e = se.NoSuccess("Connection closed by system: %s" % cmsg)

    return e
Example #20
0
def test_configure(mocked_init, mocked_init_continuous, mocked_subscriber):
    '''
    Test 1 check configuration setup: prepare a `ContinuousOrdered` component
    by hand and make sure `_configure()` runs through.
    '''

    cfg = {}

    component = ContinuousOrdered(cfg=None, session=None)
    component._trigger_state  = rps.UMGR_STAGING_OUTPUT_PENDING
    component._lock           = mt.RLock()
    component._cfg            = cfg
    component._ru_terminating = True
    component._uid            = None
    component._log            = ru.Logger('dummy')
    component._units          = {}
    component._unordered      = []
    component._ns             = {}
    component._ns_init        = {'current': 0}
    component._order_init     = {'size': 0, 'uids': [], 'done': []}

    component._configure()
Example #21
0
    def __init__(self):
        '''
        initialize the service endpoint:

          - create logger, reporter and profiler
          - set up accounts
          - report the service header
        '''

        self._log  = ru.Logger('radical.nge.service')
        self._rep  = ru.Reporter('radical.nge.service')
        self._prof = ru.Profiler('radical.nge.service')

        # NOTE(review): the account data is hard-coded here; the second
        #               argument looks like a plain-text password -- confirm,
        #               and consider moving it out of the source code.
        known_accounts = [('andre',  'erdna'),
                          ('matteo', 'eottam'),
                          ('daniel', 'leinad'),
                          ('guest',  'guest')]

        self._accounts = {}
        for name, secret in known_accounts:
            self._accounts[name] = _Account(name, secret)

        self._rep.header('--- NGE (%s) ---' % rn.version)
Example #22
0
    def __init__(self, ensemble_size, exchange_size, window_size, md_cycles,
                 min_temp, max_temp, timesteps, basename, executable, cores):
        """
        Store the exchange parameters, initialize the EnTK app manager, set
        the resource description and create `ensemble_size` replicas.

        The file 'dump.log' is opened in append mode and kept in
        `self._dout`.
        """

        self._en_size     = ensemble_size
        self._ex_size     = exchange_size
        self._window_size = window_size
        self._cycles      = md_cycles
        self._min_temp    = min_temp
        self._max_temp    = max_temp
        self._timesteps   = timesteps
        self._basename    = basename
        self._executable  = executable
        self._cores       = cores
        self._lock        = mt.Lock()
        self._log         = ru.Logger('radical.repex.exc')
        self._dout        = open('dump.log', 'a')

        # initialize the entk app manager
        re.AppManager.__init__(self, autoterminate=False, port=5672)
        self.resource_desc = {"resource": 'local.localhost',
                              "walltime": 30,
                              "cpus"    : 32}

        self._sbox     = '$Pipeline_untarPipe_Stage_untarStg_Task_untarTsk'
        self._cnt      = 0     # count exchanges
        self._replicas = []
        self._waitlist = []

        # create the required number of replicas
        for rid in range(self._en_size):
            self._replicas.append(Replica(check_ex=self._check_exchange,
                                          check_res=self._check_resume,
                                          rid=rid,
                                          sbox=self._sbox,
                                          cores=self._cores,
                                          exe=self._executable))

        self._dump(msg='startup')
Example #23
0
def test_construct_command(mocked_init, mocked_configure):
    """
    Check `PRTE.construct_command` against the 'prte' test cases: cases
    flagged as "RuntimeError" must raise, all others must return the expected
    command and hop.
    """

    test_cases = setUp('lm', 'prte')

    component = PRTE(name=None, cfg=None, session=None)

    component.name = 'prte'
    component._verbose = None
    component._log = ru.Logger('dummy')
    component.launch_command = 'prun'

    for unit, result in test_cases:

        if result != "RuntimeError":
            command, hop = component.construct_command(unit, None)
            assert [command, hop] == result, unit['uid']

        else:
            with pytest.raises(RuntimeError):
                command, hop = component.construct_command(unit, None)
    def test_orte_nompi_construct(self, mocked_init, mocked_raise_on):

        # construct the ORTE launch command for self._cu and compare it
        # against the expected command line
        launch_method = ORTE(cfg={'Testing'}, session=self._session)
        launch_method.launch_command = 'orterun'
        launch_method._log = ru.Logger('dummy')

        orte_cmd, _ = launch_method.construct_command(self._cu,
                                                      launch_script_hop=1)

        # NOTE: the environment is changed only after the command was
        #       constructed, and is not restored afterwards
        os.environ['LD_LIBRARY_PATH'] = ''
        os.environ['PYTHONPATH'] = ''

        exported = [var for var in ['LD_LIBRARY_PATH', 'PATH', 'PYTHONPATH']
                    if var in os.environ]
        env_string = ' '.join(['-x "%s"' % var for var in exported])

        expected = ('orterun  --hnp "test" -np 1 --bind-to none -host 1 %s  '
                    'test_exe' % env_string)

        self.assertTrue(orte_cmd == expected)
Example #25
0
    def __init__(self, cfg_path):
        """
        Create uid and logger, reset all settings to None, then load and
        parse the config file and set up the messaging system.

        :param cfg_path: path of the config file to open, `load` and hand to
                         `self._parse_cfg`
        """

        self._uid    = ru.generate_id('wlms', mode=ru.ID_UNIQUE)
        self._logger = ru.Logger('radical.wlms.%s' % self._uid)

        # criteria and endpoint settings
        self._ts_criteria = None
        self._rs_criteria = None
        self._sb_criteria = None
        self._tb_criteria = None
        self._host        = None
        self._port        = None

        # workload and resource settings
        self._workload      = None
        self._resource      = None
        self._early_binding = None

        # the config is parsed while the file is still open
        with open(cfg_path, 'r') as stream:
            self._parse_cfg(load(stream))

        self._setup_msg_sys()
Example #26
0
    def __init__(self, sid=None):
        """
        Create the workflow bookkeeping structures, the uid, the logger and
        the profiler of this object.

        :param sid: used as namespace (`ns`) for the generated uid, and as
                    name of the subdirectory (below the current working
                    directory) which is passed as `path` to logger and
                    profiler.  If it is None, the current working directory
                    itself is used as path.
        """

        self._worflows = list()  # A list of workflows IDs

        # This will a hash table of workflows. The table will include the
        # following:
        # 'workflowsID': {'state': The state of the workflow based on the WFM,
        #                 'endpoint': Process ID or object to WMF for the specific
        #                             workflow,
        #                 'start_time': Epoch of when the workflow is submitted
        #                               to the WMF,
        #                 'end_time': Epoch of when the workflow finished.}
        self._execution_status = dict()

        self._uid = ru.generate_id('enactor.%(counter)04d', mode=ru.ID_CUSTOM,
                                    ns=sid)

        # `sid` defaults to None, which cannot be concatenated to a string --
        # fall back to the current working directory in that case
        path = os.getcwd()
        if sid is not None:
            path = path + '/' + sid

        name = self._uid

        self._logger = ru.Logger(name=self._uid, path=path, level='DEBUG')
        self._prof   = ru.Profiler(name=name, path=path)
Example #27
0
    def __init__(self, url, log=None, rep=None, prof=None):
        """
        Set up logger, reporter and profiler, derive the query base from
        `url`, and log in if the url carries credentials.

        :param url:  service url.  If it contains both a username and a
                     password, `self.login()` is called with them.
        :param log:  logger to use, defaults to a 'radical.nge' logger
        :param rep:  reporter to use, defaults to a 'radical.nge' reporter
        :param prof: profiler to use, defaults to a 'radical.nge' profiler
        """

        self._log  = log  if log  else ru.Logger('radical.nge')

        # a given reporter was previously replaced by the *logger*
        self._rep  = rep  if rep  else ru.Reporter('radical.nge')

        self._prof = prof if prof else ru.Profiler('radical.nge')

        self._cookies = list()
        self._url     = ru.Url(url)

        # the query base is the url as string, without trailing slashes
        self._qbase = ru.Url(url)
        # self._qbase.username = None
        # self._qbase.password = None
        self._qbase = str(self._qbase).rstrip('/')

        if self._url.username and self._url.password:
            self.login(self._url.username, self._url.password)
Example #28
0
    def test_try_allocation(self, mocked_init, mocked_schedule_unit,
                            mocked_handle_cuda, mocked_change_slot_states):

        # run `_try_allocation` on a hand-made unit and compare the slots
        # attached to the unit against a reference
        component = AgentSchedulingComponent()
        component._log = ru.Logger('dummy')
        component._allocate_slot = mock.Mock(
            side_effect=[None, {'slot': 'test_slot'}])
        component._prof = mock.Mock()
        component._prof.prof = mock.Mock(return_value=True)
        component._wait_pool = []
        component._wait_lock = threading.RLock()
        component._slot_lock = threading.RLock()

        unit = {'description': {'note': 'this is a unit'}, 'uid': 'test'}
        component._try_allocation(unit=unit)

        expected_node = {"lfs": {"path": "/dev/null", "size": 0},
                         "core_map": [[0]],
                         "name": "a",
                         "gpu_map": None,
                         "uid": 1,
                         "mem": None}

        expected_slots = {"cores_per_node": 16,
                          "lfs_per_node": {"size": 0, "path": "/dev/null"},
                          "nodes": [expected_node],
                          "lm_info": "INFO",
                          "gpus_per_node": 6}

        self.assertEqual(unit['slots'], expected_slots)
Example #29
0
def master(obj, obj_type, new_state):
    """
    Run `func` in a thread and play the receiving side of its state update.

    Two queues are declared on the RabbitMQ host given by the environment
    variables RMQ_HOSTNAME (default 'localhost') and RMQ_PORT (default
    5672).  The function then polls the first queue until a message arrives,
    asserts that the message reports `new_state`, and publishes 'ack' on the
    second queue, using the correlation id of the received message.  Finally
    both queues are deleted, the connection is closed and the thread joined.

    :param obj:       passed through to `func`
    :param obj_type:  passed through to `func`
    :param new_state: passed through to `func`, and expected as
                      `msg['object']['state']` of the received message
    """

    hostname = os.environ.get('RMQ_HOSTNAME', 'localhost')
    port = int(os.environ.get('RMQ_PORT', 5672))

    mq_connection = pika.BlockingConnection(
        pika.ConnectionParameters(host=hostname, port=port))
    mq_channel = mq_connection.channel()

    queue1 = 'test-1-2-3'  # Expected queue name structure 'X-A-B-C'
    queue2 = 'test-3-2-1'  # Expected queue name structure 'X-C-B-A'
    mq_channel.queue_declare(queue=queue1)
    mq_channel.queue_declare(queue=queue2)

    logger = ru.Logger('radical.entk.test')
    profiler = ru.Profiler('radical.entk.test')

    # `func` only gets the name of the first queue
    thread1 = Thread(target=func,
                     args=(obj, obj_type, new_state, queue1, logger, profiler))
    thread1.start()

    # poll the first queue (without any delay between polls) until a message
    # with a body arrives
    while True:
        method_frame, props, body = mq_channel.basic_get(queue=queue1)
        if body:

            msg = json.loads(body)
            assert msg['object']['state'] == new_state

            # acknowledge on the second queue, with the same correlation id
            mq_channel.basic_publish(exchange='',
                                     routing_key=queue2,
                                     properties=pika.BasicProperties(
                                         correlation_id=props.correlation_id),
                                     body='ack')
            mq_channel.basic_ack(delivery_tag=method_frame.delivery_tag)
            break

    # NOTE(review): if the assertion above fails, this cleanup is skipped and
    #               the thread is not joined.
    mq_channel.queue_delete(queue=queue1)
    mq_channel.queue_delete(queue=queue2)
    mq_connection.close()
    thread1.join()
Example #30
0
def test_construct_command(mocked_init, mocked_configure, mocked_raise_on):
    """
    Check `IBRun.construct_command` against the 'ibrun' test cases: cases
    flagged as 'RuntimeError' or 'AssertionError' must raise the respective
    exception, all others must return the expected command and hop.
    """

    component = IBRun(name=None, cfg=None, session=None)

    component._log = ru.Logger('dummy')
    component._cfg = {'cores_per_node': 0}
    component._node_list = [['node1'], ['node2']]

    component.name = 'IBRun'
    component.launch_command = 'ibrun'

    for unit, result in setUp('lm', 'ibrun'):

        if result == 'RuntimeError':
            with pytest.raises(RuntimeError):
                command, hop = component.construct_command(unit, None)
            continue

        if result == 'AssertionError':
            with pytest.raises(AssertionError):
                command, hop = component.construct_command(unit, None)
            continue

        command, hop = component.construct_command(unit, None)
        assert [command, hop] == result, unit['uid']