def test__pilot_errors(self):
    """Check that pilots which cannot start end up in the FAILED state.

    Two pilots are submitted one after the other: the first with a sandbox
    path that does not exist, the second requesting an absurd core count.
    """
    session = rp.Session()

    try:
        manager = rp.PilotManager(session=session)

        # case 1: unusable sandbox directory
        bad_sandbox = rp.ComputePilotDescription()
        bad_sandbox.resource = "local.localhost"
        bad_sandbox.cores = 1
        bad_sandbox.runtime = 1
        bad_sandbox.sandbox = "/non-/existing/directory..."
        bad_sandbox.cleanup = True

        pilot = manager.submit_pilots(descriptions=bad_sandbox)
        pilot.wait(timeout=300)
        assert pilot.state == rp.FAILED, \
            "State is '%s' instead of 'Failed'." % pilot.state

        # case 2: core count no local machine can satisfy
        too_big = rp.ComputePilotDescription()
        too_big.resource = "local.localhost"
        too_big.cores = 100000000000  # This should fail - at least in 2014 ;-)
        too_big.runtime = 1
        too_big.sandbox = "/tmp/rp.sandbox.unittests"
        too_big.cleanup = True

        pilot = manager.submit_pilots(descriptions=too_big)
        pilot.wait(timeout=300)
        assert pilot.state == rp.FAILED, \
            "state should be %s and not %s" % (rp.FAILED, pilot.state)

    finally:
        session.close()
def test__pilotmanager_wait(self):
    """Test if wait_pilots() waits until all (2) pilots have reached 'DONE'.

    Two local pilots with runtimes 1 and 2 are submitted. After
    ``wait_pilots`` returns, each pilot must be in state DONE and must
    carry both a start and a stop time.
    """
    session = rp.Session()

    # try/finally: the session must be closed even when an assertion fails.
    try:
        pmgr = rp.PilotManager(session=session)

        cpd1 = rp.ComputePilotDescription()
        cpd1.resource = "local.localhost"
        cpd1.cores = 1
        cpd1.runtime = 1
        cpd1.sandbox = "/tmp/rp.sandbox.unittests"
        cpd1.cleanup = True

        cpd2 = rp.ComputePilotDescription()
        cpd2.resource = "local.localhost"
        cpd2.cores = 1
        cpd2.runtime = 2
        cpd2.sandbox = "/tmp/rp.sandbox.unittests"
        cpd2.cleanup = True

        pilots = pmgr.submit_pilots([cpd1, cpd2])
        pmgr.wait_pilots(timeout=300)

        for pilot in pilots:
            assert pilot.state == rp.DONE, \
                "Expected state 'Done' but state is %s" % pilot.state
            assert pilot.stop_time is not None
            assert pilot.start_time is not None

    finally:
        session.close()
def test__unitmanager_pilot_assoc(self):
    """ Test if unit manager <-> pilot association works as expected.

    Covers adding a pilot, adding the same pilot twice, removing it, and
    finally adding/removing two further pilots.
    """
    session = rp.Session()

    # try/finally: the session must be closed even when an assertion fails.
    try:
        pm = rp.PilotManager(session=session)

        cpd = rp.ComputePilotDescription()
        cpd.resource = "local.localhost"
        cpd.cores = 1
        cpd.runtime = 1
        cpd.sandbox = "/tmp/rp.sandbox.unittests"
        cpd.cleanup = True

        p1 = pm.submit_pilots(descriptions=cpd)

        um = rp.UnitManager(session=session, scheduler='round_robin')
        assert um.list_pilots() == [], "Wrong list of pilots"

        um.add_pilots(p1)
        assert um.list_pilots() == [p1.uid], "Wrong list of pilots"

        # adding the same pilot twice should be ignored
        um.add_pilots(p1)
        assert um.list_pilots() == [p1.uid], "Wrong list of pilots"

        um.remove_pilots(p1.uid)
        assert um.list_pilots() == [], "Wrong list of pilots"

        pilot_list = []
        for _ in range(2):
            cpd = rp.ComputePilotDescription()
            cpd.resource = "local.localhost"
            cpd.cores = 1
            cpd.runtime = 1
            cpd.sandbox = "/tmp/rp.sandbox.unittests"
            cpd.cleanup = True
            p = pm.submit_pilots(descriptions=cpd)
            um.add_pilots(p)
            pilot_list.append(p)

        pl = um.list_pilots()
        assert len(pl) == 2, "Wrong number of associated pilots"

        for pilot in pilot_list:
            # list_pilots() returns uids, so compare the uid against the
            # list reported by the unit manager (the previous check compared
            # pilot_list with itself and could never fail).
            assert pilot.uid in pl, "Unknown pilot in list"
            um.remove_pilots(pilot.uid)

        assert um.list_pilots() == [], "Wrong list of pilots"

    finally:
        session.close()
def rp_setup_state(request):
    """Fixture-style setup: one local pilot attached to a unit manager.

    Creates a session, a pilot manager and a direct-submission unit manager
    (4 input and 4 output transfer workers), submits a single-core
    ``local.localhost`` pilot and registers ``pilot_state_cb`` on it.

    Args:
        request: object providing ``addfinalizer``; the registered finalizer
            closes the session.

    Returns:
        Tuple ``(pilot, pmgr, umgr)``.
    """
    session = rp.Session(database_url=db_url)

    try:
        pmgr = rp.PilotManager(session=session)

        umgr = rp.UnitManager(session=session,
                              scheduler=rp.SCHED_DIRECT_SUBMISSION,
                              output_transfer_workers=4,
                              input_transfer_workers=4)

        pdesc = rp.ComputePilotDescription()
        pdesc.resource = "local.localhost"
        pdesc.runtime = 20
        pdesc.cores = 1
        pdesc.cleanup = True

        pilot = pmgr.submit_pilots(pdesc)
        pilot.register_callback(pilot_state_cb)

        umgr.add_pilots(pilot)

    except Exception:
        print('test failed')
        # The finalizer is not registered yet, so nothing else would close
        # the session if setup fails.
        session.close()
        raise

    def fin():
        print('closing session')
        session.close()

    request.addfinalizer(fin)

    return pilot, pmgr, umgr
def test_03_multiple_pilots(self):
    """Test multiple pilots

    One pilot is requested per entry in ``resources``; ``self.n`` units
    running '/bin/date' are submitted and all of them must end up DONE.
    """
    # Have to hard-code list of resources
    # TODO: get real list of resources
    resources = ['local.localhost']
    resource_count = len(resources)

    # Create multiple pilot descriptions, one for each resource
    pilot_descriptions = list()
    for resource in resources:
        pd_init = {
            'resource': resource,
            'runtime': 15,  # pilot runtime (min)
            'exit_on_error': True,
            'project': self.config[resource]['project'],
            'queue': self.config[resource]['queue'],
            'access_schema': self.config[resource]['schema'],
            'cores': self.config[resource]['cores'],
        }
        pilot_descriptions.append(rp.ComputePilotDescription(pd_init))

    # Launch the pilots.
    pilot = self.pmgr.submit_pilots(pilot_descriptions)
    pilot_count = len(pilot)

    self.umgr.add_pilots(pilot)

    # Create a workload of ComputeUnits.
    # Each compute unit runs '/bin/date'.
    cuds = list()
    for _ in range(0, self.n):
        # create a new CU description, and fill it.
        # Here we don't use dict initialization.
        cud = rp.ComputeUnitDescription()
        cud.executable = '/bin/date'
        cuds.append(cud)

    # Submit the previously created ComputeUnit descriptions to the
    # unit manager. This will trigger the selected scheduler to start
    # assigning ComputeUnits to the ComputePilots.
    units = self.umgr.submit_units(cuds)

    # Wait for all compute units to reach a final state (DONE, CANCELED or
    # FAILED).
    self.umgr.wait_units()

    # Verify that 100% of the units came back with 'DONE' status
    done_units = sum(1 for unit in units if unit.state == "DONE")
    done_ratio = float(done_units) / float(self.n)

    # assertEqual: the assertEquals alias is deprecated.
    self.assertEqual(
        done_ratio, 1.0,
        "Only {0}% of CUs were DONE.".format(str(done_ratio * 100.00)))

    # Finally assert that the number of requested vs submitted pilots are
    # the same
    self.assertEqual(resource_count, pilot_count)
def rp_setup_short(request):
    """Fixture-style setup: one short-lived local pilot and a unit manager.

    Creates a session, a pilot manager and a direct-submission unit manager,
    submits a single-core ``local.localhost`` pilot with runtime 1 and
    registers ``pilot_state_cb`` on it.

    Args:
        request: object providing ``addfinalizer``; the registered finalizer
            cancels the pilots, waits for them and closes the session.

    Returns:
        Tuple ``(pilot, pmgr, umgr)``.
    """
    session = rp.Session(database_url=db_url)

    try:
        pmgr = rp.PilotManager(session=session)

        umgr = rp.UnitManager(session=session,
                              scheduler=rp.SCHED_DIRECT_SUBMISSION)

        pdesc = rp.ComputePilotDescription()
        pdesc.resource = "local.localhost"
        pdesc.runtime = 1
        pdesc.cores = 1
        pdesc.sandbox = "/tmp/radical.pilot.sandbox.unittests"
        pdesc.cleanup = True

        pilot = pmgr.submit_pilots(pdesc)
        pilot.register_callback(pilot_state_cb)

        umgr.add_pilots(pilot)

    except Exception:
        print('test failed')
        # The finalizer is not registered yet, so nothing else would close
        # the session if setup fails.
        session.close()
        raise

    def fin():
        pmgr.cancel_pilots()
        pmgr.wait_pilots()

        print('closing session')
        session.close()

    request.addfinalizer(fin)

    return pilot, pmgr, umgr
def test__pilot_cancel(self):
    """Verify that an active pilot can be cancelled.

    The pilot is submitted, awaited until it is active (or has failed),
    cancelled, and then expected to end in the CANCELED state.
    """
    session = rp.Session()

    try:
        manager = rp.PilotManager(session=session)

        description = rp.ComputePilotDescription()
        description.resource = "local.localhost"
        description.cores = 1
        description.runtime = 1
        description.sandbox = "/tmp/rp.sandbox.unittests"
        description.cleanup = True

        pilot = manager.submit_pilots(descriptions=description)

        # directly after submission no start/stop time is expected yet
        assert pilot is not None
        assert pilot.start_time is None
        assert pilot.stop_time is None

        pilot.wait(state=[rp.PMGR_ACTIVE, rp.FAILED], timeout=300)
        assert pilot.submission_time is not None
        assert pilot.state == rp.PMGR_ACTIVE
        assert pilot.start_time is not None

        # cancel the active pilot, then wait on it again
        pilot.cancel()
        pilot.wait(timeout=300)

        assert pilot.state == rp.CANCELED
        assert pilot.stop_time is not None

    finally:
        session.close()
def test__pilotmanager_list_pilots_after_reconnect(self):
    """ Test if listing pilots after a reconnect works as expected.

    Two pilot managers each receive two pilots; the managers are then
    looked up again through the session by uid and must still list two
    pilots each.
    """
    session = rp.Session()

    # try/finally: the session must be closed even when an assertion fails.
    try:
        pm1 = rp.PilotManager(session=session)
        assert len(pm1.list_pilots()) == 0, "Wrong number of pilots returned."

        pm2 = rp.PilotManager(session=session)
        assert len(pm2.list_pilots()) == 0, "Wrong number of pilots returned."

        for _ in range(2):
            cpd = rp.ComputePilotDescription()
            cpd.resource = "local.localhost"
            cpd.cores = 1
            cpd.runtime = 1
            cpd.sandbox = "/tmp/rp.sandbox.unittests"
            cpd.cleanup = True

            pm1.submit_pilots(descriptions=cpd)
            pm2.submit_pilots(descriptions=cpd)

        assert len(pm1.list_pilots()) == 2, "Wrong number of pilots returned."
        assert len(pm2.list_pilots()) == 2, "Wrong number of pilots returned."

        # fetch the managers again via the session
        pm1_r = session.get_pilot_managers(pilot_manager_ids=pm1.uid)
        pm2_r = session.get_pilot_managers(pilot_manager_ids=pm2.uid)

        assert len(pm1_r.list_pilots()) == 2, \
            "Wrong number of pilots returned."
        assert len(pm2_r.list_pilots()) == 2, \
            "Wrong number of pilots returned."

    finally:
        session.close()
def test__issue_114_part_3(self):
    """ https://github.com/radical-cybertools/radical.pilot/issues/114

    Submits a local pilot with runtime 1, waits until it is reported as
    PMGR_ACTIVE and then waits again, expecting it to end up DONE.
    """
    session = rp.Session(database_url=DBURL, database_name=DBNAME)

    # try/finally: the session must be closed even when an assertion fails.
    try:
        pm = rp.PilotManager(session=session)

        cpd = rp.ComputePilotDescription()
        cpd.resource = "local.localhost"
        cpd.cores = 1
        cpd.runtime = 1
        cpd.sandbox = "/tmp/radical.pilot.sandbox.unittests"
        cpd.cleanup = True

        pilot = pm.submit_pilots(pilot_descriptions=cpd)

        um = rp.UnitManager(session=session,
                            scheduler=rp.SCHED_DIRECT_SUBMISSION)
        um.add_pilots(pilot)

        state = pm.wait_pilots(state=[rp.PMGR_ACTIVE, rp.DONE, rp.FAILED],
                               timeout=10 * 60)

        assert state == [rp.PMGR_ACTIVE], 'state      : %s' % state
        assert pilot.state == rp.PMGR_ACTIVE, \
            'pilot state: %s' % pilot.state

        state = pm.wait_pilots(timeout=3 * 60)

        assert state == [rp.DONE], 'state      : %s' % state
        assert pilot.state == rp.DONE, 'pilot state: %s' % pilot.state

    finally:
        session.close()
def test__add_resource_config_2(self):
    """ Test if a pilot can be submitted to a resource configuration that
    was added to the session at runtime.

    A ResourceConfig named "mylocalhost" is registered with the session and
    a pilot description referring to it is submitted, waited on and
    cancelled. The test makes no assertions; it only has to run through.
    """
    session = rp.Session()

    # try/finally: the session must be closed even when a call raises.
    try:
        rc = rp.ResourceConfig("mylocalhost")
        rc.task_launch_method = "LOCAL"
        rc.mpi_launch_method = "MPIRUN"
        rc.job_manager_endpoint = "fork://localhost"
        rc.filesystem_endpoint = "file://localhost/"
        rc.bootstrapper = "default_bootstrapper.sh"

        pm = rp.PilotManager(session=session)
        session.add_resource_config(rc)

        pd = rp.ComputePilotDescription()
        pd.resource = "mylocalhost"
        pd.cores = 1
        pd.runtime = 1
        pd.sandbox = "/tmp/rp.sandbox.unittests"
        pd.cleanup = True

        pilot = pm.submit_pilots(pd)
        pilot.wait(timeout=300)
        pilot.cancel()

    finally:
        session.close()
def test_01_unit_details(self):
    """Test unit details: every detail reported by a unit is accessible.

    For each finished unit, every key of ``unit.as_dict()`` must be one of
    the expected detail keys and its value must not be None.
    """
    # Detail keys to be checked in unit dictionary
    expected_detail_keys = [
        'type',
        'umgr',
        'uid',
        'name',
        'state',
        'exit_code',
        'stdout',
        'stderr',
        'pilot',
        'sandbox',
        'description',
    ]

    # Create description object from template description
    pilot_desc = rp.ComputePilotDescription(self.pd_init)

    # Launch the pilot.
    pilot = self.pmgr.submit_pilots(pilot_desc)

    self.umgr.add_pilots(pilot)

    # Create a workload of ComputeUnits.
    # Each compute unit runs '/bin/date'.
    cuds = list()
    for _ in range(1, self.n + 1):
        # create a new CU description, and fill it.
        # Here we don't use dict initialization.
        cud = rp.ComputeUnitDescription()
        cud.executable = '/bin/date'
        cuds.append(cud)

    # Submit the previously created ComputeUnit descriptions to the
    # unit manager. This will trigger the selected scheduler to start
    # assigning ComputeUnits to the ComputePilots.
    units = self.umgr.submit_units(cuds)

    # Wait for all compute units to reach a final state (DONE, CANCELED or
    # FAILED).
    self.umgr.wait_units()

    # Not asserting for 100% completion, that is not the idea here...
    # Check that all items in the dictionary match the expected keys and
    # that all values are *not None*.
    for unit in units:
        # items() instead of the Python-2-only iteritems()
        for key, val in unit.as_dict().items():
            self.assertIn(key, expected_detail_keys)
            self.assertIsNotNone(
                val, msg="'{0}' unexpectedly None".format(key))
def desc(self):
    """Build the ComputePilotDescription for this object.

    ``exit_on_error``, ``project`` and ``access_schema`` are read from
    ``self.resource``; all other fields come from ``self`` directly.
    """
    res = self.resource

    fields = dict(
        resource=self.rp_resource,
        runtime=self.runtime,
        exit_on_error=res.exit_on_error,
        project=res.project,
        queue=self.queue,
        access_schema=res.access_schema,
        cores=self.cores,
    )

    return rp.ComputePilotDescription(fields)
def test_pass_issue_57():
    """Run ``2 * cores`` '/bin/date' units on a stampede pilot.

    The scenario is repeated for pilots with 16, 32 and 64 cores, each in a
    fresh session that is closed again regardless of the outcome.
    """
    for cores in [16, 32, 64]:
        session = rp.Session(database_url=db_url)

        try:
            c = rp.Context('ssh')
            c.user_id = CONFIG["xsede.stampede"]["user_id"]
            session.add_context(c)

            pmgr = rp.PilotManager(session=session)
            umgr = rp.UnitManager(session=session,
                                  scheduler=rp.SCHED_ROUND_ROBIN)

            pdesc = rp.ComputePilotDescription()
            pdesc.resource = "xsede.stampede"
            pdesc.project = CONFIG["xsede.stampede"]["project"]
            pdesc.cores = cores
            pdesc.runtime = 20
            pdesc.cleanup = False

            pilots = pmgr.submit_pilots(pdesc)

            umgr.add_pilots(pilots)

            unit_descrs = []
            for _ in range(0, cores * 2):
                cu = rp.ComputeUnitDescription()
                cu.cores = 1
                cu.executable = "/bin/date"
                unit_descrs.append(cu)

            units = umgr.submit_units(unit_descrs)

            try:
                umgr.wait_units()

                for unit in units:
                    unit.wait()
            except Exception:
                # Waiting is best effort: errors raised here are ignored on
                # purpose. Only Exception is caught so that
                # KeyboardInterrupt / SystemExit still abort the run.
                pass

            pmgr.cancel_pilots()
            pmgr.wait_pilots()

        except Exception:
            print("TEST FAILED")
            raise

        finally:
            session.close()
def desc(self):
    """Build a ComputePilotDescription from this object's own attributes."""
    # Open question kept from the original author: splitting these values
    # off from the resource is questionable; they arguably belong to the
    # resource definition.
    fields = dict(
        resource=self.resource,
        runtime=self.runtime,
        exit_on_error=self.exit_on_error,
        project=self.project,
        queue=self.queue,
        access_schema=self.access_schema,
        cores=self.cores,
    )

    return rp.ComputePilotDescription(fields)
def test__issue_262(self):
    """ https://github.com/radical-cybertools/radical.pilot/issues/18

    Checks that every log entry of a pilot and of a unit exposes
    "timestamp" and "message" via ``as_dict()`` and formats to a unicode
    string.

    NOTE(review): the test name refers to issue 262 while the link above
    points to issue 18 -- confirm which one is meant.
    """
    session = rp.Session()

    # try/finally: the session must be closed even when an assertion fails.
    try:
        pmgr = rp.PilotManager(session=session)

        # Create a local single-core pilot to run the unit on.
        pd = rp.ComputePilotDescription()
        pd.resource = "local.localhost"
        pd.cores = 1
        pd.runtime = 1

        pilot = pmgr.submit_pilots(pd)

        umgr = rp.UnitManager(
            session=session,
            scheduler=rp.SCHED_DIRECT_SUBMISSION)
        umgr.add_pilots(pilot)

        cud = rp.ComputeUnitDescription()
        cud.executable = "/bin/sleep"
        cud.arguments = ["10"]
        cud.cores = 1
        cud.input_staging = ["/etc/group"]

        unit = umgr.submit_units(cud)
        umgr.wait_units()

        for log_entry in pilot.log:
            ld = log_entry.as_dict()
            assert "timestamp" in ld
            assert "message" in ld

            s = "%s" % log_entry
            assert isinstance(s, unicode)

        for log_entry in unit.log:
            ld = log_entry.as_dict()
            assert "timestamp" in ld
            assert "message" in ld

            s = "%s" % log_entry
            assert isinstance(s, unicode)

    finally:
        session.close()
def start_pilot(cr=None):
    """
    In order to start a pilot on the newly created CR, we need to define a
    resource description for that CR.  To do so, we programmatically create
    a clone of the local.localhost description, and replace the job
    submission URL with an ssh:// URL pointing to the CR.

    Args:
        cr: object with an ``access`` attribute holding the URL of the CR.
            If falsy, a stand-in pointing to
            'ssh://remote.host.net:1234/' is used.

    Returns:
        Whatever ``PilotManager.submit_pilots`` returns for the single
        submitted description.

    Raises:
        KeyError: if the EC2_KEYPAIR environment variable is not set.
    """
    if not cr:
        class _CR(object):
            def __init__(self):
                self.access = 'ssh://remote.host.net:1234/'
        cr = _CR()

    # get the local resource config
    session = rp.Session()
    cfg = session.get_resource_config('local.localhost')

    # create a new config based on the local one, and add it back
    new_cfg = rp.ResourceConfig('ec2.vm', cfg)
    new_cfg.schemas = ['ssh']
    new_cfg['ssh']['job_manager_endpoint'] = cr.access
    new_cfg['ssh']['filesystem_endpoint'] = cr.access

    # the new config needs to make sure we can bootstrap on the VM
    new_cfg['pre_bootstrap_1'] = [
        'sudo apt-get update',
        'sudo apt-get install -y python-virtualenv python-dev dnsutils bc'
    ]
    session.add_resource_config(new_cfg)

    # use the *same* ssh key for ssh access to the VM
    ssh_ctx = rs.Context('SSH')
    ssh_ctx.user_id = 'admin'
    ssh_ctx.user_key = os.environ['EC2_KEYPAIR']
    session.contexts.append(ssh_ctx)

    # submit a pilot to it.
    pd = rp.ComputePilotDescription()
    pd.resource = 'ec2.vm'
    pd.runtime = 10
    pd.cores = 1
    # plain boolean: a trailing comma here would assign the tuple (True,)
    pd.exit_on_error = True

    pmgr = rp.PilotManager(session=session)
    return pmgr.submit_pilots(pd)
def setup_gordon(request):
    """Fixture-style setup: a 16-core pilot on "xsede.gordon".

    Creates a session with an ssh context for the configured user, a pilot
    manager and a direct-submission unit manager, submits the pilot and
    registers ``pilot_state_cb`` on it.

    Args:
        request: object providing ``addfinalizer``; the registered finalizer
            cancels the pilots, waits for them, closes the session and
            sleeps for 5 seconds.

    Returns:
        Tuple ``(session, pilot, pmgr, umgr, "xsede.gordon")``.
    """
    session1 = rp.Session()

    try:
        print("session id gordon: {0}".format(session1.uid))

        c = rp.Context('ssh')
        c.user_id = CONFIG["xsede.gordon"]["user_id"]
        session1.add_context(c)

        pmgr1 = rp.PilotManager(session=session1)

        print("pm id gordon: {0}".format(pmgr1.uid))

        umgr1 = rp.UnitManager(session=session1,
                               scheduler=rp.SCHEDULER_DIRECT_SUBMISSION)

        pdesc1 = rp.ComputePilotDescription()
        pdesc1.resource = "xsede.gordon"
        pdesc1.project = CONFIG["xsede.gordon"]["project"]
        pdesc1.runtime = 30
        pdesc1.cores = 16
        pdesc1.cleanup = False

        pilot1 = pmgr1.submit_pilots(pdesc1)
        pilot1.register_callback(pilot_state_cb)

        umgr1.add_pilots(pilot1)

    except Exception:
        print('test failed')
        # The finalizer is not registered yet, so nothing else would close
        # the session if setup fails.
        session1.close()
        raise

    def fin():
        print("finalizing...")
        pmgr1.cancel_pilots()
        pmgr1.wait_pilots()

        print('closing session')
        session1.close()
        time.sleep(5)

    request.addfinalizer(fin)

    return session1, pilot1, pmgr1, umgr1, "xsede.gordon"
def setup_stampede_two(request):
    """Fixture-style setup: a pilot on "xsede.stampede" with twice the
    configured number of cores.

    Creates a session with an ssh context for the configured user, a pilot
    manager and a direct-submission unit manager, submits the pilot and
    registers ``pilot_state_cb`` on it.

    Args:
        request: object providing ``addfinalizer``; the registered finalizer
            cancels the pilots, waits for them and closes the session.

    Returns:
        Tuple ``(session, pilot, pmgr, umgr, "xsede.stampede")``.
    """
    session3 = rp.Session()

    try:
        print("session id stampede: {0}".format(session3.uid))

        c = rp.Context('ssh')
        c.user_id = CONFIG["xsede.stampede"]["user_id"]
        session3.add_context(c)

        pmgr3 = rp.PilotManager(session=session3)

        print("pm id stampede: {0}".format(pmgr3.uid))

        umgr3 = rp.UnitManager(session=session3,
                               scheduler=rp.SCHED_DIRECT_SUBMISSION)

        pdesc3 = rp.ComputePilotDescription()
        pdesc3.resource = "xsede.stampede"
        pdesc3.project = CONFIG["xsede.stampede"]["project"]
        pdesc3.runtime = 20
        pdesc3.cores = int(CONFIG["xsede.stampede"]["cores"]) * 2
        pdesc3.cleanup = False

        pilot3 = pmgr3.submit_pilots(pdesc3)
        pilot3.register_callback(pilot_state_cb)

        umgr3.add_pilots(pilot3)

    except Exception:
        print('test failed')
        # The finalizer is not registered yet, so nothing else would close
        # the session if setup fails.
        session3.close()
        raise

    def fin():
        print("finalizing...")
        pmgr3.cancel_pilots()
        pmgr3.wait_pilots()

        print('closing session')
        session3.close()

    request.addfinalizer(fin)

    return session3, pilot3, pmgr3, umgr3, "xsede.stampede"
def test__issue_114_part_1(self):
    """ https://github.com/radical-cybertools/radical.pilot/issues/114

    Waits for a local pilot to become PMGR_ACTIVE, submits two
    '/bin/sleep 60' units and checks that SCHEDULING and then
    AGENT_EXECUTING show up among the states returned by ``wait_units``.
    """
    session = rp.Session(database_url=DBURL, database_name=DBNAME)

    # try/finally: the session must be closed even when an assertion fails.
    try:
        pm = rp.PilotManager(session=session)

        cpd = rp.ComputePilotDescription()
        cpd.resource = "local.localhost"
        cpd.cores = 1
        cpd.runtime = 5
        cpd.sandbox = "/tmp/radical.pilot.sandbox.unittests"
        cpd.cleanup = True

        pilot = pm.submit_pilots(pilot_descriptions=cpd)
        pm.wait_pilots(state=[rp.PMGR_ACTIVE, rp.DONE, rp.FAILED],
                       timeout=5 * 60)

        assert (pilot.state == rp.PMGR_ACTIVE), \
            "pilot state: %s" % pilot.state

        um = rp.UnitManager(session=session,
                            scheduler=rp.SCHED_DIRECT_SUBMISSION)
        um.add_pilots(pilot)

        all_tasks = []
        for _ in range(0, 2):
            cudesc = rp.ComputeUnitDescription()
            cudesc.cores = 1
            cudesc.executable = "/bin/sleep"
            cudesc.arguments = ['60']
            all_tasks.append(cudesc)

        um.submit_units(all_tasks)

        states = um.wait_units(state=[rp.SCHEDULING, rp.AGENT_EXECUTING],
                               timeout=2 * 60)
        assert rp.SCHEDULING in states, "states: %s" % states

        states = um.wait_units(state=[rp.AGENT_EXECUTING, rp.DONE],
                               timeout=1 * 60)
        assert rp.AGENT_EXECUTING in states, "states: %s" % states

    finally:
        session.close()
def test_rp_basic_task(rp_config):
    """Run a single '/bin/date' task on a local pilot and check it succeeds.

    Args:
        rp_config: mapping with the radical.pilot module under 'rp' and
            per-resource settings under 'config'.
    """
    rp = rp_config['rp']

    # Note: Session creation will fail with a FileNotFound error unless venv
    #       is explicitly `activate`d.
    # TODO: Figure out what `activate` does that `rp-venv/bin/python`
    #       doesn't do.
    with rp.Session() as session:
        # Based on `radical.pilot/examples/config.json`
        # TODO: Does the Session have a default spec for 'local.localhost'?
        #       Can/should we reference it?
        # See also
        # https://github.com/radical-cybertools/radical.pilot/issues/2181
        resource = 'local.localhost'
        resource_config = {resource: {}}
        if resource in rp_config['config']:
            # rp_config is indexed like a mapping everywhere else, so use
            # item access here too (attribute access fails on a dict).
            resource_config[resource].update(rp_config['config'][resource])
        resource_config[resource].update({
            'project': None,
            'queue': None,
            'schema': None,
            'cores': 1,
            'gpus': 0
        })

        pilot_description = dict(
            resource=resource,
            runtime=30,
            exit_on_error=True,
            project=resource_config[resource]['project'],
            queue=resource_config[resource]['queue'],
            cores=resource_config[resource]['cores'],
            gpus=resource_config[resource]['gpus'])

        task_description = {
            'executable': '/bin/date',
            'cpu_processes': 1,
        }

        pmgr = rp.PilotManager(session=session)
        umgr = rp.UnitManager(session=session)
        pilot = pmgr.submit_pilots(
            rp.ComputePilotDescription(pilot_description))
        task = umgr.submit_units(rp.ComputeUnitDescription(task_description))

        umgr.add_pilots(pilot)
        umgr.wait_units()

        assert task.exit_code == 0

    assert session.closed
def test_02_failing_units(self):
    """Submit a workload in which every second unit cannot run.

    Units with an odd index run '/bin/date'; the others point to the
    non-existing '/bin/data'. At least half of the units must end up DONE.
    """
    # Pilot description comes from the template on the test case.
    pilot = self.pmgr.submit_pilots(rp.ComputePilotDescription(self.pd_init))
    self.umgr.add_pilots(pilot)

    # Build the workload; attribute assignment is used instead of dict
    # initialization.
    cuds = []
    for idx in range(1, self.n + 1):
        cud = rp.ComputeUnitDescription()
        # odd index: valid executable; even index: does not exist
        cud.executable = '/bin/date' if idx % 2 else '/bin/data'
        cuds.append(cud)

    # Hand the descriptions to the unit manager and wait until wait_units()
    # returns.
    units = self.umgr.submit_units(cuds)
    self.umgr.wait_units()

    # Verify that >= 50% of the units came back with 'DONE' status
    # TODO: better checks for failures...
    done_units = sum(1 for unit in units if unit.state == "DONE")
    done_ratio = float(done_units) / float(self.n)

    self.assertGreaterEqual(
        done_ratio, 0.50,
        "Only {0}% of CUs were DONE.".format(str(done_ratio * 100.00)))
def setup_comet(request):
    """Fixture-style setup: a 24-core pilot on "xsede.comet".

    Creates a session with an ssh context for the configured user, a pilot
    manager and a direct-submission unit manager, submits the pilot and
    registers ``pilot_state_cb`` on it.

    Args:
        request: object providing ``addfinalizer``; the registered finalizer
            cancels the pilots, waits for them and closes the session.

    Returns:
        Tuple ``(session, pilot, pmgr, umgr, "xsede.comet")``.
    """
    session2 = rp.Session()

    try:
        print("session id comet: {0}".format(session2.uid))

        c = rp.Context('ssh')
        c.user_id = CONFIG["xsede.comet"]["user_id"]
        session2.add_context(c)

        pmgr2 = rp.PilotManager(session=session2)

        # message names comet (it used to say "gordon", a copy-paste slip)
        print("pm id comet: {0}".format(pmgr2.uid))

        umgr2 = rp.UnitManager(session=session2,
                               scheduler=rp.SCHED_DIRECT_SUBMISSION)

        pdesc2 = rp.ComputePilotDescription()
        pdesc2.resource = "xsede.comet"
        pdesc2.project = CONFIG["xsede.comet"]["project"]
        pdesc2.runtime = 30
        pdesc2.cores = 24
        pdesc2.cleanup = False

        pilot2 = pmgr2.submit_pilots(pdesc2)
        pilot2.register_callback(pilot_state_cb)

        umgr2.add_pilots(pilot2)

    except Exception:
        print('test failed')
        # The finalizer is not registered yet, so nothing else would close
        # the session if setup fails.
        session2.close()
        raise

    def fin():
        print("finalizing...")
        pmgr2.cancel_pilots()
        pmgr2.wait_pilots()

        print('closing session')
        session2.close()

    request.addfinalizer(fin)

    return session2, pilot2, pmgr2, umgr2, "xsede.comet"
def setup_stampede_683(request):
    """Fixture-style setup: a 683-core pilot on "xsede.stampede".

    Creates a session with an ssh context for the configured user, a pilot
    manager and a backfilling unit manager, submits the pilot and registers
    ``pilot_state_cb`` on it.

    Args:
        request: object providing ``addfinalizer``; the registered finalizer
            cancels the pilots, waits for them and closes the session.

    Returns:
        Tuple ``(session, pilot, pmgr, umgr, "xsede.stampede")``.
    """
    session = rp.Session()

    try:
        print("session id stampede: {0}".format(session.uid))

        c = rp.Context('ssh')
        c.user_id = CONFIG["xsede.stampede"]["user_id"]
        session.add_context(c)

        pmgr = rp.PilotManager(session=session)

        umgr = rp.UnitManager(session=session,
                              scheduler=rp.SCHEDULER_BACKFILLING)

        pdesc = rp.ComputePilotDescription()
        pdesc.resource = "xsede.stampede"
        pdesc.project = CONFIG["xsede.stampede"]["project"]
        pdesc.runtime = 40
        pdesc.cores = 683
        pdesc.cleanup = False

        pilot = pmgr.submit_pilots(pdesc)
        pilot.register_callback(pilot_state_cb)

        umgr.add_pilots(pilot)

    except Exception:
        print('test failed')
        # The finalizer is not registered yet, so nothing else would close
        # the session if setup fails.
        session.close()
        raise

    def fin():
        print("finalizing...")
        pmgr.cancel_pilots()
        pmgr.wait_pilots()

        print('closing session')
        session.close()

    request.addfinalizer(fin)

    return session, pilot, pmgr, umgr, "xsede.stampede"
def _pilots_backfill(self, requests):
    '''
    Build pilot descriptions for backfill requests.

    For every request, the policy file 'policies/NAME.json' next to this
    module (NAME being the request's 'policy' entry) is read for
    'max_cores' and 'max_walltime', falling back to MAX_CORES and
    MAX_WALLTIME.  `get_backfill` is then called for the request's
    partition with those limits, and one ComputePilotDescription is
    created per returned [partition, cores, walltime] entry.

    Note: the 'backfill' key is deleted from each request dict in place.

    Returns the list of pilot descriptions; nothing is submitted here.
    '''

    self._rep.info('\nrequesting backfilled pilots\n')

    # the policy directory does not change between requests
    pwd = os.path.dirname(__file__)

    pds = list()
    for request in requests:

        del request['backfill']

        policy_name = request['policy']
        partition = request['partition']

        policy = ru.read_json('%s/policies/%s.json' % (pwd, policy_name))

        max_cores = policy.get('max_cores', MAX_CORES)
        max_walltime = policy.get('max_walltime', MAX_WALLTIME)

        self._rep.info('\nrequesting backfill pilots\n')
        bf = get_backfill(partition, max_cores, max_walltime)

        # the partition reported per entry is not used for the description
        for _partition, cores, walltime in bf:
            pd = {
                'resource': request.get('resource', 'local.localhost'),
                'project': request.get('project'),
                'queue': request.get('queue'),
                'cores': cores,
                'runtime': walltime
            }
            self._rep.ok(
                'backfill  @ %s [%5dcores * %4dmin] @ %10s(%10s)]\n' %
                (pd['resource'], pd['cores'], pd['runtime'], pd['queue'],
                 pd['project']))
            pds.append(rp.ComputePilotDescription(pd))

    return pds
def test__issue_114_part_2(self):
    """ https://github.com/radical-cybertools/radical.pilot/issues/114

    Waits for a local pilot to become active, submits one '/bin/sleep 60'
    unit and checks that it is first reported as EXECUTING and finally as
    DONE.

    NOTE(review): this test uses rp.ACTIVE / rp.EXECUTING whereas the other
    tests for this issue use rp.PMGR_ACTIVE / rp.AGENT_EXECUTING -- confirm
    which constants match the installed radical.pilot version.
    """
    session = rp.Session(database_url=DBURL, database_name=DBNAME)

    # try/finally: the session must be closed even when an assertion fails.
    try:
        pm = rp.PilotManager(session=session)

        cpd = rp.ComputePilotDescription()
        cpd.resource = "local.localhost"
        cpd.cores = 1
        cpd.runtime = 5
        cpd.sandbox = "/tmp/radical.pilot.sandbox.unittests"
        cpd.cleanup = True

        pilot = pm.submit_pilots(pilot_descriptions=cpd)

        um = rp.UnitManager(session=session,
                            scheduler=rp.SCHED_DIRECT_SUBMISSION)
        um.add_pilots(pilot)

        pm.wait_pilots(state=[rp.ACTIVE, rp.DONE, rp.FAILED],
                       timeout=5 * 60)

        assert (pilot.state == rp.ACTIVE), "pilot state: %s" % pilot.state

        cudesc = rp.ComputeUnitDescription()
        cudesc.cores = 1
        cudesc.executable = "/bin/sleep"
        cudesc.arguments = ['60']

        cu = um.submit_units(cudesc)

        state = um.wait_units(state=[rp.EXECUTING], timeout=60)

        assert state == [rp.EXECUTING], 'state   : %s' % state
        assert cu.state == rp.EXECUTING, 'cu state: %s' % cu.state

        state = um.wait_units(timeout=2 * 60)

        assert state == [rp.DONE], 'state   : %s' % state
        assert cu.state == rp.DONE, 'cu state: %s' % cu.state

    finally:
        session.close()
def createWorkload(options, config, withOrte, nodes, nthreads, runtime):
    """Create the pilot description and compute units for a Titan run.

    Args:
        options: not used by this function.
        config: mapping; ``config["ornl.titan_lib"]`` must provide
            'project', 'queue' and 'schema'.
        withOrte: not used by this function.
        nodes: number of nodes; 16 cores are counted per node.
        nthreads: thread count handed to ``CUDef.createTAUGromacsCU``;
            ``nodes * 16 // nthreads`` units are created.
        runtime: pilot runtime (min).

    Returns:
        Tuple ``(pilots, cuList)``: a one-element list holding the
        ComputePilotDescription, and the list of compute units.
    """
    resource = "ornl.titan_lib"

    # floor division keeps the unit count an integer under true division
    cuList = [CUDef.createTAUGromacsCU(nthreads)
              for _ in range(nodes * 16 // nthreads)]
    print(len(cuList))

    pd_init = {
        'resource': resource,
        'runtime': runtime,  # pilot runtime (min)
        'exit_on_error': True,
        'project': config[resource]['project'],
        'queue': config[resource]['queue'],
        'access_schema': config[resource]['schema'],
        # use the `nodes` parameter (`nnodes` is not defined in this
        # function); additional 16 cores are for ORTE
        'cores': 16 * nodes + 16,
    }

    pilots = [rp.ComputePilotDescription(pd_init)]
    return (pilots, cuList)
def setup_local_1(request):
    """Fixture-style setup: a single-core pilot on "local.localhost".

    Creates a session, a pilot manager and a direct-submission unit manager,
    submits the pilot and registers ``pilot_state_cb`` on it.

    Args:
        request: object providing ``addfinalizer``; the registered finalizer
            cancels the pilots, waits for them, closes the session and
            sleeps for 5 seconds.

    Returns:
        Tuple ``(session, pilot, pmgr, umgr, "local.localhost")``.
    """
    session1 = rp.Session()

    try:
        print("session id local_1: {0}".format(session1.uid))

        pmgr1 = rp.PilotManager(session=session1)

        print("pm id local_1: {0}".format(pmgr1.uid))

        umgr1 = rp.UnitManager(session=session1,
                               scheduler=rp.SCHED_DIRECT_SUBMISSION)

        pdesc1 = rp.ComputePilotDescription()
        pdesc1.resource = "local.localhost"
        pdesc1.runtime = 30
        pdesc1.cores = 1
        pdesc1.cleanup = False

        pilot1 = pmgr1.submit_pilots(pdesc1)
        pilot1.register_callback(pilot_state_cb)

        umgr1.add_pilots(pilot1)

    except Exception:
        print('test failed')
        # The finalizer is not registered yet, so nothing else would close
        # the session if setup fails.
        session1.close()
        raise

    def fin():
        print("finalizing...")
        pmgr1.cancel_pilots()
        pmgr1.wait_pilots()

        print('closing session')
        session1.close()
        time.sleep(5)

    request.addfinalizer(fin)

    return session1, pilot1, pmgr1, umgr1, "local.localhost"
def test__pilotmanager_get_pilots(self):
    """Test that get_pilots() returns exactly the pilots of its manager.

    Two pilot managers each receive two pilots; every uid listed by a
    manager must resolve via ``get_pilots`` to a pilot submitted through
    that manager, and ``get_pilots()`` without arguments must return two
    pilots.
    """
    session = rp.Session()

    # try/finally: the session must be closed even when an assertion fails.
    try:
        pm1 = rp.PilotManager(session=session)
        assert len(pm1.list_pilots()) == 0, "Wrong number of pilots returned."

        pm2 = rp.PilotManager(session=session)
        assert len(pm2.list_pilots()) == 0, "Wrong number of pilots returned."

        pm1_pilot_uids = []
        pm2_pilot_uids = []

        for _ in range(2):
            cpd = rp.ComputePilotDescription()
            cpd.resource = "local.localhost"
            cpd.cores = 1
            cpd.runtime = 1
            cpd.sandbox = "/tmp/rp.sandbox.unittests"
            cpd.cleanup = True

            pilot_pm1 = pm1.submit_pilots(descriptions=cpd)
            pm1_pilot_uids.append(pilot_pm1.uid)

            pilot_pm2 = pm2.submit_pilots(descriptions=cpd)
            pm2_pilot_uids.append(pilot_pm2.uid)

        for uid in pm1.list_pilots():
            pilot = pm1.get_pilots(uid)
            assert pilot.uid in pm1_pilot_uids, \
                "Wrong pilot ID %s (not in %s)" % (pilot.uid, pm1_pilot_uids)

        assert len(pm1.get_pilots()) == 2, "Wrong number of pilots."

        for uid in pm2.list_pilots():
            pilot = pm2.get_pilots(uid)
            assert pilot.uid in pm2_pilot_uids, \
                "Wrong pilot ID %s" % pilot.uid

        assert len(pm2.get_pilots()) == 2, "Wrong number of pilots."

    finally:
        session.close()
def test_00_getting_started(self):
    """Test a standard pilot run

    One pilot is started from the template description, ``self.n`` units
    running '/bin/date' are submitted and all of them must end up DONE.
    """
    # Create description object from template description
    pilot_desc = rp.ComputePilotDescription(self.pd_init)

    # Launch the pilot.
    pilot = self.pmgr.submit_pilots(pilot_desc)

    self.umgr.add_pilots(pilot)

    # Create a workload of ComputeUnits.
    # Each compute unit runs '/bin/date'.
    cuds = list()
    for _ in range(0, self.n):
        # create a new CU description, and fill it.
        # Here we don't use dict initialization.
        cud = rp.ComputeUnitDescription()
        cud.executable = '/bin/date'
        cuds.append(cud)

    # Submit the previously created ComputeUnit descriptions to the
    # unit manager. This will trigger the selected scheduler to start
    # assigning ComputeUnits to the ComputePilots.
    units = self.umgr.submit_units(cuds)

    # Wait for all compute units to reach a final state (DONE, CANCELED or
    # FAILED).
    self.umgr.wait_units()

    # Verify that 100% of the units came back with 'DONE' status
    done_units = sum(1 for unit in units if unit.state == "DONE")
    done_ratio = float(done_units) / float(self.n)

    # assertEqual: the assertEquals alias is deprecated.
    self.assertEqual(
        done_ratio, 1.0,
        "Only {0}% of CUs were DONE.".format(str(done_ratio * 100.00)))
def _pilots_queue(self, requests):
    '''
    Build one ComputePilotDescription per dedicated-pilot request.

    Each request must provide 'cores' and 'walltime'; 'resource' defaults
    to 'local.localhost', while 'project' and 'queue' default to None.
    Returns the list of descriptions; nothing is submitted here.
    '''

    self._rep.info('\nrequesting dedicated pilots\n')

    descriptions = []
    for req in requests:

        spec = {
            'resource': req.get('resource', 'local.localhost'),
            'project': req.get('project'),
            'queue': req.get('queue'),
            'cores': req['cores'],
            'runtime': req['walltime']
        }

        # report what is about to be provisioned
        summary = (spec['resource'], spec['cores'], spec['runtime'],
                   spec['queue'], spec['project'])
        self._rep.ok(
            'provision on %s [%5dcores * %4dmin] @ %10s(%10s)]\n' % summary)

        descriptions.append(rp.ComputePilotDescription(spec))

    return descriptions