def generate_trajectorygenerationtask_cud(task_desc, db, shared_path, project):
    """Build the Compute Unit Description for a trajectory generation task.

    The task description is split into its ``pre``, ``_main`` and ``post``
    parts (``pre`` and ``post`` default to empty lists). Environment, input
    staging, pre-exec commands, executable/arguments, output staging and
    post-exec commands are then filled in from those parts, in that order.

    :param task_desc: task description dict; must provide ``_id`` and
        ``_dict`` (with at least a ``_main`` entry)
    :param db: handed through to the staging helpers
    :param shared_path: handed through to the staging/command helpers
    :param project: handed through to the staging/command helpers
    :return: the populated ``rp.ComputeUnitDescription``
    """
    cud = rp.ComputeUnitDescription()
    cud.name = task_desc['_id']

    # Split the task into its three components
    components = task_desc['_dict']
    pre_details = components.get('pre', list())
    main_details = components['_main']
    post_details = components.get('post', list())

    # Environment variables
    cud.environment = get_environment_from_task(task_desc)

    # TODO: finish adding path directive -- the paths are looked up here
    # but not applied to the description yet.
    get_paths_from_task(task_desc)

    # Input staging: "ALL" COPY/LINK directives from the pre section, plus
    # those of the main section *before* the first non-dictionary entry.
    inputs = get_input_staging(pre_details, db, shared_path, project,
                               break_after_non_dict=False)
    inputs.extend(get_input_staging(main_details, db, shared_path, project))
    cud.input_staging = inputs

    # Pre-execution: always create the two working directories first
    setup_commands = ['mkdir -p traj', 'mkdir -p extension']
    setup_commands.extend(get_commands(pre_details, shared_path, project))
    cud.pre_exec = setup_commands

    # Main executable
    executable, arguments = get_executable_arguments(
        main_details, shared_path, project)
    cud.executable = str(executable)
    cud.arguments = arguments

    # Output staging: "ALL" COPY/LINK directives from the post section, plus
    # those of the main section *after* the first non-dictionary entry.
    outputs = get_output_staging(task_desc, post_details, db, shared_path,
                                 project, continue_before_non_dict=False)
    outputs.extend(get_output_staging(task_desc, main_details, db,
                                      shared_path, project))
    cud.output_staging = outputs

    # Post-execution steps
    cud.post_exec = list(get_commands(post_details, shared_path, project))

    describe_compute_setup(cud, task_desc)

    return cud
pdesc.cores = 1 # Launch the pilot. pilot = pmgr.submit_pilots(pdesc) pilot.register_callback(pilot_state_cb) # Combine the ComputePilot, the ComputeUnits and a scheduler via # a UnitManager object. umgr = rp.UnitManager(session=session, scheduler=rp.SCHEDULER_DIRECT_SUBMISSION) # Add the previsouly created ComputePilot to the UnitManager. umgr.add_pilots(pilot) # Create a workload of 8 ComputeUnits (tasks). unit_descr = rp.ComputeUnitDescription() unit_descr.executable = "/bin/sleep" unit_descr.arguments = ['10'] unit_descr.cores = 1 # Submit the previously created ComputeUnit descriptions to the # PilotManager. This will trigger the selected scheduler to start # assigning ComputeUnits to the ComputePilots. units = umgr.submit_units(unit_descr) # Wait for all compute units to finish. for unit in umgr.get_units(): unit.register_callback(unit_state_change_cb) umgr.wait_units()
def test_generate_pythontask_cud(self):
    """Test proper Compute Unit Description generation for PythonTask.

    Looks up the PythonTask fixture by id, generates its CUD with
    ``utils.generate_pythontask_cud`` and compares it field by field with a
    hand-built expected description.
    """
    task_id = "04f01b52-8c69-11e7-9eb2-0000000000fe"

    # An unknown id leaves an empty description, so the generator (not the
    # lookup) is what fails in that case.
    task_desc = next(
        (task for task in self.db.get_task_descriptions()
         if task['_id'] == task_id),
        dict())

    # Get the input.json example
    with open('{}/{}'.format(directory, ptask_in_example)) as json_file:
        input_json = json.load(json_file)

    cud = utils.generate_pythontask_cud(task_desc, self.db, '/home/example',
                                        self.db.project)

    expected = rp.ComputeUnitDescription()
    expected.name = task_id
    expected.environment = {"TEST3": "3", "TEST4": "4"}
    expected.input_staging = [{
        "action": "Link",
        "source": "pilot:///_run_.py",
        "target": "unit:///_run_.py"
    }, {
        "action": "Link",
        "source": "pilot:///alanine.pdb",
        "target": "unit:///input.pdb"
    }]
    expected.pre_exec = [
        'mkdir -p traj',
        'mkdir -p extension',
        # stage input.json
        "echo '{}' > '{}'".format(
            json.dumps(input_json['contents']), input_json['target']),
        "source /home/test/venv/bin/activate"
    ]
    expected.executable = 'python'
    expected.arguments = ['_run_.py']
    expected.output_staging = [{
        "action": "Copy",
        "source": "output.json",
        "target": "/home/example/projects/{}//models/model.0x4f01b528c6911e79eb20000000000feL.json"
                  .format(self.db.project)
    }]
    expected.post_exec = ["deactivate"]

    expected.cpu_process_type = 'POSIX'
    expected.gpu_process_type = 'POSIX'
    expected.cpu_thread_type = 'POSIX'
    expected.gpu_thread_type = 'CUDA'
    expected.cpu_processes = 10
    expected.gpu_processes = 1
    expected.cpu_threads = 1
    expected.gpu_threads = 1

    # compare all parts of the cuds (the GPU fields are set above but are
    # not part of the comparison)
    self.maxDiff = None
    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed in Python 3.12.
    self.assertEqual(cud.name, expected.name)
    self.assertDictEqual(cud.environment, expected.environment)
    self.assertListEqual(cud.input_staging, expected.input_staging)
    self.assertListEqual(cud.pre_exec, expected.pre_exec)
    self.assertEqual(cud.executable, expected.executable)
    self.assertListEqual(cud.arguments, expected.arguments)
    self.assertListEqual(cud.output_staging, expected.output_staging)
    self.assertListEqual(cud.post_exec, expected.post_exec)
    self.assertEqual(cud.cpu_process_type, expected.cpu_process_type)
    self.assertEqual(cud.cpu_thread_type, expected.cpu_thread_type)
    self.assertEqual(cud.cpu_processes, expected.cpu_processes)
    self.assertEqual(cud.cpu_threads, expected.cpu_threads)
def test_local_integration():
    """Run two tagged stages of hostname units on a local pilot.

    The localhost resource config is first extended with local file system
    (LFS) settings and re-read to check they were stored. Stage 1 runs
    ``n`` units; every stage-2 unit is tagged with the uid of the stage-1
    unit of the same index. The test passes when both units of each index
    reported the same hostname.

    The session is closed and the ``*.txt`` files in the current working
    directory are removed even when an assertion fails.
    """
    n = 16  # units per stage

    def describe_unit(stage, index, tag=None):
        """Describe a 1-core unit writing its hostname to a staged-out file."""
        fname = 's%d_t%s_hostname.txt' % (stage, index)

        cud = rp.ComputeUnitDescription()
        if tag is not None:
            cud.tag = tag
        cud.executable = '/bin/hostname'
        cud.arguments = ['>', fname]
        cud.cpu_processes = 1
        cud.cpu_threads = 1
        cud.lfs_per_process = 10  # MB
        cud.output_staging = {
            'source': 'unit:///%s' % fname,
            'target': 'client:///%s' % fname,
            'action': rp.TRANSFER
        }
        return cud

    # Create a new session. No need to try/except this: if session creation
    # fails, there is not much we can do anyways...
    session = rp.Session()

    try:
        # Add a Pilot Manager. Pilot managers manage one or more ComputePilots.
        pmgr = rp.PilotManager(session=session)

        # Update localhost lfs path and size
        cfg = session.get_resource_config('local.localhost')
        new_cfg = rp.ResourceConfig('local.localhost', cfg)
        new_cfg.lfs_path_per_node = '/tmp'
        new_cfg.lfs_size_per_node = 1024  # MB
        session.add_resource_config(new_cfg)
        cfg = session.get_resource_config('local.localhost')

        # Check that the updated config is read by the session
        assert 'lfs_path_per_node' in cfg.keys()
        assert 'lfs_size_per_node' in cfg.keys()
        assert cfg['lfs_path_per_node'] == '/tmp'
        assert cfg['lfs_size_per_node'] == 1024

        # Define a 4-core local pilot that runs for 15 minutes
        pdesc = rp.ComputePilotDescription({
            'resource': 'local.localhost',
            'runtime': 15,  # pilot runtime (min)
            'cores': 4
        })

        # Launch the pilot and register it in a UnitManager object.
        pilot = pmgr.submit_pilots(pdesc)
        umgr = rp.UnitManager(session=session)
        umgr.add_pilots(pilot)

        # Stage 1: n units that each require 1 core and 10MB of LFS
        cus = umgr.submit_units([describe_unit(1, i) for i in range(n)])
        umgr.wait_units()

        # Stage 2: same workload, each unit tagged with its stage-1 partner
        umgr.submit_units(
            [describe_unit(2, i, tag=cus[i].uid) for i in range(n)])
        umgr.wait_units()

        # Partner units must have reported the same hostname
        for i in range(n):
            with open('s1_t%s_hostname.txt' % i, 'r') as first, \
                    open('s2_t%s_hostname.txt' % i, 'r') as second:
                assert first.readline().strip() == second.readline().strip()

    finally:
        session.close()

        # Remove the staged-out result files
        for f in glob('%s/*.txt' % os.getcwd()):
            os.remove(f)
def test_local_tagging():
    """Run two tagged stages of hostname units on a local pilot, with reporting.

    Stage 1 runs ``n`` units; every stage-2 unit is tagged with the uid of
    the stage-1 unit of the same index. The test passes when both units of
    each index reported the same hostname.

    The session is closed and the ``*.txt`` files in the current working
    directory are removed even when an assertion fails.
    """
    n = 5  # number of units to run per stage

    # we use a reporter class for nicer output
    report = ru.Reporter(name='radical.pilot')
    report.title('Getting Started (RP version %s)' % rp.version)

    def describe_units(stage, tag_units=None):
        """Create the unit descriptions of one stage, reporting progress.

        When ``tag_units`` is given, description ``i`` is tagged with
        ``tag_units[i].uid``.
        """
        report.info('create %d unit description(s)\n\t' % n)

        cuds = list()
        for i in range(n):
            fname = 's%d_t%s_hostname.txt' % (stage, i)

            cud = rp.ComputeUnitDescription()
            cud.executable = '/bin/hostname'
            cud.arguments = ['>', fname]
            cud.cpu_processes = 1
            cud.cpu_threads = 1
            if tag_units is not None:
                cud.tag = tag_units[i].uid
            cud.output_staging = {'source': 'unit:///%s' % fname,
                                  'target': 'client:///%s' % fname,
                                  'action': rp.TRANSFER}
            cuds.append(cud)
            report.progress()

        report.ok('>>ok\n')
        return cuds

    # Create a new session. No need to try/except this: if session creation
    # fails, there is not much we can do anyways...
    session = rp.Session()

    try:
        # Add a Pilot Manager. Pilot managers manage one or more ComputePilots.
        pmgr = rp.PilotManager(session=session)

        # Define a 4-core local pilot that runs for 10 minutes
        pdesc = rp.ComputePilotDescription({
            'resource': 'local.localhost',
            'runtime': 10,  # pilot runtime (min)
            'exit_on_error': True,
            'cores': 4
        })

        # Launch the pilot.
        pilot = pmgr.submit_pilots(pdesc)

        report.header('submit units')

        # Register the ComputePilot in a UnitManager object.
        umgr = rp.UnitManager(session=session)
        umgr.add_pilots(pilot)

        # Stage 1: submit and wait for all units to reach a final state
        # (DONE, CANCELED or FAILED).
        cus = umgr.submit_units(describe_units(1))
        report.header('gather results')
        umgr.wait_units()

        # Stage 2: same workload, each unit tagged with its stage-1 partner
        umgr.submit_units(describe_units(2, tag_units=cus))
        report.header('gather results')
        umgr.wait_units()

        # Partner units must have reported the same hostname
        for i in range(n):
            with open('s1_t%s_hostname.txt' % i, 'r') as first, \
                    open('s2_t%s_hostname.txt' % i, 'r') as second:
                assert first.readline().strip() == second.readline().strip()

    finally:
        report.header('finalize')
        session.close(download=True)
        report.header()

        # Remove the staged-out result files
        for f in glob.glob('%s/*.txt' % os.getcwd()):
            os.remove(f)
def test_bw_integration():
    """Run two tagged stages of hostname units on the ``ncsa.bw_aprun`` resource.

    Stage 1 runs ``n`` units; every stage-2 unit is tagged with the uid of
    the stage-1 unit of the same index and has its CPU process type reset
    to ``None``. The test passes when both units of each index reported the
    same hostname.

    The session is closed and the ``*.txt`` files in the current working
    directory are removed even when an assertion fails.
    """
    n = 4  # units per stage

    def describe_unit(stage, index):
        """Describe a 1-process, 16-thread unit writing its hostname to a file."""
        fname = 's%d_t%s_hostname.txt' % (stage, index)

        cud = rp.ComputeUnitDescription()
        cud.executable = '/bin/hostname'
        cud.arguments = ['>', fname]
        cud.cpu_processes = 1
        cud.cpu_threads = 16
        cud.lfs_per_process = 10  # MB
        cud.output_staging = {
            'source': 'unit:///%s' % fname,
            'target': 'client:///%s' % fname,
            'action': rp.TRANSFER
        }
        return cud

    # Create a new session. No need to try/except this: if session creation
    # fails, there is not much we can do anyways...
    session = rp.Session()

    try:
        # Add a Pilot Manager. Pilot managers manage one or more ComputePilots.
        pmgr = rp.PilotManager(session=session)

        # Define a 128-core pilot that runs for 10 minutes
        pdesc = rp.ComputePilotDescription({
            'resource': 'ncsa.bw_aprun',
            'runtime': 10,  # pilot runtime (min)
            'cores': 128,
            'project': 'gk4',
            'queue': 'high'
        })

        # Launch the pilot and register it in a UnitManager object.
        pilot = pmgr.submit_pilots(pdesc)
        umgr = rp.UnitManager(session=session)
        umgr.add_pilots(pilot)

        # Stage 1: n units with 1 process x 16 threads and 10MB of LFS each
        cus = umgr.submit_units([describe_unit(1, i) for i in range(n)])
        umgr.wait_units()

        # Stage 2: tagged with the stage-1 partner; only this stage resets
        # the CPU process type.
        cuds2 = list()
        for i in range(n):
            cud = describe_unit(2, i)
            cud.tag = cus[i].uid
            cud.cpu_process_type = None
            cuds2.append(cud)

        umgr.submit_units(cuds2)
        umgr.wait_units()

        # Check that partner units reported the same hostname
        for i in range(n):
            with open('s1_t%s_hostname.txt' % i, 'r') as first, \
                    open('s2_t%s_hostname.txt' % i, 'r') as second:
                assert first.readline().strip() == second.readline().strip()

    finally:
        session.close()

        # Remove the staged-out result files
        for f in glob('%s/*.txt' % os.getcwd()):
            os.remove(f)
def test_pass_issue_359():
    """Run one 8-core MPI unit on each of several differently sized pilots.

    A separate unit manager and pilot is created for every entry in
    ``core_configs``; one MPI "hello world" unit is submitted to each
    manager and must end in state ``rp.DONE``.

    Pilots are cancelled and the session is closed in all cases. Any
    exception is re-raised after printing a short failure marker.
    """
    session = rp.Session()

    # Bound before the try block so the cleanup in `finally` cannot hit an
    # unbound name (and hide the real error) when setup fails early.
    pmgr = None

    try:
        c = rp.Context('ssh')
        c.user_id = CONFIG["xsede.stampede"]["user_id"]
        session.add_context(c)

        pmgr = rp.PilotManager(session=session)
        pmgr.register_callback(pilot_state_cb)

        core_configs = [8, 16, 17, 32, 33]

        umgr_list = []
        for cores in core_configs:

            umgr = rp.UnitManager(session=session,
                                  scheduler=rp.SCHED_DIRECT_SUBMISSION)
            umgr.register_callback(unit_state_cb)

            pdesc = rp.ComputePilotDescription()
            pdesc.resource = "xsede.stampede"
            pdesc.project = CONFIG["xsede.stampede"]["project"]
            pdesc.runtime = 10
            pdesc.cores = cores

            pilot = pmgr.submit_pilots(pdesc)

            umgr.add_pilots(pilot)
            umgr_list.append(umgr)

        unit_list = []
        for umgr in umgr_list:

            test_task = rp.ComputeUnitDescription()
            test_task.pre_exec = CONFIG["xsede.stampede"]["pre_exec"]
            test_task.input_staging = ["../helloworld_mpi.py"]
            test_task.executable = "python"
            test_task.arguments = ["helloworld_mpi.py"]
            test_task.mpi = True
            test_task.cores = 8

            unit_list.append(umgr.submit_units(test_task))

        for umgr in umgr_list:
            umgr.wait_units()

        for unit in unit_list:
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print("* Task %s - state: %s, exit code: %s, started: %s, finished: %s, stdout: %s"
                  % (unit.uid, unit.state, unit.exit_code, unit.start_time,
                     unit.stop_time, unit.stdout))

            assert (unit.state == rp.DONE)

    except Exception:
        print('test failed')
        raise

    finally:
        if pmgr is not None:
            pmgr.cancel_pilots()
            pmgr.wait_pilots()

        session.close()
import os

import radical.pilot as rp


# ------------------------------------------------------------------------------
#
if __name__ == '__main__':

    # The master and worker scripts are addressed relative to this file.
    here = os.path.abspath(os.path.dirname(__file__))
    master = '%s/task_overlay_master.py' % here
    worker = '%s/task_overlay_worker.py' % here

    session = rp.Session()
    try:
        # Pilot: 128 cores on the 'local.debug' resource, runtime 60
        pd = dict(resource='local.debug',
                  cores=128,
                  runtime=60)

        # Unit: the master script, which receives the worker script path as
        # its only argument
        td = dict(executable=master,
                  arguments=[worker])

        pmgr = rp.PilotManager(session=session)
        umgr = rp.UnitManager(session=session)

        pilot = pmgr.submit_pilots(rp.ComputePilotDescription(pd))
        task = umgr.submit_units(rp.ComputeUnitDescription(td))

        # The pilot is only added after the unit has been submitted.
        umgr.add_pilots(pilot)
        umgr.wait_units()

    finally:
        session.close(download=True)


# ------------------------------------------------------------------------------
def run_test(cfg):
    """Run ``NUMBER_JOBS`` MPI hello-world units on one pilot and check them.

    :param cfg: configuration dict. Keys read here: ``cp_resource``,
        ``cp_schema`` (only applied when truthy), ``cp_project``,
        ``cp_queue``, ``cp_runtime``, ``cp_cores``, ``cu_pre_exec`` (only
        applied when truthy), ``executable``, ``pwd`` and ``cu_cores``.
    :raises: re-raises any exception (including KeyboardInterrupt and
        SystemExit) after printing it; the session is always closed.

    Every unit must end in ``rp.DONE`` and its stdout must contain one
    ``mpi rank i/cu_cores`` line for each rank ``i`` in ``1..cu_cores``.
    """
    # Create a new session. No need to try/except this: if session creation
    # fails, there is not much we can do anyways...
    session = rp.Session()
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print("session id: %s" % session.uid)

    # all other pilot code is now tried/excepted.  If an exception is caught,
    # we can rely on the session object to exist and be valid, and we can
    # thus tear the whole RP stack down via a 'session.close()' call in the
    # 'finally' clause...
    try:
        # Add a Pilot Manager. Pilot managers manage one or more ComputePilots.
        print("Initializing Pilot Manager ...")
        pmgr = rp.PilotManager(session=session)

        # Register our callback with the PilotManager. This callback will get
        # called every time any of the pilots managed by the PilotManager
        # change their state.
        pmgr.register_callback(pilot_state_cb)

        pdesc = rp.ComputePilotDescription()
        pdesc.resource = cfg['cp_resource']
        if cfg['cp_schema']:
            pdesc.access_schema = cfg['cp_schema']
        pdesc.project = cfg['cp_project']
        pdesc.queue = cfg['cp_queue']
        pdesc.runtime = cfg['cp_runtime']
        pdesc.cores = cfg['cp_cores']
        pdesc.cleanup = True

        # submit the pilot.
        print("Submitting Compute Pilot to Pilot Manager ...")
        pilot = pmgr.submit_pilots(pdesc)

        # Combine the ComputePilot, the ComputeUnits and a scheduler via
        # a UnitManager object.
        print("Initializing Unit Manager ...")
        umgr = rp.UnitManager(session=session,
                              scheduler=rp.SCHEDULER_DIRECT_SUBMISSION)

        # Register our callback with the UnitManager. This callback will get
        # called every time any of the units managed by the UnitManager
        # change their state.
        umgr.register_callback(unit_state_cb)

        # Add the created ComputePilot to the UnitManager.
        print("Registering Compute Pilot with Unit Manager ...")
        umgr.add_pilots(pilot)

        NUMBER_JOBS = 10  # the total number of cus to run

        # submit CUs to pilot job
        cudesc_list = []
        for i in range(NUMBER_JOBS):
            cudesc = rp.ComputeUnitDescription()
            if cfg['cu_pre_exec']:
                cudesc.pre_exec = cfg['cu_pre_exec']
            cudesc.executable = cfg['executable']
            cudesc.arguments = ["helloworld_mpi.py"]
            cudesc.input_staging = [
                "%s/../examples/helloworld_mpi.py" % cfg['pwd']
            ]
            cudesc.cores = cfg['cu_cores']
            cudesc.mpi = True

            cudesc_list.append(cudesc)

        # Submit the previously created ComputeUnit descriptions to the
        # PilotManager. This will trigger the selected scheduler to start
        # assigning ComputeUnits to the ComputePilots.
        print("Submit Compute Units to Unit Manager ...")
        cu_set = umgr.submit_units(cudesc_list)

        print("Waiting for CUs to complete ...")
        umgr.wait_units()
        print("All CUs completed successfully!")

        for unit in cu_set:
            print("* Task %s - state: %s, exit code: %s, started: %s, finished: %s, stdout: %s"
                  % (unit.uid, unit.state, unit.exit_code, unit.start_time,
                     unit.stop_time, unit.stdout))

            assert (unit.state == rp.DONE)
            for i in range(cfg['cu_cores']):
                assert ('mpi rank %d/%d' % (i + 1, cfg['cu_cores'])
                        in unit.stdout)

    except Exception as e:
        # Something unexpected happened in the pilot code above
        print("caught Exception: %s" % e)
        raise

    except (KeyboardInterrupt, SystemExit) as e:
        # the callback called sys.exit(), and we can here catch the
        # corresponding KeyboardInterrupt exception for shutdown.  We also
        # catch SystemExit (which gets raised if the main threads exits for
        # some other reason).
        print("need to exit now: %s" % e)
        raise

    finally:
        # always clean up the session, no matter if we caught an exception or
        # not.
        print("closing session")
        print("SESSION ID: %s" % session.uid)
        session.close(cleanup=False)
def test_fail_issue_172(setup_stampede):
    """Check how environment variables reach serial and MPI units.

    For every entry of ``env_variants`` one serial ``/bin/echo`` unit and
    one MPI ``python mpi4py_env.py`` unit are submitted. The literal
    ``'UNDEFINED'`` means the unit's ``environment`` is not set at all. All
    units must end in ``rp.DONE`` and their stdout must contain the text
    expected for the variant.

    :param setup_stampede: fixture yielding
        ``(session, pilot, pmgr, umgr, resource)``
    """
    session, pilot, pmgr, umgr, resource = setup_stampede

    umgr.register_callback(unit_state_cb)

    # generate some units which use env vars in different ways, w/ and w/o MPI
    env_variants = [
        'UNDEFINED',  # Special case: env will not be set
        None,  # None
        {},  # empty dict
        {'foo': 'bar'},  # single entry dict
        {'foo': 'bar', 'sports': 'bar', 'banana': 'bar'}  # multi entry dict
    ]

    compute_units = []
    for idx, env in enumerate(env_variants, 1):

        # Serial
        cud = rp.ComputeUnitDescription()
        cud.name = "serial_" + str(idx)
        cud.executable = "/bin/echo"
        cud.arguments = [
            'Taverns:', '$foo', '$sports', '$banana',
            # Backslashes written out explicitly: same string value as the
            # former '\$' form, without the invalid-escape warning.
            'dollars\\$\\$',
            '"$dollar"', 'sitting \'all\' by myself', 'drinking "cheap" beer'
        ]
        if env != 'UNDEFINED':
            cud.environment = env

        compute_units.append(cud)

        # MPI
        cud = rp.ComputeUnitDescription()
        cud.name = "mpi_" + str(idx)
        cud.pre_exec = CONFIG[resource]["pre_exec"]
        cud.executable = "python"
        cud.input_staging = ["mpi4py_env.py"]
        # NOTE(review): a plain string, not a list as for the serial unit --
        # kept as is; confirm this is intended.
        cud.arguments = 'mpi4py_env.py'
        cud.cores = 2
        cud.mpi = True
        if env != 'UNDEFINED':
            cud.environment = env

        compute_units.append(cud)

    units = umgr.submit_units(compute_units)

    umgr.wait_units()

    if not isinstance(units, list):
        units = [units]

    # Text that must appear in stdout, keyed by unit name
    serial_unset = "Taverns: dollars$$ \"\""
    mpi_unset = "Taverns: None, None, None"
    expected_stdout = {
        "serial_1": serial_unset,
        "serial_2": serial_unset,
        "serial_3": serial_unset,
        "mpi_1": mpi_unset,
        "mpi_2": mpi_unset,
        "mpi_3": mpi_unset,
        "serial_4": "Taverns: bar dollars$$ \"\"",
        "mpi_4": "Taverns: bar, None, None",
        "serial_5": "Taverns: bar bar bar dollars$$ \"\"",
        "mpi_5": "Taverns: bar, bar, bar",
    }

    for unit in units:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(unit.stdout)
        print("\n\n")
        print("* Task %s - env: %s state: %s, exit code: %s, started: %s, finished: %s, stdout: %s"
              % (unit.uid, unit.description.environment, unit.state,
                 unit.exit_code, unit.start_time, unit.stop_time,
                 repr(unit.stdout)))

        assert (unit.state == rp.DONE)

        expected = expected_stdout.get(unit.name)
        if expected is not None:
            assert (expected in unit.stdout)
def to_cu(self):
    """Return a Compute Unit description for this object.

    Currently this is an ``rp.ComputeUnitDescription`` initialised from an
    empty dict. The original note describes the intent as: write the
    python file, stage it, run it and return the model.
    """
    return rp.ComputeUnitDescription({})
def create_cud_from_task(task, placeholders, prof=None):
    """
    Purpose: Translate an EnTK Task into a Compute Unit description.

    The unit name packs the uid and name of the task, its parent stage and
    its parent pipeline into one comma-separated string. A tag is only set
    when the task has one and the parent pipeline has a name. Local file
    system size, stdout and stderr are only copied when set on the task.

    :arguments:
        :task: EnTK Task object
        :placeholders: dictionary holding the values for placeholders
        :prof: optional profiler; when given, creation start and end are
               recorded on it

    :return: ComputeUnitDescription
    :raises: any exception raised during creation, after logging it
    """

    try:
        logger.debug('Creating CU from Task %s' % (task.uid))

        if prof:
            prof.prof('cud_create', uid=task.uid)

        cud = rp.ComputeUnitDescription()

        stage = task.parent_stage
        pipeline = task.parent_pipeline
        name_fields = (task.uid, task.name,
                       stage['uid'], stage['name'],
                       pipeline['uid'], pipeline['name'])
        cud.name = '%s,%s,%s,%s,%s,%s' % name_fields

        cud.pre_exec = task.pre_exec
        cud.executable = task.executable
        cud.arguments = resolve_arguments(task.arguments, placeholders)
        cud.sandbox = task.sandbox
        cud.post_exec = task.post_exec

        if task.tag and task.parent_pipeline['name']:
            cud.tag = resolve_tags(
                tag=task.tag,
                parent_pipeline_name=task.parent_pipeline['name'],
                placeholders=placeholders)

        # CPU requirements
        cpu = task.cpu_reqs
        cud.cpu_processes = cpu['processes']
        cud.cpu_threads = cpu['threads_per_process']
        cud.cpu_process_type = cpu['process_type']
        cud.cpu_thread_type = cpu['thread_type']

        # GPU requirements
        gpu = task.gpu_reqs
        cud.gpu_processes = gpu['processes']
        cud.gpu_threads = gpu['threads_per_process']
        cud.gpu_process_type = gpu['process_type']
        cud.gpu_thread_type = gpu['thread_type']

        # Optional settings: copied only when the task defines them
        for attr in ('lfs_per_process', 'stdout', 'stderr'):
            value = getattr(task, attr)
            if value:
                setattr(cud, attr, value)

        cud.input_staging = get_input_list_from_task(task, placeholders)
        cud.output_staging = get_output_list_from_task(task, placeholders)

        if prof:
            prof.prof('cud from task - done', uid=task.uid)

        logger.debug('CU %s created from Task %s' % (cud.name, task.uid))

        return cud

    except Exception:
        logger.exception('CU creation failed')
        raise
ru.write_json(cfg, 'configs/wf0.%s.cfg' % name) pd = rp.ComputePilotDescription(cfg.pilot_descr) pd.cores = nodes * cpn pd.gpus = nodes * gpn pd.runtime = runtime pilot = pmgr.submit_pilots(pd) pid = pilot.uid umgr.add_pilots(pilot) tds = list() for i in range(n_masters): td = rp.ComputeUnitDescription(cfg.master_descr) td.executable = "python3" td.arguments = ['wf0_master.py', i] td.cpu_threads = cpn td.pilot = pid td.input_staging = [ { 'source': 'wf0_master.py', 'target': 'wf0_master.py', 'action': rp.TRANSFER, 'flags': rp.DEFAULT_FLAGS }, { 'source': 'wf0_worker.py', 'target': 'wf0_worker.py', 'action': rp.TRANSFER,
def run_test(cfg):
    """Run ``UNITS`` ``wc`` units that exercise pilot- and unit-level staging.

    ``/etc/passwd`` is staged to the pilot staging area as ``f1``. Every
    unit stages ``/etc/group`` in as ``f2`` and copies ``f1`` from the
    staging area, runs ``wc f1 f2``, and stages ``f2`` out as ``f2.bak``
    and ``f1`` back to the staging area as ``f1.bak``.

    :param cfg: configuration dict. Keys read here: ``cp_resource``,
        ``cp_cores``, ``cp_project``, ``cp_queue``, ``cp_runtime`` and
        ``cp_schema``.
    :raises: re-raises any exception (including KeyboardInterrupt and
        SystemExit) after printing it; the session is always closed.
    """
    # Create a new session. No need to try/except this: if session creation
    # fails, there is not much we can do anyways...
    session = rp.Session()
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print("session id: %s" % session.uid)

    # all other pilot code is now tried/excepted.  If an exception is caught,
    # we can rely on the session object to exist and be valid, and we can
    # thus tear the whole RP stack down via a 'session.close()' call in the
    # 'finally' clause...
    try:
        pmgr = rp.PilotManager(session=session)
        pmgr.register_callback(pilot_state_cb)

        pdesc = rp.ComputePilotDescription()
        pdesc.resource = cfg['cp_resource']
        pdesc.cores = cfg['cp_cores']
        pdesc.project = cfg['cp_project']
        pdesc.queue = cfg['cp_queue']
        pdesc.runtime = cfg['cp_runtime']
        pdesc.cleanup = False
        pdesc.access_schema = cfg['cp_schema']

        pilot = pmgr.submit_pilots(pdesc)

        # Pilot-level staging: made available once in the staging area
        input_sd_pilot = {
            'source': 'file:///etc/passwd',
            'target': 'staging:///f1',
            'action': rp.TRANSFER
        }
        pilot.stage_in(input_sd_pilot)

        umgr = rp.UnitManager(session=session, scheduler=SCHED)
        umgr.register_callback(unit_state_cb, rp.UNIT_STATE)
        umgr.register_callback(wait_queue_size_cb, rp.WAIT_QUEUE_SIZE)
        umgr.add_pilots(pilot)

        # Unit-level staging directives, shared by all units
        input_sd_umgr = {
            'source': 'file:///etc/group',
            'target': 'f2',
            'action': rp.COPY
        }
        input_sd_agent = {
            'source': 'staging:///f1',
            'target': 'f1',
            'action': rp.COPY
        }
        output_sd_agent = {
            'source': 'f1',
            'target': 'staging:///f1.bak',
            'action': rp.COPY
        }
        output_sd_umgr = {
            'source': 'f2',
            'target': 'f2.bak',
            'action': rp.TRANSFER
        }

        cuds = list()
        for unit_count in range(0, UNITS):
            cud = rp.ComputeUnitDescription()
            cud.executable = "wc"
            cud.arguments = ["f1", "f2"]
            cud.cores = 1
            cud.input_staging = [input_sd_umgr, input_sd_agent]
            cud.output_staging = [output_sd_umgr, output_sd_agent]
            cuds.append(cud)

        units = umgr.submit_units(cuds)

        umgr.wait_units()

        for cu in units:
            print("* Task %s state %s, exit code: %s, started: %s, finished: %s"
                  % (cu.uid, cu.state, cu.exit_code, cu.start_time,
                     cu.stop_time))

        # os.system ("radicalpilot-stats -m stat,plot -s %s > %s.stat" % (session.uid, session_name))

    except Exception as e:
        # Something unexpected happened in the pilot code above
        print("caught Exception: %s" % e)
        raise

    except (KeyboardInterrupt, SystemExit) as e:
        # the callback called sys.exit(), and we can here catch the
        # corresponding KeyboardInterrupt exception for shutdown.  We also
        # catch SystemExit (which gets raised if the main threads exits for
        # some other reason).
        print("need to exit now: %s" % e)
        raise

    finally:
        # always clean up the session, no matter if we caught an exception or
        # not.
        print("closing session")
        print("SESSION ID: %s" % session.uid)
        session.close(cleanup=False)
'go://marksant#netbook/Users/mark/proj/radical.pilot/examples/helloworld_mpi.py', 'target': 'go://nersc#edison/scratch2/scratchdirs/marksant/go/', #'target': 'staging:///go/', 'action': rp.TRANSFER } pilot.stage_in(pilot_globe) unit_globe = { 'source': '/scratch2/scratchdirs/marksant/go/helloworld_mpi.py', #'source': 'staging:///go/helloworld_mpi.py', 'action': rp.LINK, } for unit_count in range(0, 1): mpi_test_task = rp.ComputeUnitDescription() mpi_test_task.pre_exec = ["module load python", "module load mpi4py"] mpi_test_task.input_staging = [unit_globe] mpi_test_task.executable = "python-mpi" mpi_test_task.arguments = ["helloworld_mpi.py"] mpi_test_task.mpi = True mpi_test_task.cores = 24 cud_list.append(mpi_test_task) # Combine the ComputePilot, the ComputeUnits and a scheduler via # a UnitManager object. umgr = rp.UnitManager(session, scheduler=rp.SCHED_DIRECT) # Register our callback with the UnitManager. This callback will get
pdesc.project = 'mc3bggp' pdesc.gpus = 2 pdesc.cores = 24 pdesc.runtime = 60 pdesc.exit_on_error = True pdesc.queue = 'GPU' pmgr = rp.PilotManager(session=session) pilot = pmgr.submit_pilots(pdesc) umgr = rp.UnitManager(session=session) umgr.add_pilots(pilot) cud1 = rp.ComputeUnitDescription() cud1.executable = 'python3' cud1.gpu_processes = 1 cud1.cpu_processes = 1 cud1.pre_exec = [ 'module load psc_path/1.1', 'module load slurm/default', 'module load intel/17.4', 'module load python3', 'module load cuda', 'mkdir -p classified_images/crabeater', 'mkdir -p classified_images/weddel', 'mkdir -p classified_images/pack-ice', 'mkdir -p classified_images/other', 'source /pylon5/mc3bggp/paraskev/pytorchCuda/bin/activate' ] cud1.arguments = [ 'pt_predict.py', '-class_names', 'crabeater', 'weddel', 'pack-ice',
def test_generate_trajectorygenerationtask_generation_cud(self):
    """Test proper Compute Unit Description generation for TrajectoryGenerationTask.

    Looks up the task fixture by id, generates its CUD with
    ``utils.generate_trajectorygenerationtask_cud`` and compares it field
    by field with a hand-built expected description.
    """
    task_id = "04f01b52-8c69-11e7-9eb2-000000000124"

    # An unknown id leaves an empty description, so the generator (not the
    # lookup) is what fails in that case.
    task_desc = next(
        (task for task in self.db.get_task_descriptions()
         if task['_id'] == task_id),
        dict())

    cud = utils.generate_trajectorygenerationtask_cud(
        task_desc, self.db, '/home/test', self.db.project)

    expected = rp.ComputeUnitDescription()
    expected.name = task_id
    expected.environment = {"TEST1": "1", "TEST2": "2"}
    expected.input_staging = [{
        "action": "Link",
        "source": "pilot:///alanine.pdb",
        "target": "unit:///initial.pdb"
    }, {
        "action": "Link",
        "source": "pilot:///system.xml",
        "target": "unit:///system.xml"
    }, {
        "action": "Link",
        "source": "pilot:///integrator.xml",
        "target": "unit:///integrator.xml"
    }, {
        "action": "Link",
        "source": "pilot:///openmmrun.py",
        "target": "unit:///openmmrun.py"
    }]
    expected.pre_exec = [
        'mkdir -p traj',
        'mkdir -p extension',
        'source /home/test/venv/bin/activate',
        'mdconvert -o input.pdb -i 3 -t initial.pdb source/allatoms.dcd'
    ]
    expected.executable = 'python'
    expected.arguments = [
        "openmmrun.py", "-r", "--report-interval", "1", "-p", "CPU",
        "--types",
        "{'protein':{'stride':1,'selection':'protein','name':null,'filename':'protein.dcd'},'master':{'stride':10,'selection':null,'name':null,'filename':'master.dcd'}}",
        "-t", "initial.pdb", "--length", "100", "traj/"
    ]
    expected.output_staging = [
        {
            "action": "Move",
            "source": "traj/restart.npz",
            "target": "/home/test//projects/rp_testing_modeller_1/trajs/00000004//restart.npz"
        },
        {
            "action": "Move",
            "source": "traj/master.dcd",
            "target": "/home/test//projects/rp_testing_modeller_1/trajs/00000004//master.dcd"
        },
        {
            "action": "Move",
            "source": "traj/protein.dcd",
            "target": "/home/test//projects/rp_testing_modeller_1/trajs/00000004//protein.dcd"
        },
    ]
    expected.post_exec = ['deactivate']

    # GPU fields are neither set on the expected description nor compared.
    expected.cpu_process_type = 'POSIX'
    expected.cpu_thread_type = 'POSIX'
    expected.cpu_processes = 1
    expected.cpu_threads = 1

    # compare all parts of the cuds
    self.maxDiff = None
    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed in Python 3.12.
    self.assertEqual(cud.name, expected.name)
    self.assertDictEqual(cud.environment, expected.environment)
    self.assertListEqual(cud.input_staging, expected.input_staging)
    self.assertListEqual(cud.pre_exec, expected.pre_exec)
    self.assertEqual(cud.executable, expected.executable)
    self.assertListEqual(cud.arguments, expected.arguments)
    self.assertListEqual(cud.output_staging, expected.output_staging)
    self.assertListEqual(cud.post_exec, expected.post_exec)
    self.assertEqual(cud.cpu_process_type, expected.cpu_process_type)
    self.assertEqual(cud.cpu_thread_type, expected.cpu_thread_type)
    self.assertEqual(cud.cpu_processes, expected.cpu_processes)
    self.assertEqual(cud.cpu_threads, expected.cpu_threads)
print "Session id: %s Pilot Manager: %s" % (session.uid, str(pmgr.as_dict())) pdesc = rp.ComputePilotDescription() pdesc.resource = "xsede.wrangler_streaming" # NOTE: This is a "label", not a hostname pdesc.runtime = 20 # minutes pdesc.cores = 24 pdesc.cleanup = False pdesc.project = "TG-MCB090174" pdesc.queue = 'debug' pdesc.access_schema = 'gsissh' pilot = pmgr.submit_pilots(pdesc) umgr.add_pilots(pilot) #----------BEGIN USER DEFINED TEST-CU DESCRIPTION-------------------# cudesc = rp.ComputeUnitDescription() cudesc.executable = 'python' cudesc.arguments = ['start_redis.py'] cudesc.input_staging = ['start_redis.py'] cudesc.cores = 1 #-----------END USER DEFINED TEST-CU DESCRIPTION--------------------# print 'Starting up Kafka cluster..' cu_set = umgr.submit_units([cudesc]) umgr.wait_units() print 'Kafka cluster is running' pilot_info = pilot.as_dict() pilot_info = pilot_info['resource_details']['lm_detail'] broker = pilot_info['brokers'][0] + ':9092' print 'broker %s ' % broker
def test_generate_trajectorygenerationtask_extension_cud(self):
    """Test proper Compute Unit Description generation for TrajectoryExtensionTask.

    Looks up the trajectory-extension task fixture by its id, generates a
    Compute Unit Description from it with
    ``utils.generate_trajectorygenerationtask_cud`` and compares the result,
    field by field, against a hand-written expected description.
    """
    task_id = "24888d76-219e-11e8-8f6d-000000000118"

    # Find the TrajectoryExtensionTask fixture among the stored descriptions.
    task_descriptions = self.db.get_task_descriptions()
    task_desc = next(
        (task for task in task_descriptions if task['_id'] == task_id),
        None)
    # Fail with a clear message instead of handing an empty description to
    # the generator, which would only surface as a KeyError on '_id'.
    self.assertIsNotNone(
        task_desc, "task %s not found in task descriptions" % task_id)

    cud = utils.generate_trajectorygenerationtask_cud(
        task_desc, self.db, self.shared_path, self.project)

    # Build the description we expect the generator to produce.
    expected_cud = rp.ComputeUnitDescription()
    expected_cud.name = task_id
    expected_cud.environment = {
        "OPENMM_CPU_THREADS": "1",
        "TEST1": "1",
        "TEST2": "2",
        "TEST3": "hello"
    }
    expected_cud.input_staging = [{
        "action": "Link",
        "source": "pilot:///ntl9.pdb",
        "target": "unit:///initial.pdb"
    }, {
        "action": "Link",
        "source": "pilot:///system-2.xml",
        "target": "unit:///system-2.xml"
    }, {
        "action": "Link",
        "source": "pilot:///integrator-2.xml",
        "target": "unit:///integrator-2.xml"
    }, {
        "action": "Link",
        "source": "pilot:///openmmrun.py",
        "target": "unit:///openmmrun.py"
    }, {
        "action": "Link",
        "source": "/home/test//projects/test_analysis/trajs/00000000/",
        "target": "unit:///source"
    }]
    expected_cud.pre_exec = [
        'mkdir -p traj',
        'mkdir -p extension',
        "module load python",
        "source /lustre/atlas/proj-shared/bip149/jrossyra/admdrp/admdrpenv/bin/activate"
    ]
    expected_cud.executable = 'python'
    expected_cud.arguments = [
        "openmmrun.py", "-r", "-p", "CPU", "--types",
        "{'protein':{'stride':2,'selection':'protein','name':null,'filename':'protein.dcd'},'master':{'stride':10,'selection':null,'name':null,'filename':'allatoms.dcd'}}",
        "-s", "system-2.xml", "-i", "integrator-2.xml", "--restart",
        "/home/test//projects/test_analysis/trajs/00000000/restart.npz",
        "-t", "initial.pdb", "--length", "200", "extension/"
    ]
    expected_cud.output_staging = [
        {
            "action": "Move",
            "source": "extension/protein.temp.dcd",
            "target": "extension/protein.dcd"
        },
        {
            "action": "Move",
            "source": "extension/master.temp.dcd",
            "target": "extension/allatoms.dcd"
        },
        {
            "action": "Move",
            "source": "extension/restart.npz",
            "target": "/home/test//projects/test_analysis/trajs/00000000//restart.npz"
        },
        {
            "action": "Move",
            "source": "extension/allatoms.dcd",
            "target": "/home/test//projects/test_analysis/trajs/00000000//allatoms.dcd"
        },
        {
            "action": "Move",
            "source": "extension/protein.dcd",
            "target": "/home/test//projects/test_analysis/trajs/00000000//protein.dcd"
        },
    ]
    expected_cud.post_exec = [
        "mdconvert -o extension/protein.temp.dcd source/protein.dcd extension/protein.dcd",
        "mdconvert -o extension/master.temp.dcd source/allatoms.dcd extension/allatoms.dcd",
        "deactivate"
    ]
    expected_cud.cpu_process_type = 'POSIX'
    expected_cud.gpu_process_type = 'POSIX'
    expected_cud.cpu_thread_type = 'POSIX'
    expected_cud.gpu_thread_type = 'CUDA'
    expected_cud.cpu_processes = 1
    expected_cud.gpu_processes = 1
    expected_cud.cpu_threads = 1
    expected_cud.gpu_threads = 1

    # compare all parts of the cuds
    # NOTE: the gpu_* fields set above are not part of the comparison; only
    # the cpu_* process/thread settings are checked.
    self.maxDiff = None  # show full diffs for the long staging/argument lists
    self.assertEqual(cud.name, expected_cud.name)
    self.assertDictEqual(cud.environment, expected_cud.environment)
    self.assertListEqual(cud.input_staging, expected_cud.input_staging)
    self.assertListEqual(cud.pre_exec, expected_cud.pre_exec)
    self.assertEqual(cud.executable, expected_cud.executable)
    self.assertListEqual(cud.arguments, expected_cud.arguments)
    self.assertListEqual(cud.output_staging, expected_cud.output_staging)
    self.assertListEqual(cud.post_exec, expected_cud.post_exec)
    self.assertEqual(cud.cpu_process_type, expected_cud.cpu_process_type)
    self.assertEqual(cud.cpu_thread_type, expected_cud.cpu_thread_type)
    self.assertEqual(cud.cpu_processes, expected_cud.cpu_processes)
    self.assertEqual(cud.cpu_threads, expected_cud.cpu_threads)
# Submit the pilot descriptions and register the resulting pilots with the
# unit manager.
pilots = pmgr.submit_pilots(pdescs)
umgr.add_pilots(pilots)

report.header('submit units')

# Create a workload of ComputeUnits.
# Each compute unit runs '/bin/date', except those whose index is a multiple
# of 10, which are deliberately given a bogus executable (see below).
n = 128   # number of units to run
report.info('create %d unit description(s)\n\t' % n)

cuds = list()
for i in range(0, n):

    # create a new CU description, and fill it.
    # Here we don't use dict initialization.
    cud = rp.ComputeUnitDescription()
    if i % 10:
        # i is not a multiple of 10: the regular case
        cud.executable = '/bin/date'
    else:
        # trigger an error now and then (i = 0, 10, ..., 120: 13 of 128 units)
        cud.executable = '/bin/data'  # does not exist
    cuds.append(cud)
    report.progress()  # one progress tick per description created
report.ok('>>ok\n')

# Submit the previously created ComputeUnit descriptions through `umgr`.
# Per the original note, this will trigger the selected scheduler to start
# assigning ComputeUnits to the ComputePilots.
units = umgr.submit_units(cuds)
def to_cud(self): cud = rp.ComputeUnitDescription() cud.executable = self.bash_exec cud.arguments = self.bash_args + list(self) cud.input_staging = self.input_staging cud.output_staging = self.output_staging