def run(self):
    """Submit three test jobs to the ORTE DVM and wait until they are done.

    The submit library is initialised against ``DVM_URI``; afterwards three
    single-process jobs running ``false`` are submitted.  Every returned task
    index is mapped to this instance in ``task_instance_map`` and the
    ``mywait`` / ``myspawn`` counters are incremented.  Finally the method
    polls until neither counter is positive any more.

    NOTE(review): the counters are never decremented in this method --
    presumably the launch/finish callbacks do that; confirm against the
    callback definitions.

    Raises:
        RuntimeError: if ``orte_submit_job`` returns a non-zero code.
    """
    # The list keeps the individual C strings referenced while argv is used.
    init_args = [
        ffi.new("char[]", "submit"),  # Will be stripped off by the library
        ffi.new("char[]", "--hnp"),
        ffi.new("char[]", DVM_URI),
        ffi.NULL,  # Required
    ]
    argv = ffi.new("char *[]", init_args)
    lib.orte_submit_init(3, argv, ffi.NULL)

    # Receives the task index assigned by orte_submit_job.
    index = ffi.new("int *")

    for _ in range(3):
        job_args = [
            ffi.new("char[]", "RADICAL-Pilot"),
            ffi.new("char[]", "--np"),
            ffi.new("char[]", "1"),
            ffi.new("char[]", "false"),
            ffi.NULL,  # Required
        ]
        argv = ffi.new("char *[]", job_args)

        rc = lib.orte_submit_job(argv, index,
                                 lib.launch_cb, ffi.NULL,
                                 lib.finish_cb, ffi.NULL)
        if rc:
            # A failed submission must not be counted: the wait loop below
            # would otherwise never see the counters return to zero.
            raise RuntimeError("submit job failed with error: %d" % rc)

        task = index[0]
        task_instance_map[task] = self

        self.mywait += 1
        self.myspawn += 1

        print("Task %d submitted!" % task)

    while self.myspawn > 0 or self.mywait > 0:
        time.sleep(0.1)

    print("Done!")
def run(self):
    """Submit three ``sleep`` jobs, cancel one on request, then halt.

    The submit library is initialised against ``DVM_URI`` and three
    single-process ``bash -c "sleep 10"`` jobs are submitted.  Every returned
    task index is mapped to this instance in ``task_instance_map`` and the
    ``mywait`` / ``myspawn`` counters are incremented.  While either counter
    is positive the method polls; when the module-level flag
    ``fourislaunched`` is set it cancels ORTE task index 1 (hard-coded) and
    clears the flag.  Once all tasks are accounted for,
    ``orte_submit_halt`` is called.

    NOTE(review): neither the counters nor ``fourislaunched`` are updated
    towards completion in this method -- presumably the launch/finish
    callbacks do that; confirm against the callback definitions.

    Raises:
        RuntimeError: if ``orte_submit_job`` returns a non-zero code.
    """
    global fourislaunched

    # The list keeps the individual C strings referenced while argv is used.
    init_args = [
        ffi.new("char[]", "RADICAL-Pilot"),  # Will be stripped off by the library
        ffi.new("char[]", "--hnp"),
        ffi.new("char[]", DVM_URI),
        ffi.NULL,  # Required
    ]
    argv = ffi.new("char *[]", init_args)
    lib.orte_submit_init(3, argv, ffi.NULL)

    # Receives the task index assigned by orte_submit_job.
    index = ffi.new("int[1]")

    for _ in range(3):
        job_args = [
            ffi.new("char[]", "RADICAL-Pilot"),
            ffi.new("char[]", "--np"),
            ffi.new("char[]", "1"),
            ffi.new("char[]", "bash"),
            ffi.new("char[]", "-c"),
            # Alternative payload kept for reference:
            # ffi.new("char[]", "t=%d; echo $t; touch TOUCHME; sleep $t; exit 0" % 10),
            ffi.new("char[]", "sleep %d" % 10),
            ffi.NULL,  # Required
        ]
        argv = ffi.new("char *[]", job_args)

        rc = lib.orte_submit_job(argv, index,
                                 lib.launch_cb, ffi.NULL,
                                 lib.finish_cb, ffi.NULL)
        if rc:
            # A failed submission must not be counted: the wait loop below
            # would otherwise never see the counters return to zero.
            raise RuntimeError("submit job failed with error: %d" % rc)

        task = index[0]
        task_instance_map[task] = self

        self.mywait += 1
        self.myspawn += 1

        print("Task %d submitted!" % task)

    while self.myspawn > 0 or self.mywait > 0:
        if fourislaunched:
            print("Cancelling task ...")
            lib.orte_submit_cancel(1)
            # Reset the flag so the cancel request is issued only once.
            fourislaunched = False
        time.sleep(0.1)

    print("Done!")
    lib.orte_submit_halt()
def spawn(self, launcher, cu):
    """Build the ORTE argument vector for a compute unit and submit it.

    Args:
        launcher: object whose ``construct_command(cu, None)`` returns the
            pair ``(orte_command, task_command)``.
        cu: compute unit dict.  Read here: ``cu['uid']`` and
            ``cu['description']['environment']``.  Written here:
            ``cu['stdout_file']`` and ``cu['stderr_file']``.

    Side effects:
        Creates the unit sandbox ``<self._pwd>/<uid>``, records profiling
        events, and stores ``cu`` in ``self.task_map`` under the task index
        returned by ORTE (while holding ``self.task_map_lock``).

    Raises:
        RuntimeError: if the launcher fails to construct the command.
        Exception: if ``orte_submit_job`` returns a non-zero code.
    """
    # NOTE: see documentation of cu['sandbox'] semantics in the ComputeUnit
    # class definition.
    sandbox = '%s/%s' % (self._pwd, cu['uid'])

    # The unit runs directly in its sandbox.  (A disabled alternative used
    # '%s/%s' % (self.tmpdir, cu['uid']) as a separate temporary directory.)
    cu_tmpdir = sandbox

    rec_makedir(cu_tmpdir)

    # TODO: pre_exec
    # # Before the Big Bang there was nothing
    # if cu['description']['pre_exec']:
    #     fail = ' (echo "pre_exec failed"; false) || exit'
    #     pre = ''
    #     for elem in cu['description']['pre_exec']:
    #         pre += "%s || %s\n" % (elem, fail)
    #     # Note: extra spaces below are for visual alignment
    #     launch_script.write("# Pre-exec commands\n")
    #     if 'RADICAL_PILOT_PROFILE' in os.environ:
    #         launch_script.write("echo cu_pre_start `%s` >> %s/%s.prof\n"\
    #                           % (cu['gtod'], cu_tmpdir, cu['uid']))
    #     launch_script.write(pre)
    #     if 'RADICAL_PILOT_PROFILE' in os.environ:
    #         launch_script.write("echo cu_pre_stop `%s` >> %s/%s.prof\n" \
    #                           % (cu['gtod'], cu_tmpdir, cu['uid']))

    # TODO: post_exec
    # # After the universe dies the infrared death, there will be nothing
    # if cu['description']['post_exec']:
    #     fail = ' (echo "post_exec failed"; false) || exit'
    #     post = ''
    #     for elem in cu['description']['post_exec']:
    #         post += "%s || %s\n" % (elem, fail)
    #     launch_script.write("# Post-exec commands\n")
    #     if 'RADICAL_PILOT_PROFILE' in os.environ:
    #         launch_script.write("echo cu_post_start `%s` >> %s/%s.prof\n" \
    #                           % (cu['gtod'], cu_tmpdir, cu['uid']))
    #     launch_script.write('%s\n' % post)
    #     if 'RADICAL_PILOT_PROFILE' in os.environ:
    #         launch_script.write("echo cu_post_stop `%s` >> %s/%s.prof\n" \
    #                           % (cu['gtod'], cu_tmpdir, cu['uid']))

    # The actual command line, constructed per launch-method
    try:
        orte_command, task_command = launcher.construct_command(cu, None)
    except Exception as e:
        msg = "Error in spawner (%s)" % e
        self._log.exception(msg)
        raise RuntimeError(msg)

    # Construct arguments to submit_job.  The list also keeps every C string
    # referenced until the submit call below has been made.
    arg_list = []

    def add_args(*values):
        # Every value gets the same str() conversion before it is handed
        # to cffi.
        for value in values:
            arg_list.append(ffi.new("char[]", str(value)))

    # Take the orte specific commands and split them
    add_args(*orte_command.split())

    # Set the working directory
    add_args("--wdir", cu_tmpdir)

    # Set RP environment variables
    rp_envs = [
        "RP_SESSION_ID=%s" % self._cfg['session_id'],
        "RP_PILOT_ID=%s" % self._cfg['pilot_id'],
        "RP_AGENT_ID=%s" % self._cfg['agent_name'],
        "RP_SPAWNER_ID=%s" % self.uid,
        "RP_UNIT_ID=%s" % cu['uid'],
        "RP_PILOT_STAGING=%s/staging_area" % self._pwd,
    ]
    for env in rp_envs:
        add_args("-x", env)

    # Set pre-populated environment variables
    if self._cu_environment:
        for key, val in self._cu_environment.items():
            add_args("-x", "%s=%s" % (key, val))

    # Set environment variables specified for this CU
    if cu['description']['environment']:
        for key, val in cu['description']['environment'].items():
            add_args("-x", "%s=%s" % (key, val))

    # Let the orted write stdout and stderr to rank-based output files
    add_args("--output-filename", "%s:nojobid,nocopy" % str(cu_tmpdir))

    # Save retval of actual CU application (in case we have post-exec)
    task_command += "; RETVAL=$?"

    # Wrap in (sub)shell for output redirection
    add_args("sh", "-c")

    if 'RADICAL_PILOT_PROFILE' in os.environ:

        def prof_echo(event):
            # Shell snippet appending a timestamped event to the unit's
            # profile file.
            return "echo script %s `%s` >> %s/%s.prof" \
                   % (event, self.gtod, cu_tmpdir, cu['uid'])

        task_command = "; ".join([prof_echo('cu_start'),
                                  prof_echo('cu_cd_done'),
                                  prof_echo('cu_exec_start'),
                                  task_command,
                                  prof_echo('cu_exec_stop')])

    # The shell exits with the status captured in RETVAL above.
    add_args("%s; exit $RETVAL" % str(task_command))

    self._log.debug("Launching unit %s via %s %s",
                    cu['uid'], orte_command, task_command)

    # NULL termination, required by ORTE
    arg_list.append(ffi.NULL)
    argv = ffi.new("char *[]", arg_list)

    # stdout/stderr filenames can't be set with orte
    # TODO: assert here or earlier?
    # assert cu['description'].get('stdout') == None
    # assert cu['description'].get('stderr') == None

    # prepare stdout/stderr
    # TODO: when mpi==True && cores>1 there will be multiple files that need
    #       to be concatenated.
    cu['stdout_file'] = os.path.join(cu_tmpdir, 'rank.0/stdout')
    cu['stderr_file'] = os.path.join(cu_tmpdir, 'rank.0/stderr')

    # Submit to the DVM!
    index = ffi.new("int *")
    with self.task_map_lock:
        self._prof.prof('exec_start', uid=cu['uid'])
        rc = orte_lib.orte_submit_job(argv, index,
                                      orte_lib.launch_cb, self._myhandle,
                                      orte_lib.finish_cb, self._myhandle)
        if rc:
            raise Exception("submit job failed with error: %d" % rc)
        self.task_map[index[0]] = cu  # map ORTE index to CU
        self._prof.prof('exec_ok', uid=cu['uid'])

    # The uid is a string, so it is logged with %s (it used to be %d, which
    # cannot format a string).
    self._log.debug("Task %s submitted!", cu['uid'])
def run(self):
    """Submit ``TASKS`` sleep jobs to the ORTE DVM and collect their profiles.

    At most ``CORES`` tasks are kept active at a time.  For each task the
    profiler events of the staging-input, scheduling and executing
    components are recorded, a task directory ``unit.NNNNNN`` is created in
    the current working directory and a ``sh -c`` job is submitted which
    sleeps for ``SLEEP`` seconds and appends two timestamps to
    ``<task dir>/PROF``.  After all tasks have been submitted and
    ``self.active`` is no longer positive, the ``PROF`` files are read back
    and fed to the session profiler.

    NOTE(review): ``self.active`` is only incremented here -- presumably the
    finish callback decrements it; confirm against the callback definition.

    Raises:
        RuntimeError: if ``orte_submit_job`` returns a non-zero code.
        OSError: if a task directory cannot be created (``os.mkdir``).
    """
    asic = 'AgentStagingInputComponent'
    asc = 'AgentSchedulingComponent'
    aec = 'AgentExecutingComponent'
    asoc = 'AgentStagingOutputComponent'

    def prof(component, uid, *args, **kwargs):
        # Record one profiler event for the given component and unit.
        self.session.prof.prof(*args, uid=uid, name=component, **kwargs)

    # The list keeps the individual C strings referenced while argv is used.
    init_args = [
        ffi.new("char[]", "RADICAL-Pilot"),  # Will be stripped off by the library
        ffi.new("char[]", "--hnp"),
        ffi.new("char[]", DVM_URI),
        ffi.NULL,  # Required
    ]
    argv = ffi.new("char *[]", init_args)
    lib.orte_submit_init(3, argv, ffi.NULL)

    # Used for storing the task id that is returned by orte_submit_job
    index_ptr = ffi.new("int[1]")

    # Tasks are numbered 1..TASKS (inclusive).
    task_no = 1
    while task_no <= TASKS or self.active > 0:

        if not (task_no <= TASKS and self.active < CORES):
            # Nothing to submit right now: wait for a free core or for the
            # remaining tasks to finish.
            time.sleep(0.001)
            continue

        task_id = 'unit.%.6d' % task_no
        cu_tmpdir = '%s' % task_id

        #
        # ASIC
        #
        prof(asic, task_id, event='get', state=AGENT_STAGING_INPUT_PENDING)
        prof(asic, task_id, event='work start', state=AGENT_STAGING_INPUT_PENDING)
        prof(asic, task_id, 'advance', state=AGENT_STAGING_INPUT)
        os.mkdir('%s' % cu_tmpdir)
        prof(asic, task_id, 'advance', state=ALLOCATING_PENDING)
        prof(asic, task_id, event='work done', state=AGENT_STAGING_INPUT_PENDING)
        prof(asic, task_id, event='put', state=ALLOCATING_PENDING)

        #
        # ASC
        #
        prof(asc, task_id, event='get', state=ALLOCATING_PENDING)
        prof(asc, task_id, event='work start', state=ALLOCATING_PENDING)
        prof(asc, task_id, 'advance', state=ALLOCATING)
        prof(asc, task_id, 'schedule', msg='try')
        prof(asc, task_id, 'schedule', msg='allocated')
        prof(asc, task_id, 'advance', state=EXECUTING_PENDING)
        prof(asc, task_id, event='put', state=EXECUTING_PENDING)
        prof(asc, task_id, event='work done', state=ALLOCATING_PENDING)

        #
        # AEC
        #
        prof(aec, task_id, event='get', state=EXECUTING_PENDING)
        prof(aec, task_id, event='work start', state=EXECUTING_PENDING)
        prof(aec, task_id, 'exec', msg='unit launch')
        prof(aec, task_id, 'spawn', msg='unit spawn')

        job_args = [
            ffi.new("char[]", "RADICAL-Pilot"),
            ffi.new("char[]", "--np"),
            ffi.new("char[]", "1"),
        ]

        # Let the orted write stdout and stderr to rank-based output files
        job_args.append(ffi.new("char[]", "--output-filename"))
        job_args.append(ffi.new("char[]", "%s:nojobid,nocopy" % str(cu_tmpdir)))

        job_args.append(ffi.new("char[]", "sh"))
        job_args.append(ffi.new("char[]", "-c"))

        # Capture the exit status of the payload right away: the final
        # "exit $RETVAL" below relies on RETVAL being set, and the profiling
        # echo that follows would otherwise overwrite "$?".
        task_command = 'sleep %d' % SLEEP
        task_command += "; RETVAL=$?"

        # Wrap in (sub)shell for output redirection
        task_command = "echo script start_script `%s` >> %s/PROF; " % (GTOD, cu_tmpdir) + \
                       task_command + \
                       "; echo script after_exec `%s` >> %s/PROF" % (GTOD, cu_tmpdir)
        job_args.append(ffi.new("char[]", str("%s; exit $RETVAL" % str(task_command))))

        job_args.append(ffi.NULL)  # NULL Termination Required
        argv = ffi.new("char *[]", job_args)

        prof(aec, task_id, 'command', msg='launch command constructed')

        cb_payload = {'instance': self, 'task': task_id}
        cbdata = ffi.new_handle(cb_payload)

        rc = lib.orte_submit_job(argv, index_ptr,
                                 lib.launch_cb, cbdata,
                                 lib.finish_cb, cbdata)
        if rc:
            # A failed submission must not be counted as active: the loop
            # condition above would otherwise never become false.
            raise RuntimeError("submit job failed with error: %d" % rc)

        index = index_ptr[0]  # pointer notation
        # Storing the handle keeps it referenced after this iteration.
        self.task_instance_map[index] = cbdata

        prof(aec, task_id, 'spawn', msg='spawning passed to orte')
        prof(aec, task_id, event='work done', state=EXECUTING_PENDING)

        print("Task %s submitted!" % task_id)

        self.active += 1
        task_no += 1

    print("Execution done.")
    print("")
    print("Collecting profiles ...")

    # Walk over the same 1..TASKS numbering that was used for submission.
    for task_no in range(1, TASKS + 1):
        task_id = 'unit.%.6d' % task_no
        prof(asoc, task_id, 'advance', state=AGENT_STAGING_OUTPUT)

        prof_path = "%s/PROF" % ('%s' % task_id)
        if not os.path.isfile(prof_path):
            continue

        # Reading the profile is best-effort: a broken file is reported and
        # the remaining units are still processed.
        try:
            with open(prof_path, 'r') as prof_f:
                txt = prof_f.read()
            for line in txt.split("\n"):
                if line:
                    # Each line is "<event> <msg> <timestamp>".
                    x1, x2, x3 = line.split()
                    self.session.prof.prof(x1, msg=x2, timestamp=float(x3),
                                           uid=task_id, name=asoc)
        except Exception as e:
            print("Pre/Post profiling file read failed: `%s`" % e)
def spawn(self, launcher, cu):
    """Build the ORTE argument vector for a compute unit and submit it.

    Args:
        launcher: object whose ``construct_command(cu, None)`` returns the
            pair ``(orte_command, task_command)``.
        cu: compute unit dict.  Read here: ``cu['uid']``,
            ``cu['unit_sandbox_path']``, ``cu['description']['environment']``
            and ``cu['description'].get('name')``.  Written here:
            ``cu['stdout_file']`` and ``cu['stderr_file']``.

    Side effects:
        Creates the unit sandbox directory, records profiling events, and
        stores ``cu`` in ``self.task_map`` under the task index returned by
        ORTE (while holding ``self.task_map_lock``).

    Raises:
        RuntimeError: if the launcher fails to construct the command.
        Exception: if ``orte_submit_job`` returns a non-zero code.
    """
    # The unit runs directly in its sandbox.  (A disabled alternative used
    # '%s/%s' % (self.tmpdir, cu['uid']) as a separate temporary directory.)
    sandbox = cu['unit_sandbox_path']

    rec_makedir(sandbox)

    # TODO: pre_exec
    # # Before the Big Bang there was nothing
    # if cu['description']['pre_exec']:
    #     fail = ' (echo "pre_exec failed"; false) || exit'
    #     pre = ''
    #     for elem in cu['description']['pre_exec']:
    #         pre += "%s || %s\n" % (elem, fail)
    #     # Note: extra spaces below are for visual alignment
    #     launch_script.write("# Pre-exec commands\n")
    #     if self._prof.enabled:
    #         launch_script.write("echo cu_pre_start `%s` >> %s/%s.prof\n"\
    #                           % (cu['gtod'], sandbox, cu['uid']))
    #     launch_script.write(pre)
    #     if self._prof.enabled:
    #         launch_script.write("echo cu_pre_stop `%s` >> %s/%s.prof\n" \
    #                           % (cu['gtod'], sandbox, cu['uid']))

    # TODO: post_exec
    # # After the universe dies the infrared death, there will be nothing
    # if cu['description']['post_exec']:
    #     fail = ' (echo "post_exec failed"; false) || exit'
    #     post = ''
    #     for elem in cu['description']['post_exec']:
    #         post += "%s || %s\n" % (elem, fail)
    #     launch_script.write("# Post-exec commands\n")
    #     if self._prof.enabled:
    #         launch_script.write("echo cu_post_start `%s` >> %s/%s.prof\n"
    #                           % (cu['gtod'], sandbox, cu['uid']))
    #     launch_script.write('%s\n' % post)
    #     if self._prof.enabled:
    #         launch_script.write("echo cu_post_stop `%s` >> %s/%s.prof\n"
    #                           % (cu['gtod'], sandbox, cu['uid']))

    # The actual command line, constructed per launch-method
    try:
        orte_command, task_command = launcher.construct_command(cu, None)
    except Exception as e:
        msg = "Error in spawner (%s)" % e
        self._log.exception(msg)
        raise RuntimeError(msg)

    # Construct arguments to submit_job.  The list also keeps every C string
    # referenced until the submit call below has been made.
    arg_list = []

    def add_args(*values):
        # Every value gets the same str() conversion before it is handed
        # to cffi.
        for value in values:
            arg_list.append(ffi.new("char[]", str(value)))

    # Take the orte specific commands and split them
    add_args(*orte_command.split())

    # Set the working directory
    add_args("--wdir", sandbox)

    # Set RP environment variables
    rp_envs = [
        "RP_SESSION_ID=%s" % self._cfg['sid'],
        "RP_PILOT_ID=%s" % self._cfg['pid'],
        "RP_AGENT_ID=%s" % self._cfg['aid'],
        "RP_SPAWNER_ID=%s" % self.uid,
        "RP_UNIT_ID=%s" % cu['uid'],
        "RP_UNIT_NAME=%s" % cu['description'].get('name'),
        "RP_PILOT_STAGING=%s/staging_area" % self._pwd,
    ]
    for env in rp_envs:
        add_args("-x", env)

    # Set pre-populated environment variables
    if self._cu_environment:
        for key, val in self._cu_environment.items():
            add_args("-x", "%s=%s" % (key, val))

    # Set environment variables specified for this CU
    if cu['description']['environment']:
        for key, val in cu['description']['environment'].items():
            add_args("-x", "%s=%s" % (key, val))

    # Let the orted write stdout and stderr to rank-based output files
    add_args("--output-filename", "%s:nojobid,nocopy" % str(sandbox))

    # Save retval of actual CU application (in case we have post-exec)
    task_command += "; RETVAL=$?"

    # Wrap in (sub)shell for output redirection
    add_args("sh", "-c")

    if self._prof.enabled:

        def prof_echo(event):
            # Shell snippet appending a timestamped event to the unit's
            # profile file.
            return "echo script %s `%s` >> %s/%s.prof" \
                   % (event, self.gtod, sandbox, cu['uid'])

        task_command = "; ".join([prof_echo('cu_start'),
                                  prof_echo('cu_exec_start'),
                                  task_command,
                                  prof_echo('cu_exec_stop')])

    # The shell exits with the status captured in RETVAL above.
    add_args("%s; exit $RETVAL" % str(task_command))

    self._log.debug("Launching unit %s via %s %s",
                    cu['uid'], orte_command, task_command)

    # NULL termination, required by ORTE
    arg_list.append(ffi.NULL)
    argv = ffi.new("char *[]", arg_list)

    # stdout/stderr filenames can't be set with orte
    # TODO: assert here or earlier?
    # assert cu['description'].get('stdout') == None
    # assert cu['description'].get('stderr') == None

    # prepare stdout/stderr
    # TODO: when mpi==True && cores>1 there will be multiple files that need
    #       to be concatenated.
    cu['stdout_file'] = os.path.join(sandbox, 'rank.0/stdout')
    cu['stderr_file'] = os.path.join(sandbox, 'rank.0/stderr')

    # Submit to the DVM!
    index = ffi.new("int *")
    with self.task_map_lock:
        self._prof.prof('exec_start', uid=cu['uid'])
        rc = orte_lib.orte_submit_job(argv, index,
                                      orte_lib.launch_cb, self._myhandle,
                                      orte_lib.finish_cb, self._myhandle)
        if rc:
            raise Exception("submit job failed with error: %d" % rc)
        self.task_map[index[0]] = cu  # map ORTE index to CU
        self._prof.prof('exec_ok', uid=cu['uid'])

    # The uid is logged with %s; it is used as a string everywhere else in
    # this method, and %d cannot format a string.
    self._log.debug("Task %s submitted!", cu['uid'])