class IPClusterEnsembleJPL2(SurveyEnsemble):
    """Parallelized survey ensemble based on IPython parallel (ipcluster).

    Supports three modes, selected by ``ensemble_mode``:
      * 'init-only'  -- initialize the base class and return immediately.
      * 'standalone' -- run simulations serially in-process (no ipyparallel).
      * default      -- connect to an ipyparallel cluster and farm runs out.
    """

    def __init__(self, ensemble_controller=None, ensemble_mode='', **specs):
        """Connect to the cluster (unless a bail-out mode is requested) and
        instantiate a SurveySimulation on every engine.

        Args:
            ensemble_controller (str): ipyparallel controller selector: a
                '.json' connection-file path, or a profile name.
            ensemble_mode (str): mode flags; may contain 'init-only' or
                'standalone' (see class docstring).
            **specs: forwarded to SurveyEnsemble and pushed to the engines.
        """
        SurveyEnsemble.__init__(self, **specs)
        # allow bail-out before any cluster connection is attempted
        if 'init-only' in ensemble_mode:
            self.vprint("SurveyEnsemble: initialize-only mode")
            return
        if 'standalone' in ensemble_mode:
            self.vprint("SurveyEnsemble: standalone mode: no ipyparallel")
            self.standalone = True
            return
        self.standalone = False
        self.verb = specs.get('verbose', True)
        # specify the cluster: a .json argument names a connection file,
        # anything else is treated as a profile name
        if ensemble_controller:
            if '.json' in ensemble_controller:
                arglist = dict(url_file=ensemble_controller)
            else:
                arglist = dict(profile=ensemble_controller)
        else:
            arglist = dict()
        # access the cluster
        self.rc = Client(**arglist)
        self.dview = self.rc[:]
        self.dview.block = True
        # these are the imports needed by the run_one()
        # NOTE(review): cPickle was replaced by pickle for Python 3
        # compatibility -- confirm run_one() does not use the cPickle name.
        with self.dview.sync_imports():
            import EXOSIMS, EXOSIMS.util.get_module, EXOSIMS_local, \
                time, os, os.path, random, numpy, pickle, gzip, traceback
        if 'logger' in specs:
            specs.pop('logger')
        # pop the seed from the specs to force re-seeding on each engine
        if 'seed' in specs:
            specs.pop('seed')
        # push the specs to the engines
        self.dview.push(dict(specs=specs))
        # instantiate a SurveySimulation in the global workspace on each engine
        res = self.dview.execute(
            "SS = EXOSIMS.util.get_module.get_module_from_specs" +
            "(specs, 'SurveySimulation')(**specs)")
        self.vprint("Created SurveySimulation objects on %d engines."
                    % len(self.rc.ids))
        # pull the seeds from each engine
        seeds = self.dview.pull('SS.seed', block=True)
        # print stdout/stderr of each engine's activity - this is likely to be
        # captured in the invoking function. Note, we don't have access to the
        # parent SS.seed here.  (dead "if True:" guard removed)
        for row, erow, eng_id, seed in zip(res.stdout, res.stderr,
                                           res.engine_id, seeds):
            print('==== Engine = %d, Seed = %d ====' % (eng_id, seed))
            if erow:
                msg = ''.join(['[#%d] Error: %s\n' % (eng_id, line)
                               for line in erow.split('\n') if line])
                print(msg)
                sys.stderr.write(msg)
            print(''.join(['[#%d] %s\n' % (eng_id, line)
                           for line in row.split('\n')]))
        # we will use the load-balanced view for cluster Exosims runs
        self.lview = self.rc.load_balanced_view()

    def run_ensemble(self, sim, nb_run_sim, run_one=None, genNewPlanets=True,
                     rewindPlanets=True, kwargs={}):
        """Dispatch to the standalone or the ipyparallel runner."""
        if self.standalone:
            return self.run_ensemble_stand(sim, nb_run_sim, run_one,
                                           genNewPlanets, rewindPlanets, kwargs)
        else:
            return self.run_ensemble_ipp(sim, nb_run_sim, run_one,
                                         genNewPlanets, rewindPlanets, kwargs)

    def run_ensemble_stand(self, sim, nb_run_sim, run_one, genNewPlanets=True,
                           rewindPlanets=True, kwargs={}):
        r'''Stand-alone simulation runner: runs serially in this process.

        Each run's stdout is redirected to <outpath>/log/log-<seed>.out.

        Returns:
            list: the value run_one() returned for each simulation.
        '''
        t1 = time.time()
        res = []
        for j in range(nb_run_sim):
            if nb_run_sim > 1:
                print('Survey simulation: %s/%s' % (j + 1, int(nb_run_sim)))
            seed = sim.seed
            fn = os.path.join(kwargs['outpath'], 'log', 'log-%d.out' % (seed,))
            # close the log file deterministically (the original leaked the
            # handle returned by open())
            with open(fn, 'w') as log_fh:
                with RedirectStdStreams(stdout=log_fh):
                    ar = run_one(genNewPlanets=genNewPlanets,
                                 rewindPlanets=rewindPlanets, **kwargs)
            res.append(ar)
        t2 = time.time()
        self.vprint("Completed %s simulation(s) in %d sec"
                    % (int(nb_run_sim), t2 - t1))
        return res

    def run_ensemble_ipp(self, sim, nb_run_sim, run_one=None, genNewPlanets=True,
                         rewindPlanets=True, kwargs={}):
        """Cluster simulation runner: farm runs out via the load-balanced view.

        Returns:
            list: the per-run results (the run seeds).

        Raises:
            ValueError: if run_one is not supplied.
        """
        if not run_one:
            # Python 3 call syntax (was the py2-only "raise ValueError, msg")
            raise ValueError('Require a run_one function to be provided')
        t1 = time.time()
        async_res = []
        for j in range(nb_run_sim):
            ar = self.lview.apply_async(run_one, genNewPlanets=genNewPlanets,
                                        rewindPlanets=rewindPlanets, **kwargs)
            async_res.append(ar)
        print("Submitted %d tasks." % len(async_res))
        ar = self.rc._asyncresult_from_jobs(async_res)
        # ad hoc status-reporting
        progress = 0
        while not ar.ready():
            ar.wait(10.)
            clear_output(wait=True)
            if ar.progress == 0:
                forecast = 'not yet able to forecast time remaining.'
            elif ar.progress > progress:
                # update forecast right after we learn more about
                # job-completion rate; otherwise, the accuracy of the rate
                # is diminished
                progress = ar.progress
                timeleft = ar.elapsed / ar.progress * (nb_run_sim - ar.progress)
                if timeleft > 3600.:
                    timeleftstr = "%2.2f hours" % (timeleft / 3600.)
                elif timeleft > 60.:
                    timeleftstr = "%2.2f minutes" % (timeleft / 60.)
                else:
                    timeleftstr = "%2.2f seconds" % timeleft
                forecast = 'about ' + timeleftstr + ' to go.'
            print("%4i/%i tasks finished after %4i s -- %s"
                  % (ar.progress, nb_run_sim, ar.elapsed, forecast), end="")
            sys.stdout.flush()
        t2 = time.time()
        print("\nCompleted in %d sec" % (t2 - t1))
        # output the ipp engine stdout's to log-files
        for j, ar1 in enumerate(async_res):
            # retrieve result - just the seed, actually
            seed1 = ar1.get()
            fn = os.path.join(kwargs['outpath'], 'log', 'log-%s.out' % seed1)
            with open(fn, 'w') as fp:
                for line in ar1.stdout:
                    fp.write(line)
            if ar1.stderr:
                fn = os.path.join(kwargs['outpath'], 'log', 'log-%s.err' % seed1)
                with open(fn, 'w') as fp:
                    for line in ar1.stderr:
                        fp.write(line)
        # return the list of seeds
        return [ar.get() for ar in async_res]
class IPClusterEnsemble(SurveyEnsemble):
    """Parallelized survey ensemble based on IPython parallel (ipcluster).

    Variant that watches for hung worker runs; when a hang is detected it
    aborts outstanding jobs, signals the busy engines, and stops ipcluster.
    """

    def __init__(self, **specs):
        """Connect to the cluster and build a SurveySimulation on every engine.

        Args:
            **specs: forwarded to SurveyEnsemble and pushed to the engines.
        """
        SurveyEnsemble.__init__(self, **specs)
        self.verb = specs.get('verbose', True)
        # access the cluster
        self.rc = Client()
        self.dview = self.rc[:]
        self.dview.block = True
        # imports that the per-run worker needs on each engine
        with self.dview.sync_imports():
            import EXOSIMS, EXOSIMS.util.get_module, \
                os, os.path, time, random, pickle, traceback, numpy
        # logger/seed are dropped so each engine builds and seeds its own SS
        if 'logger' in specs:
            specs.pop('logger')
        if 'seed' in specs:
            specs.pop('seed')
        self.dview.push(dict(specs=specs))
        self.vprint("Building SurveySimulation object on all workers.")
        res = self.dview.execute("SS = EXOSIMS.util.get_module.get_module(specs['modules'] \
            ['SurveySimulation'], 'SurveySimulation')(**specs)")
        res2 = self.dview.execute("SS.reset_sim()")
        self.vprint("Created SurveySimulation objects on %d engines."%len(self.rc.ids))
        #for row in res.stdout:
        #    self.vprint(row)
        self.lview = self.rc.load_balanced_view()
        self.maxNumEngines = len(self.rc.ids)

    def run_ensemble(self, sim, nb_run_sim, run_one=None, genNewPlanets=True,
            rewindPlanets=True, kwargs={}):
        """Submit nb_run_sim runs, poll progress, and shut the cluster down
        if the remaining runs appear to hang.

        Args:
            sim: local simulation handle (unused here -- presumably kept for
                interface compatibility; TODO confirm against callers)
            nb_run_sim (int): number of simulations to submit
            run_one (callable): per-run worker executed on each engine
            genNewPlanets (bool): forwarded to run_one
            rewindPlanets (bool): forwarded to run_one
            kwargs (dict): extra keyword arguments forwarded to run_one

        Returns:
            list: per-run results, or [1] if hanging runs were detected and
                the cluster was shut down
        """
        hangingRunsOccured = False # keeps track of whether hanging runs have occured
        t1 = time.time()
        async_res = []
        for j in range(nb_run_sim):
            ar = self.lview.apply_async(run_one, genNewPlanets=genNewPlanets,
                rewindPlanets=rewindPlanets, **kwargs)
            async_res.append(ar)
        print("Submitted %d tasks."%len(async_res))
        # map engine index -> OS pid so hung engines can be signalled later
        engine_pids = self.rc[:].apply(os.getpid).get_dict()
        #ar2 = self.lview.apply_async(os.getpid)
        #pids = ar2.get_dict()
        print('engine_pids')
        print(engine_pids)
        runStartTime = time.time()#create job starting time
        avg_time_per_run = 0.
        tmplenoutstandingset = nb_run_sim
        tLastRunFinished = time.time()
        ar = self.rc._asyncresult_from_jobs(async_res)
        while not ar.ready():
            ar.wait(10.)
            clear_output(wait=True)
            if ar.progress > 0:
                timeleft = ar.elapsed/ar.progress * (nb_run_sim - ar.progress)
                if timeleft > 3600.:
                    timeleftstr = "%2.2f hours"%(timeleft/3600.)
                elif timeleft > 60.:
                    timeleftstr = "%2.2f minutes"%(timeleft/60.)
                else:
                    timeleftstr = "%2.2f seconds"%timeleft
            else:
                timeleftstr = "who knows"

            #Terminate hanging runs
            outstandingset = self.rc.outstanding#a set of msg_ids that have been submitted but resunts have not been received
            if len(outstandingset) > 0 and len(outstandingset) < nb_run_sim:#there is at least 1 run still going and we have not just started
                avg_time_per_run = (time.time() - runStartTime)/float(nb_run_sim - len(outstandingset))#compute average amount of time per run
                if len(outstandingset) < tmplenoutstandingset:#The scheduler has finished a run
                    tmplenoutstandingset = len(outstandingset)#update this. should decrease by ~1 or number of cores...
                    tLastRunFinished = time.time()#update tLastRunFinished to the last time a simulation finished (right now)
                    #self.vprint("tmplenoutstandingset %d, tLastRunFinished %0.6f"%(tmplenoutstandingset,tLastRunFinished))
                # NOTE(review): nesting reconstructed -- hang check assumed to
                # sit inside the outer "runs outstanding" branch; confirm.
                if time.time() - tLastRunFinished > avg_time_per_run*(1. + self.maxNumEngines*2.)*4.:
                    # nothing finished for ~4x the expected per-run interval:
                    # treat the remaining jobs as hung and tear everything down
                    #nb_run_sim = len(self.rc.outstanding)
                    #restartRuns = True
                    self.vprint('Aborting ' + str(len(self.rc.outstanding)) + 'qty outstandingset jobs')
                    #runningPIDS = os.listdir('/proc') # get all running pids
                    self.vprint('queue_status')
                    self.vprint(str(self.rc.queue_status()))
                    self.rc.abort()
                    ar.wait(20)
                    runningPIDS = [int(tpid) for tpid in os.listdir('/proc') if tpid.isdigit()]
                    #[self.rc.queue_status()[eind] for eind in np.arange(self.maxNumEngines) if self.rc.queue_status()[eind]['tasks']>0]
                    # SIGTERM (15) every engine that still has tasks queued
                    for engineInd in [eind for eind in np.arange(self.maxNumEngines) if self.rc.queue_status()[eind]['tasks']>0]:
                        os.kill(engine_pids[engineInd],15)
                        time.sleep(20)
                    # for pid in [engine_pids[eind] for eind in np.arange(len(engine_pids))]:
                    #     if pid in runningPIDS:
                    #         os.kill(pid,9) # send kill command to stop this worker
                    stopIPClusterCommand = subprocess.Popen(['ipcluster','stop'])
                    stopIPClusterCommand.wait()
                    time.sleep(60) # doing this instead of waiting for ipcluster to terminate
                    stopIPClusterCommand = subprocess.Popen(['ipcluster','stop'])
                    stopIPClusterCommand.wait()
                    time.sleep(60) # doing this instead of waiting for ipcluster to terminate
                    hangingRunsOccured = True # keeps track of whether hanging runs have occured
                    break
                    #stopIPClusterCommand.wait() # waits for process to terminate
                    #call(["ipcluster","stop"]) # send command to stop ipcluster
                    #self.rc.abort(jobs=self.rc.outstanding.copy().pop())
                    #self.rc.abort()#by default should abort all outstanding jobs...
                    #it is possible that this will not stop the jobs running
                    #ar.wait(100)
                    #self.rc.purge_everything() # purge all results if outstanding *because rc.abort() didn't seem to do the job right
                    # NOTE(review): unreachable -- the break above always fires first
                    tLastRunFinished = time.time()#update tLastRunFinished to the last time a simulation was restarted (right now)
            print("%4i/%i tasks finished after %4i s. About %s to go." % (ar.progress,
                nb_run_sim, ar.elapsed, timeleftstr), end="")
            sys.stdout.flush()
            #numRunStarts += 1 # increment number of run restarts
        t2 = time.time()
        print("\nCompleted in %d sec" % (t2 - t1))
        if hangingRunsOccured: #hanging runs have occured
            res = [1]
        else:
            res = [ar.get() for ar in async_res]
        return res
class IPClusterEnsemble(SurveyEnsemble):
    """Parallelized survey ensemble based on IPython parallel (ipcluster).

    Variant that watches for hung worker runs; when a hang is detected it
    aborts outstanding jobs, signals the busy engines, and stops ipcluster.
    """

    def __init__(self, **specs):
        """Connect to the cluster and build a SurveySimulation on every engine.

        Args:
            **specs: forwarded to SurveyEnsemble and pushed to the engines.
        """
        SurveyEnsemble.__init__(self, **specs)
        self.verb = specs.get('verbose', True)
        # access the cluster
        self.rc = Client()
        self.dview = self.rc[:]
        self.dview.block = True
        # imports that the per-run worker needs on each engine
        with self.dview.sync_imports():
            import EXOSIMS, EXOSIMS.util.get_module, \
                os, os.path, time, random, pickle, traceback, numpy
        # logger/seed are dropped so each engine builds and seeds its own SS
        if 'logger' in specs:
            specs.pop('logger')
        if 'seed' in specs:
            specs.pop('seed')
        self.dview.push(dict(specs=specs))
        self.vprint("Building SurveySimulation object on all workers.")
        res = self.dview.execute(
            "SS = EXOSIMS.util.get_module.get_module(specs['modules'] \
            ['SurveySimulation'], 'SurveySimulation')(**specs)")
        res2 = self.dview.execute("SS.reset_sim()")
        self.vprint("Created SurveySimulation objects on %d engines."
                    % len(self.rc.ids))
        #for row in res.stdout:
        #    self.vprint(row)
        self.lview = self.rc.load_balanced_view()
        self.maxNumEngines = len(self.rc.ids)

    def run_ensemble(self, sim, nb_run_sim, run_one=None, genNewPlanets=True,
                     rewindPlanets=True, kwargs={}):
        """Submit nb_run_sim runs, poll progress, and shut the cluster down
        if the remaining runs appear to hang.

        Args:
            sim: local simulation handle (unused here -- presumably kept for
                interface compatibility; TODO confirm against callers)
            nb_run_sim (int): number of simulations to submit
            run_one (callable): per-run worker executed on each engine
            genNewPlanets (bool): forwarded to run_one
            rewindPlanets (bool): forwarded to run_one
            kwargs (dict): extra keyword arguments forwarded to run_one

        Returns:
            list: per-run results, or [1] if hanging runs were detected and
                the cluster was shut down
        """
        hangingRunsOccured = False  # keeps track of whether hanging runs have occured
        t1 = time.time()
        async_res = []
        for j in range(nb_run_sim):
            ar = self.lview.apply_async(run_one, genNewPlanets=genNewPlanets,
                                        rewindPlanets=rewindPlanets, **kwargs)
            async_res.append(ar)
        print("Submitted %d tasks." % len(async_res))
        # map engine index -> OS pid so hung engines can be signalled later
        engine_pids = self.rc[:].apply(os.getpid).get_dict()
        #ar2 = self.lview.apply_async(os.getpid)
        #pids = ar2.get_dict()
        print('engine_pids')
        print(engine_pids)
        runStartTime = time.time()  #create job starting time
        avg_time_per_run = 0.
        tmplenoutstandingset = nb_run_sim
        tLastRunFinished = time.time()
        ar = self.rc._asyncresult_from_jobs(async_res)
        while not ar.ready():
            ar.wait(10.)
            clear_output(wait=True)
            if ar.progress > 0:
                timeleft = ar.elapsed / ar.progress * (nb_run_sim - ar.progress)
                if timeleft > 3600.:
                    timeleftstr = "%2.2f hours" % (timeleft / 3600.)
                elif timeleft > 60.:
                    timeleftstr = "%2.2f minutes" % (timeleft / 60.)
                else:
                    timeleftstr = "%2.2f seconds" % timeleft
            else:
                timeleftstr = "who knows"

            #Terminate hanging runs
            outstandingset = self.rc.outstanding  #a set of msg_ids that have been submitted but resunts have not been received
            if len(outstandingset) > 0 and len(
                    outstandingset
            ) < nb_run_sim:  #there is at least 1 run still going and we have not just started
                avg_time_per_run = (time.time() - runStartTime) / float(
                    nb_run_sim - len(outstandingset)
                )  #compute average amount of time per run
                if len(
                        outstandingset
                ) < tmplenoutstandingset:  #The scheduler has finished a run
                    tmplenoutstandingset = len(
                        outstandingset
                    )  #update this. should decrease by ~1 or number of cores...
                    tLastRunFinished = time.time(
                    )  #update tLastRunFinished to the last time a simulation finished (right now)
                    #self.vprint("tmplenoutstandingset %d, tLastRunFinished %0.6f"%(tmplenoutstandingset,tLastRunFinished))
                # NOTE(review): nesting reconstructed -- hang check assumed to
                # sit inside the outer "runs outstanding" branch; confirm.
                if time.time() - tLastRunFinished > avg_time_per_run * (
                        1. + self.maxNumEngines * 2.) * 4.:
                    # nothing finished for ~4x the expected per-run interval:
                    # treat the remaining jobs as hung and tear everything down
                    #nb_run_sim = len(self.rc.outstanding)
                    #restartRuns = True
                    self.vprint('Aborting ' + str(len(self.rc.outstanding)) +
                                'qty outstandingset jobs')
                    #runningPIDS = os.listdir('/proc') # get all running pids
                    self.vprint('queue_status')
                    self.vprint(str(self.rc.queue_status()))
                    self.rc.abort()
                    ar.wait(20)
                    runningPIDS = [
                        int(tpid) for tpid in os.listdir('/proc')
                        if tpid.isdigit()
                    ]
                    #[self.rc.queue_status()[eind] for eind in np.arange(self.maxNumEngines) if self.rc.queue_status()[eind]['tasks']>0]
                    # SIGTERM (15) every engine that still has tasks queued
                    for engineInd in [
                            eind for eind in np.arange(self.maxNumEngines)
                            if self.rc.queue_status()[eind]['tasks'] > 0
                    ]:
                        os.kill(engine_pids[engineInd], 15)
                        time.sleep(20)
                    # for pid in [engine_pids[eind] for eind in np.arange(len(engine_pids))]:
                    #     if pid in runningPIDS:
                    #         os.kill(pid,9) # send kill command to stop this worker
                    stopIPClusterCommand = subprocess.Popen(
                        ['ipcluster', 'stop'])
                    stopIPClusterCommand.wait()
                    time.sleep(
                        60
                    )  # doing this instead of waiting for ipcluster to terminate
                    stopIPClusterCommand = subprocess.Popen(
                        ['ipcluster', 'stop'])
                    stopIPClusterCommand.wait()
                    time.sleep(
                        60
                    )  # doing this instead of waiting for ipcluster to terminate
                    hangingRunsOccured = True  # keeps track of whether hanging runs have occured
                    break
                    #stopIPClusterCommand.wait() # waits for process to terminate
                    #call(["ipcluster","stop"]) # send command to stop ipcluster
                    #self.rc.abort(jobs=self.rc.outstanding.copy().pop())
                    #self.rc.abort()#by default should abort all outstanding jobs...
                    #it is possible that this will not stop the jobs running
                    #ar.wait(100)
                    #self.rc.purge_everything() # purge all results if outstanding *because rc.abort() didn't seem to do the job right
                    # NOTE(review): unreachable -- the break above always fires first
                    tLastRunFinished = time.time(
                    )  #update tLastRunFinished to the last time a simulation was restarted (right now)
            print("%4i/%i tasks finished after %4i s. About %s to go."
                  % (ar.progress, nb_run_sim, ar.elapsed, timeleftstr), end="")
            sys.stdout.flush()
            #numRunStarts += 1 # increment number of run restarts
        t2 = time.time()
        print("\nCompleted in %d sec" % (t2 - t1))
        if hangingRunsOccured:  #hanging runs have occured
            res = [1]
        else:
            res = [ar.get() for ar in async_res]
        return res
class IPClusterEnsemble(SurveyEnsemble):
    """Parallelized survey ensemble based on IPython parallel (ipcluster)."""

    def __init__(self, **specs):
        """Connect to the cluster and build a SurveySimulation on every engine.

        Args:
            **specs: forwarded to SurveyEnsemble and pushed to the engines.
        """
        SurveyEnsemble.__init__(self, **specs)
        self.verb = specs.get('verbose', True)
        # access the cluster
        self.rc = Client()
        self.dview = self.rc[:]
        self.dview.block = True
        # NOTE(review): pickle replaces the Python-2-only cPickle module
        # (matching the sibling ensemble class in this file) -- confirm the
        # per-run worker does not reference the cPickle name on the engines.
        with self.dview.sync_imports():
            import EXOSIMS, EXOSIMS.util.get_module, \
                os, os.path, time, random, pickle, traceback
        # dict.has_key() was removed in Python 3; use the 'in' operator
        if 'logger' in specs:
            specs.pop('logger')
        if 'seed' in specs:
            specs.pop('seed')
        self.dview.push(dict(specs=specs))
        res = self.dview.execute(
            "SS = EXOSIMS.util.get_module.get_module(specs['modules'] \
            ['SurveySimulation'], 'SurveySimulation')(**specs)")
        self.vprint("Created SurveySimulation objects on %d engines."
                    % len(self.rc.ids))
        #for row in res.stdout:
        #    self.vprint(row)
        self.lview = self.rc.load_balanced_view()

    def run_ensemble(self, sim, nb_run_sim, run_one=None, genNewPlanets=True,
                     rewindPlanets=True, kwargs={}):
        """Submit nb_run_sim runs to the load-balanced view and poll until done.

        Args:
            sim: local simulation handle (unused here -- presumably kept for
                interface compatibility; TODO confirm against callers)
            nb_run_sim (int): number of simulations to submit
            run_one (callable): per-run worker executed on each engine
            genNewPlanets (bool): forwarded to run_one
            rewindPlanets (bool): forwarded to run_one
            kwargs (dict): extra keyword arguments forwarded to run_one

        Returns:
            list: the per-run results (whatever run_one returns).
        """
        t1 = time.time()
        async_res = []
        for j in range(nb_run_sim):
            ar = self.lview.apply_async(run_one, genNewPlanets=genNewPlanets,
                                        rewindPlanets=rewindPlanets, **kwargs)
            async_res.append(ar)
        print("Submitted %d tasks." % len(async_res))
        ar = self.rc._asyncresult_from_jobs(async_res)
        while not ar.ready():
            ar.wait(10.)
            clear_output(wait=True)
            if ar.progress > 0:
                # extrapolate remaining time from the completion rate so far
                timeleft = ar.elapsed / ar.progress * (nb_run_sim - ar.progress)
                if timeleft > 3600.:
                    timeleftstr = "%2.2f hours" % (timeleft / 3600.)
                elif timeleft > 60.:
                    timeleftstr = "%2.2f minutes" % (timeleft / 60.)
                else:
                    timeleftstr = "%2.2f seconds" % timeleft
            else:
                timeleftstr = "who knows"
            print("%4i/%i tasks finished after %4i s. About %s to go."
                  % (ar.progress, nb_run_sim, ar.elapsed, timeleftstr), end="")
            sys.stdout.flush()
        #self.rc.wait(async_res)
        #self.rc.wait_interactive(async_res)
        t2 = time.time()
        print("\nCompleted in %d sec" % (t2 - t1))
        res = [ar.get() for ar in async_res]
        return res
class IPClusterEnsembleJPL(SurveyEnsemble):
    """Parallelized survey ensemble based on IPython parallel (ipcluster)."""

    def __init__(self, ensemble_controller=None, ensemble_mode=None, **specs):
        """Connect to the cluster (unless 'init-only' is requested) and
        instantiate a SurveySimulation on every engine.

        Args:
            ensemble_controller (str): ipyparallel controller selector: a
                '.json' connection-file path, or a profile name.
            ensemble_mode (str): if it contains 'init-only', return after
                base-class initialization without touching the cluster.
            **specs: forwarded to SurveyEnsemble and pushed to the engines.
        """
        SurveyEnsemble.__init__(self, **specs)
        # allow bail-out
        if ensemble_mode and 'init-only' in ensemble_mode:
            self.vprint("SurveyEnsemble: initialize-only mode")
            return
        self.verb = specs.get('verbose', True)
        # specify the cluster: a .json argument names a connection file,
        # anything else is treated as a profile name
        if ensemble_controller:
            if '.json' in ensemble_controller:
                arglist = dict(url_file=ensemble_controller)
            else:
                arglist = dict(profile=ensemble_controller)
        else:
            arglist = dict()
        # access the cluster
        self.rc = Client(**arglist)
        self.dview = self.rc[:]
        self.dview.block = True
        # these are the imports needed by the run_one()
        # NOTE(review): cPickle replaced by pickle for Python 3 compatibility;
        # confirm run_one() does not reference the cPickle name.
        with self.dview.sync_imports():
            import EXOSIMS, EXOSIMS.util.get_module, \
                time, os, os.path, random, pickle, gzip, traceback
        # dict.has_key() was removed in Python 3; use the 'in' operator
        if 'logger' in specs:
            specs.pop('logger')
        if 'seed' in specs:
            specs.pop('seed')
        self.dview.push(dict(specs=specs))
        res = self.dview.execute(
            "SS = EXOSIMS.util.get_module.get_module_from_specs" +
            "(specs, 'SurveySimulation')(**specs)")
        self.vprint("Created SurveySimulation objects on %d engines."
                    % len(self.rc.ids))
        # optionally print stdout of each engine's activity (debug toggle)
        if False:
            for row, eng_id in zip(res.stdout, res.engine_id):
                print(''.join(['[#%d] %s\n' % (eng_id, line)
                               for line in row.split('\n')]))
        self.lview = self.rc.load_balanced_view()

    def run_ensemble(self, sim, nb_run_sim, run_one=None, genNewPlanets=True,
                     rewindPlanets=True, kwargs={}):
        """Submit nb_run_sim runs to the load-balanced view and poll until done.

        Args:
            sim: local simulation handle (unused here -- presumably kept for
                interface compatibility; TODO confirm against callers)
            nb_run_sim (int): number of simulations to submit
            run_one (callable): per-run worker executed on each engine
            genNewPlanets (bool): forwarded to run_one
            rewindPlanets (bool): forwarded to run_one
            kwargs (dict): extra keyword arguments forwarded to run_one

        Returns:
            list: the per-run results.

        Raises:
            ValueError: if run_one is not supplied.
        """
        if not run_one:
            # Python 3 call syntax (was the py2-only "raise ValueError, msg")
            raise ValueError('Require a run_one function to be provided')
        t1 = time.time()
        async_res = []
        for j in range(nb_run_sim):
            ar = self.lview.apply_async(run_one, genNewPlanets=genNewPlanets,
                                        rewindPlanets=rewindPlanets, **kwargs)
            async_res.append(ar)
        print("Submitted %d tasks." % len(async_res))
        ar = self.rc._asyncresult_from_jobs(async_res)
        while not ar.ready():
            ar.wait(10.)
            clear_output(wait=True)
            if ar.progress > 0:
                # extrapolate remaining time from the completion rate so far
                timeleft = ar.elapsed / ar.progress * (nb_run_sim - ar.progress)
                if timeleft > 3600.:
                    timeleftstr = "%2.2f hours" % (timeleft / 3600.)
                elif timeleft > 60.:
                    timeleftstr = "%2.2f minutes" % (timeleft / 60.)
                else:
                    timeleftstr = "%2.2f seconds" % timeleft
                forecast = 'about ' + timeleftstr + ' to go.'
            else:
                forecast = 'not yet able to forecast time remaining.'
            print("%4i/%i tasks finished after %4i s -- %s"
                  % (ar.progress, nb_run_sim, ar.elapsed, forecast), end="")
            sys.stdout.flush()
        #self.rc.wait(async_res)
        #self.rc.wait_interactive(async_res)
        t2 = time.time()
        print("\nCompleted in %d sec" % (t2 - t1))
        res = [ar.get() for ar in async_res]
        return res
def power_of_test(data1, data2, rvs_func='rvs_pairs', tests=['chi2_2samp'],
                  rvs_key={}, test_key={}, parallel=None, sync=True):
    """Compute the corresponding p-values for each histogram pair from the
    random variates of the given 2 samples/frequencies for size_times.

    Parameters
    ----------
    data1, data2 : sequence of 1-D ndarrays
        Input data. Observed samples or frequencies.
    rvs_func : [callable|str], optional, default : "rvs_pairs"
        The random variates function. Either a callable or the string
        "rvs_pairs" (resolved through globals()).
    tests : ([callable|str],...), optional, default : ["chi2_2samp"]
        A list of *test* statistical functions. Each entry is either a
        callable or the name of a function on the TS module
        ("chi2_2samp", "BDM_2samp", "likelihoodratio_ksamp",
        "likelihoodvalue_ksamp", "ks_2samp", "anderson_ksamp", "CVM_2samp").
    rvs_key : dict, optional, default : {}
        Keyword arguments for rvs_func. Must contain 'size' when running
        in parallel.  (Never mutated; a copy is taken internally.)
    test_key : dict, optional
        Keyword arguments for the test statistical functions.
    parallel : bool, optional, default : None
        If True, distribute the work over an IPyParallel cluster. If None,
        the global PARALLEL flag is used instead.
    sync : bool, optional, default : True
        When False (and parallel is True), return the IPyParallel
        AsyncResult object instead of the merged results.

    Returns
    -------
    dict
        Mapping of test name to the list of p-values for each histogram pair.
    """
    if parallel is None:  # 'is', not '==': identity test against None
        parallel = PARALLEL
    if parallel:
        global client
        client = None  # so the finally block is safe if Client() fails
        try:
            client = Client(**ipp_profile)
            # work on a copy: the original mutated the caller's dict (and the
            # shared default argument) via the size rebalancing below
            rvs_key = dict(rvs_key)
            size = rvs_key['size']
            N = len(client)
            jobs = []
            for i in range(N):
                # distribute `size` as evenly as possible over the N engines
                rvs_key['size'] = (size // N + 1) if (i < size % N) else size // N
                # BUGFIX: was `test` (an undefined name); must forward `tests`.
                # Pass a snapshot of rvs_key so each job keeps its own size.
                jobs.append(client[client.ids[i]].apply_async(
                    power_of_test, data1, data2, rvs_func, tests,
                    dict(rvs_key), test_key, False))
            ars = client._asyncresult_from_jobs(jobs)
            if sync:
                ars.wait_interactive()
                ret = {}
                # each engine returns a {test_name: [p-values]} dict; merge
                # them key-by-key (the original unpacked the dicts directly,
                # which iterates keys and cannot yield (key, value) pairs)
                for partial in ars.get():
                    for key, val in partial.items():
                        ret.setdefault(key, []).extend(val)
                return ret
            else:
                return ars
        finally:
            if client is not None:
                client.close()
    if isinstance(rvs_func, str):
        rvs_func = globals()[rvs_func]
    if not isinstance(tests, (list, tuple)):
        tests = [tests]
    # normalize to (name, callable) pairs
    tests = [(t, getattr(TS, t)) if isinstance(t, str) else (str(t), t)
             for t in tests]
    ret = {}
    for rvs1, rvs2 in rvs_func(data1, data2, **rvs_key):
        for tname, test in tests:
            ret.setdefault(tname, []).append(
                test(rvs1, rvs2, binned=True, **test_key).pvalue)
    return ret
class IPClusterEnsemble(SurveyEnsemble):
    """Parallelized survey ensemble based on IPython parallel (ipcluster).

    Variant that aborts outstanding jobs when no run has finished for much
    longer than the observed average per-run time.
    """

    def __init__(self, **specs):
        """Connect to the cluster and build a SurveySimulation on every engine.

        Args:
            **specs: forwarded to SurveyEnsemble and pushed to the engines.
        """
        SurveyEnsemble.__init__(self, **specs)
        self.verb = specs.get('verbose', True)
        # access the cluster
        self.rc = Client()
        self.dview = self.rc[:]
        self.dview.block = True
        # NOTE(review): pickle replaces the Python-2-only cPickle module
        # (matching the sibling ensemble class in this file) -- confirm the
        # per-run worker does not reference the cPickle name on the engines.
        with self.dview.sync_imports():
            import EXOSIMS, EXOSIMS.util.get_module, \
                os, os.path, time, random, pickle, traceback
        # dict.has_key() was removed in Python 3; use the 'in' operator
        if 'logger' in specs:
            specs.pop('logger')
        if 'seed' in specs:
            specs.pop('seed')
        self.dview.push(dict(specs=specs))
        res = self.dview.execute(
            "SS = EXOSIMS.util.get_module.get_module(specs['modules'] \
            ['SurveySimulation'], 'SurveySimulation')(**specs)")
        res2 = self.dview.execute("SS.reset_sim()")
        self.vprint("Created SurveySimulation objects on %d engines."
                    % len(self.rc.ids))
        #for row in res.stdout:
        #    self.vprint(row)
        self.lview = self.rc.load_balanced_view()
        self.maxNumEngines = len(self.rc.ids)

    def run_ensemble(self, sim, nb_run_sim, run_one=None, genNewPlanets=True,
                     rewindPlanets=True, kwargs={}):
        """Submit nb_run_sim runs, poll progress, and abort outstanding jobs
        that appear to hang.

        Args:
            sim: local simulation handle (unused here -- presumably kept for
                interface compatibility; TODO confirm against callers)
            nb_run_sim (int): number of simulations to submit
            run_one (callable): per-run worker executed on each engine
            genNewPlanets (bool): forwarded to run_one
            rewindPlanets (bool): forwarded to run_one
            kwargs (dict): extra keyword arguments forwarded to run_one

        Returns:
            list: the per-run results.
        """
        t1 = time.time()
        async_res = []
        for j in range(nb_run_sim):
            ar = self.lview.apply_async(run_one, genNewPlanets=genNewPlanets,
                                        rewindPlanets=rewindPlanets, **kwargs)
            async_res.append(ar)
        print("Submitted %d tasks." % len(async_res))
        runStartTime = time.time()  #create job starting time
        avg_time_per_run = 0.
        tmplenoutstandingset = nb_run_sim
        tLastRunFinished = time.time()
        ar = self.rc._asyncresult_from_jobs(async_res)
        while not ar.ready():
            ar.wait(10.)
            clear_output(wait=True)
            if ar.progress > 0:
                # extrapolate remaining time from the completion rate so far
                timeleft = ar.elapsed / ar.progress * (nb_run_sim - ar.progress)
                if timeleft > 3600.:
                    timeleftstr = "%2.2f hours" % (timeleft / 3600.)
                elif timeleft > 60.:
                    timeleftstr = "%2.2f minutes" % (timeleft / 60.)
                else:
                    timeleftstr = "%2.2f seconds" % timeleft
            else:
                timeleftstr = "who knows"
            #Terminate hanging runs
            outstandingset = self.rc.outstanding  # msg_ids submitted but without results yet
            if len(outstandingset) > 0 and len(outstandingset) < nb_run_sim:
                # at least 1 run still going and we are past the start-up phase
                avg_time_per_run = (time.time() - runStartTime) / float(
                    nb_run_sim - len(outstandingset))  # average time per finished run
                if len(outstandingset) < tmplenoutstandingset:
                    # the scheduler has finished at least one more run;
                    # should decrease by ~1 or the number of cores
                    tmplenoutstandingset = len(outstandingset)
                    tLastRunFinished = time.time()
                if time.time() - tLastRunFinished > avg_time_per_run * (
                        1 + self.maxNumEngines * 2):
                    self.vprint('Aborting ' + str(len(self.rc.outstanding)) +
                                'qty outstandingset jobs')
                    # by default abort() targets all outstanding jobs; note it
                    # may not stop tasks that are already running
                    self.rc.abort()
            print("%4i/%i tasks finished after %4i s. About %s to go."
                  % (ar.progress, nb_run_sim, ar.elapsed, timeleftstr), end="")
            sys.stdout.flush()
        t2 = time.time()
        print("\nCompleted in %d sec" % (t2 - t1))
        res = [ar.get() for ar in async_res]
        return res