def map(self, parallel_task, args):
    from ipyparallel import Client, TimeoutError

    # pick a chunksize so that at most self.max_tasks chunks are submitted
    chunksize = 1
    if self.max_tasks > 0 and len(args) > self.max_tasks:
        chunksize = len(args) // self.max_tasks
        if chunksize * self.max_tasks < len(args):
            chunksize += 1

    client = None
    try:
        client = Client()
    except TimeoutError:
        raise RuntimeError(
            'Cannot connect to the ipyparallel client. Is it running?')

    ar = None
    try:
        # cloudpickle lets us ship closures and locally defined functions
        client[:].use_cloudpickle()
        lbv = client.load_balanced_view()
        ar = lbv.map_async(IppFunctionWrapper(parallel_task, self.timeout),
                           args, chunksize=chunksize)
        try:
            r = []
            # results arrive as (status, payload) pairs: status 0 is a result,
            # status -1 carries the remote traceback
            for k, z in enumerate(
                    tqdm(ar, desc="(IPYPARALLEL)", total=len(args))):
                if z[0] == -1:
                    logger.error(z[1])
                    engine = ar.engine_id[k]
                    client.abort(ar)
                    client.close()
                    raise RuntimeError(
                        'remote failure (task %d of %d on engine %d)' %
                        (k + 1, len(ar), engine))
                elif z[0] == 0:
                    r.append(z[1])
        except KeyboardInterrupt:
            client.abort(ar)
            raise
    finally:
        # always abort remaining tasks and close the client to release resources
        if ar:
            client.abort(ar)
        if client:
            client.close()

    return r
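# IppFunctionWrapper is referenced above but not defined in this snippet.  The
# sketch below is an assumption, not the original implementation: it only shows
# the (status, payload) protocol that map() relies on, where status 0 carries a
# result and status -1 carries a formatted traceback from the remote engine.
# The timeout handling via signal.alarm is likewise illustrative.
import signal
import traceback


class IppFunctionWrapper(object):
    """Hypothetical picklable wrapper matching the protocol expected by map()."""

    def __init__(self, func, timeout=None):
        self.func = func
        self.timeout = timeout

    def __call__(self, arg):
        try:
            if self.timeout and self.timeout > 0:
                # abort the task if it runs longer than the configured timeout
                signal.signal(signal.SIGALRM, self._on_timeout)
                signal.alarm(int(self.timeout))
            result = self.func(arg)
            return (0, result)                    # success: map() appends the payload
        except Exception:
            return (-1, traceback.format_exc())   # failure: map() logs it and aborts
        finally:
            if self.timeout and self.timeout > 0:
                signal.alarm(0)

    @staticmethod
    def _on_timeout(signum, frame):
        raise RuntimeError('remote task exceeded its timeout')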
class IPClusterEnsemble(SurveyEnsemble):
    """Parallelized survey ensemble based on IPython parallel (ipcluster)"""

    def __init__(self, **specs):

        SurveyEnsemble.__init__(self, **specs)

        self.verb = specs.get('verbose', True)

        # access the cluster
        self.rc = Client()
        self.dview = self.rc[:]
        self.dview.block = True
        with self.dview.sync_imports():
            import EXOSIMS, EXOSIMS.util.get_module, \
                os, os.path, time, random, pickle, traceback, numpy
        if 'logger' in specs:
            specs.pop('logger')
        if 'seed' in specs:
            specs.pop('seed')
        self.dview.push(dict(specs=specs))
        self.vprint("Building SurveySimulation object on all workers.")
        res = self.dview.execute("SS = EXOSIMS.util.get_module.get_module(specs['modules'] \
            ['SurveySimulation'], 'SurveySimulation')(**specs)")
        res2 = self.dview.execute("SS.reset_sim()")

        self.vprint("Created SurveySimulation objects on %d engines." % len(self.rc.ids))
        #for row in res.stdout:
        #    self.vprint(row)

        self.lview = self.rc.load_balanced_view()

        self.maxNumEngines = len(self.rc.ids)

    def run_ensemble(self, sim, nb_run_sim, run_one=None, genNewPlanets=True,
                     rewindPlanets=True, kwargs={}):
        """Execute nb_run_sim simulations of run_one across the cluster engines.

        Args:
            sim: the calling simulation object (not used directly here)
            nb_run_sim (int): number of simulations to run
            run_one (callable): function executed on an engine for a single simulation
            genNewPlanets (bool): passed through to run_one
            rewindPlanets (bool): passed through to run_one
            kwargs (dict): additional keyword arguments passed to run_one
        """
        hangingRunsOccured = False  # keeps track of whether hanging runs have occurred
        t1 = time.time()
        async_res = []
        for j in range(nb_run_sim):
            ar = self.lview.apply_async(run_one, genNewPlanets=genNewPlanets,
                                        rewindPlanets=rewindPlanets, **kwargs)
            async_res.append(ar)
        print("Submitted %d tasks." % len(async_res))
        engine_pids = self.rc[:].apply(os.getpid).get_dict()
        #ar2 = self.lview.apply_async(os.getpid)
        #pids = ar2.get_dict()
        print('engine_pids')
        print(engine_pids)

        runStartTime = time.time()  # job starting time
        avg_time_per_run = 0.
        tmplenoutstandingset = nb_run_sim
        tLastRunFinished = time.time()
        ar = self.rc._asyncresult_from_jobs(async_res)
        while not ar.ready():
            ar.wait(10.)
            clear_output(wait=True)
            if ar.progress > 0:
                timeleft = ar.elapsed / ar.progress * (nb_run_sim - ar.progress)
                if timeleft > 3600.:
                    timeleftstr = "%2.2f hours" % (timeleft / 3600.)
                elif timeleft > 60.:
                    timeleftstr = "%2.2f minutes" % (timeleft / 60.)
                else:
                    timeleftstr = "%2.2f seconds" % timeleft
            else:
                timeleftstr = "who knows"

            # Terminate hanging runs
            # msg_ids that have been submitted but whose results have not been received
            outstandingset = self.rc.outstanding
            # there is at least 1 run still going and we have not just started
            if len(outstandingset) > 0 and len(outstandingset) < nb_run_sim:
                # compute average amount of time per run
                avg_time_per_run = (time.time() - runStartTime) / float(nb_run_sim - len(outstandingset))
                if len(outstandingset) < tmplenoutstandingset:  # the scheduler has finished a run
                    # update this; it should decrease by ~1 or the number of cores
                    tmplenoutstandingset = len(outstandingset)
                    # update tLastRunFinished to the last time a simulation finished (right now)
                    tLastRunFinished = time.time()
                    #self.vprint("tmplenoutstandingset %d, tLastRunFinished %0.6f"%(tmplenoutstandingset,tLastRunFinished))
                if time.time() - tLastRunFinished > avg_time_per_run * (1. + self.maxNumEngines * 2.) * 4.:
                    #nb_run_sim = len(self.rc.outstanding)
                    #restartRuns = True
                    self.vprint('Aborting ' + str(len(self.rc.outstanding)) + ' outstanding jobs')
                    #runningPIDS = os.listdir('/proc') # get all running pids
                    self.vprint('queue_status')
                    self.vprint(str(self.rc.queue_status()))
                    self.rc.abort()
                    ar.wait(20)
                    runningPIDS = [int(tpid) for tpid in os.listdir('/proc') if tpid.isdigit()]
                    #[self.rc.queue_status()[eind] for eind in np.arange(self.maxNumEngines) if self.rc.queue_status()[eind]['tasks']>0]
                    for engineInd in [eind for eind in np.arange(self.maxNumEngines)
                                      if self.rc.queue_status()[eind]['tasks'] > 0]:
                        os.kill(engine_pids[engineInd], 15)
                        time.sleep(20)
                    # for pid in [engine_pids[eind] for eind in np.arange(len(engine_pids))]:
                    #     if pid in runningPIDS:
                    #         os.kill(pid, 9)  # send kill command to stop this worker
                    stopIPClusterCommand = subprocess.Popen(['ipcluster', 'stop'])
                    stopIPClusterCommand.wait()
                    time.sleep(60)  # doing this instead of waiting for ipcluster to terminate
                    stopIPClusterCommand = subprocess.Popen(['ipcluster', 'stop'])
                    stopIPClusterCommand.wait()
                    time.sleep(60)  # doing this instead of waiting for ipcluster to terminate
                    hangingRunsOccured = True  # keeps track of whether hanging runs have occurred
                    break
                    #stopIPClusterCommand.wait() # waits for process to terminate
                    #call(["ipcluster","stop"]) # send command to stop ipcluster
                    #self.rc.abort(jobs=self.rc.outstanding.copy().pop())
                    #self.rc.abort() # by default should abort all outstanding jobs...
                    # it is possible that this will not stop the jobs running
                    #ar.wait(100)
                    #self.rc.purge_everything() # purge all results if outstanding *because rc.abort() didn't seem to do the job right
                    # update tLastRunFinished to the last time a simulation was restarted (right now)
                    tLastRunFinished = time.time()

            print("%4i/%i tasks finished after %4i s. About %s to go." %
                  (ar.progress, nb_run_sim, ar.elapsed, timeleftstr), end="")
            sys.stdout.flush()
        #numRunStarts += 1 # increment number of run restarts

        t2 = time.time()
        print("\nCompleted in %d sec" % (t2 - t1))

        if hangingRunsOccured:  # hanging runs have occurred
            res = [1]
        else:
            res = [ar.get() for ar in async_res]

        return res
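# A minimal usage sketch for the ensemble class above.  The run_one below is a
# hypothetical stand-in, not the original EXOSIMS helper: it only illustrates
# the contract that run_ensemble assumes, namely that run_one executes on an
# engine where the global SS (the SurveySimulation built in __init__) already
# exists, and returns a picklable result.
def run_one(genNewPlanets=True, rewindPlanets=True, **kwargs):
    # SS lives in the engine namespace because __init__ pushed specs and built
    # the SurveySimulation object on every engine.
    SS.reset_sim(genNewPlanets=genNewPlanets, rewindPlanets=rewindPlanets)
    SS.run_sim()
    return SS.DRM


# ens = IPClusterEnsemble(**specs)                        # needs a running ipcluster
# drms = ens.run_ensemble(sim, nb_run_sim=100, run_one=run_one)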
class IPClusterEnsemble(SurveyEnsemble):
    """Parallelized survey ensemble based on IPython parallel (ipcluster)"""

    def __init__(self, **specs):

        SurveyEnsemble.__init__(self, **specs)

        self.verb = specs.get('verbose', True)

        # access the cluster
        self.rc = Client()
        self.dview = self.rc[:]
        self.dview.block = True
        with self.dview.sync_imports():
            import EXOSIMS, EXOSIMS.util.get_module, \
                os, os.path, time, random, cPickle, traceback
        if specs.has_key('logger'):
            specs.pop('logger')
        if specs.has_key('seed'):
            specs.pop('seed')
        self.dview.push(dict(specs=specs))
        res = self.dview.execute("SS = EXOSIMS.util.get_module.get_module(specs['modules'] \
            ['SurveySimulation'], 'SurveySimulation')(**specs)")
        res2 = self.dview.execute("SS.reset_sim()")

        self.vprint("Created SurveySimulation objects on %d engines." % len(self.rc.ids))
        #for row in res.stdout:
        #    self.vprint(row)

        self.lview = self.rc.load_balanced_view()

        self.maxNumEngines = len(self.rc.ids)

    def run_ensemble(self, sim, nb_run_sim, run_one=None, genNewPlanets=True,
                     rewindPlanets=True, kwargs={}):
        """Execute nb_run_sim simulations of run_one across the cluster engines.

        Args:
            sim: the calling simulation object (not used directly here)
            nb_run_sim (int): number of simulations to run
            run_one (callable): function executed on an engine for a single simulation
            genNewPlanets (bool): passed through to run_one
            rewindPlanets (bool): passed through to run_one
            kwargs (dict): additional keyword arguments passed to run_one
        """
        t1 = time.time()
        async_res = []
        for j in range(nb_run_sim):
            ar = self.lview.apply_async(run_one, genNewPlanets=genNewPlanets,
                                        rewindPlanets=rewindPlanets, **kwargs)
            async_res.append(ar)
        print("Submitted %d tasks." % len(async_res))

        runStartTime = time.time()  # job starting time
        avg_time_per_run = 0.
        tmplenoutstandingset = nb_run_sim
        tLastRunFinished = time.time()
        ar = self.rc._asyncresult_from_jobs(async_res)
        while not ar.ready():
            ar.wait(10.)
            clear_output(wait=True)
            if ar.progress > 0:
                timeleft = ar.elapsed / ar.progress * (nb_run_sim - ar.progress)
                if timeleft > 3600.:
                    timeleftstr = "%2.2f hours" % (timeleft / 3600.)
                elif timeleft > 60.:
                    timeleftstr = "%2.2f minutes" % (timeleft / 60.)
                else:
                    timeleftstr = "%2.2f seconds" % timeleft
            else:
                timeleftstr = "who knows"

            # Terminate hanging runs
            # msg_ids that have been submitted but whose results have not been received
            outstandingset = self.rc.outstanding
            # there is at least 1 run still going and we have not just started
            if len(outstandingset) > 0 and len(outstandingset) < nb_run_sim:
                # compute average amount of time per run
                avg_time_per_run = (time.time() - runStartTime) / float(nb_run_sim - len(outstandingset))
                if len(outstandingset) < tmplenoutstandingset:  # the scheduler has finished a run
                    # update this; it should decrease by ~1 or the number of cores
                    tmplenoutstandingset = len(outstandingset)
                    # update tLastRunFinished to the last time a simulation finished (right now)
                    tLastRunFinished = time.time()
                    #self.vprint("tmplenoutstandingset %d, tLastRunFinished %0.6f"%(tmplenoutstandingset,tLastRunFinished))
                if time.time() - tLastRunFinished > avg_time_per_run * (1 + self.maxNumEngines * 2):
                    self.vprint('Aborting ' + str(len(self.rc.outstanding)) + ' outstanding jobs')
                    # by default this should abort all outstanding jobs...
                    # it is possible that this will not stop the jobs running
                    self.rc.abort()

            print("%4i/%i tasks finished after %4i s. About %s to go." %
                  (ar.progress, nb_run_sim, ar.elapsed, timeleftstr), end="")
            sys.stdout.flush()

        t2 = time.time()
        print("\nCompleted in %d sec" % (t2 - t1))

        res = [ar.get() for ar in async_res]

        return res
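# All of the ensemble classes above assume an ipcluster is already up and
# reachable through the default profile; the hang-recovery code earlier also
# shells out to `ipcluster stop`.  The sketch below is one illustrative way to
# start such a cluster from Python and block until the engines register; the
# engine count and the polling loop are assumptions, not part of the original code.
import subprocess
import time

from ipyparallel import Client, TimeoutError


def start_ipcluster(n_engines=4, wait=120.):
    """Launch `ipcluster start -n <n_engines>` and wait for the engines to appear."""
    proc = subprocess.Popen(['ipcluster', 'start', '-n', str(n_engines)])
    deadline = time.time() + wait
    while time.time() < deadline:
        try:
            rc = Client()
            if len(rc.ids) >= n_engines:
                return rc, proc        # caller closes rc and runs `ipcluster stop` when done
            rc.close()
        except (TimeoutError, OSError, IOError):
            pass                       # controller not accepting connections yet
        time.sleep(2.)
    proc.terminate()
    raise RuntimeError('ipcluster did not start %d engines within %g s' % (n_engines, wait))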
class ClusterLab(epyc.Lab):
    """A :class:`Lab` running on an ``ipyparallel`` compute cluster.

    Experiments are submitted to engines in the cluster for execution in parallel,
    with the experiments being performed asynchronously to allow for disconnection
    and subsequent retrieval of results. Combined with a persistent
    :class:`LabNotebook`, this allows for fully decoupled access to an on-going
    computational experiment with piecewise retrieval of results.

    This class requires a cluster to already be set up and running, configured
    for persistent access, with access to the necessary code and libraries,
    and with appropriate security information available to the client.
    """

    # Tuning parameters
    WaitingTime = 30  #: Waiting time for checking for job completion. Lower values increase network traffic.

    def __init__(self, notebook=None, url_file=None, profile=None,
                 profile_dir=None, ipython_dir=None, context=None, debug=False,
                 sshserver=None, sshkey=None, password=None, paramiko=None,
                 timeout=10, cluster_id=None, use_dill=False, **extra_args):
        """Create an empty lab attached to the given cluster. Most of the arguments
        are as expected by the ``ipyparallel.Client`` class, and are used to create the
        underlying connection to the cluster. The connection is opened immediately,
        meaning the cluster must be up and accessible when creating a lab to use it.

        :param notebook: the notebook used to store results (defaults to an empty :class:`LabNotebook`)
        :param url_file: file containing connection information for accessing cluster
        :param profile: name of the IPython profile to use
        :param profile_dir: directory containing the profile's connection information
        :param ipython_dir: directory containing profile directories
        :param context: ZMQ context
        :param debug: whether to issue debugging information (defaults to False)
        :param sshserver: username and machine for ssh connections
        :param sshkey: file containing ssh key
        :param password: ssh password
        :param paramiko: True to use paramiko for ssh (defaults to False)
        :param timeout: timeout in seconds for ssh connection (defaults to 10s)
        :param cluster_id: string added to runtime files to prevent collisions
        :param use_dill: whether to use Dill as pickler (defaults to False)"""
        super(epyc.ClusterLab, self).__init__(notebook)

        # record all the connection arguments for later
        self._arguments = dict(url_file=url_file,
                               profile=profile,
                               profile_dir=profile_dir,
                               ipython_dir=ipython_dir,
                               context=context,
                               debug=debug,
                               sshserver=sshserver,
                               sshkey=sshkey,
                               password=password,
                               paramiko=paramiko,
                               timeout=timeout,
                               cluster_id=cluster_id,
                               **extra_args)
        self._client = None

        # connect to the cluster
        self.open()

        # use Dill if requested
        if use_dill:
            self.use_dill()

    # ---------- Protocol ----------

    def open(self):
        """Connect to the cluster."""
        if self._client is None:
            self._client = Client(**self._arguments)

    def close(self):
        """Close down the connection to the cluster."""
        if self._client is not None:
            self._client.close()
            self._client = None

    def recreate(self):
        """Save the arguments needed to re-connect to the cluster we use.

        :returns: a (classname, args) pair"""
        (cn, args) = super(ClusterLab, self).recreate()
        nargs = args.copy()
        nargs.update(self._arguments)
        return (cn, nargs)

    # ---------- Remote control of the compute engines ----------

    def numberOfEngines(self):
        """Return the number of engines available to this lab.

        :returns: the number of engines"""
        return len(self.engines())

    def engines(self):
        """Return a list of the available engines.

        :returns: a list of engines"""
        self.open()
        return self._client[:]

    def use_dill(self):
        """Make the cluster use Dill as pickler for transferring results. This isn't
        generally needed, but is sometimes useful for particularly complex experiments
        such as those involving closures. (Or, to put it another way, if you find
        yourself tempted to use this method, consider re-structuring your experiment
        code.)"""
        self.open()
        with self.sync_imports(quiet=True):
            import dill
        self._client.direct_view().use_dill()

    def sync_imports(self, quiet=False):
        """Return a context manager to control imports onto all the engines
        in the underlying cluster. This method is used within a ``with`` statement.

        Any imports should be done with no experiments running, otherwise the
        method will block until the cluster is quiet. Generally imports will be one
        of the first things done when connecting to a cluster. (But be careful
        not to accidentally try to re-import if re-connecting to a running
        cluster.)

        :param quiet: if True, suppresses messages (defaults to False)
        :returns: a context manager"""
        self.open()
        return self._client[:].sync_imports(quiet=quiet)

    # ---------- Running experiments ----------

    def _mixup(self, ps):
        """Private method to mix up a list of values in-place using a
        Fisher-Yates shuffle (see https://en.wikipedia.org/wiki/Fisher-Yates_shuffle).

        :param ps: the array
        :returns: the array, shuffled in-place"""
        for i in range(len(ps) - 1, 0, -1):
            j = int(numpy.random.random() * i)
            temp = ps[i]
            ps[i] = ps[j]
            ps[j] = temp
        return ps

    def runExperiment(self, e):
        """Run the experiment across the parameter space in parallel using
        all the engines in the cluster. This method returns immediately.

        The experiments are run asynchronously, with the points in the parameter
        space being explored randomly so that intermediate retrievals of results
        are more representative of the overall result. Put another way, for a lot
        of experiments the results available will converge towards a final
        answer, so we can plot them and see the answer emerge.

        :param e: the experiment"""
        # create the parameter space
        space = self.parameterSpace()

        # only proceed if there's work to do
        if len(space) > 0:
            nb = self.notebook()

            # randomise the order of the parameter space so that we evaluate across
            # the space as we go along to try to make intermediate (incomplete) result
            # sets more representative of the overall result set
            ps = self._mixup(space)

            try:
                # connect to the cluster
                self.open()

                # submit an experiment at each point in the parameter space to the cluster
                view = self._client.load_balanced_view()
                jobs = []
                for p in ps:
                    jobs.extend((view.apply_async((lambda p: e.set(p).run()), p)).msg_ids)

                    # there seems to be a race condition in submitting jobs,
                    # whereby jobs get dropped if they're submitted too quickly
                    time.sleep(0.01)

                # record the message ids of all the jobs as submitted but not yet completed
                psjs = zip(ps, jobs)
                for (p, j) in psjs:
                    nb.addPendingResult(p, j)
            finally:
                # commit our pending results in the notebook
                nb.commit()
                self.close()

    def updateResults(self):
        """Update our results with any pending results that have completed since we
        last retrieved results from the cluster.

        :returns: the number of pending results completed at this call"""
        # we do all the tests for pending results against the notebook directly,
        # as the corresponding methods on self call this method themselves
        nb = self.notebook()

        # look for pending results if we're waiting for any
        n = 0
        if nb.numberOfPendingResults() > 0:
            # we have results to get
            self.open()
            for j in set(nb.pendingResults()):
                # query the status of a job
                status = self._client.result_status(j, status_only=False)

                # add all completed jobs to the notebook
                if j in status['completed']:
                    r = status[j]

                    # update the result in the notebook, cancelling the pending
                    # result as well; values come back from Client.result_status()
                    # in varying degrees of list-nesting, which
                    # LabNotebook.addResult() handles itself
                    nb.addResult(r, j)

                    # commit changes to the notebook
                    nb.commit()

                    # purge the completed job from the cluster
                    self._client.purge_hub_results(j)

                    # record that we retrieved the results for the given job
                    n = n + 1
        return n

    # ---------- Accessing results ----------

    def numberOfResults(self):
        """Return the number of results we have available at the moment.

        :returns: the number of results"""
        self.updateResults()
        return self.notebook().numberOfResults()

    def numberOfPendingResults(self):
        """Return the number of results we are waiting for.

        :returns: the number of pending results"""
        self.updateResults()
        return self.notebook().numberOfPendingResults()

    def _availableResultsFraction(self):
        """Private method to return the fraction of results available, as a real number
        between 0 and 1. This does not update the results fetched from the cluster.

        :returns: the fraction of available results"""
        tr = self.notebook().numberOfResults() + self.notebook().numberOfPendingResults()
        if tr == 0:
            return 0
        else:
            return (self.notebook().numberOfResults() + 0.0) / tr

    def readyFraction(self):
        """Test what fraction of results are available. This will change over
        time as the results come in.

        :returns: the fraction from 0 to 1"""
        self.updateResults()
        return self._availableResultsFraction()

    def ready(self):
        """Test whether all the results are available. This will change over
        time as the results come in.

        :returns: True if all the results are available"""
        return (self.readyFraction() == 1)

    def wait(self, timeout=-1):
        """Wait for all pending results to be finished. If timeout is set,
        return after this many seconds regardless.

        :param timeout: timeout period in seconds (defaults to forever)
        :returns: True if all the results completed"""
        # we can't use ipyparallel.Client.wait() for this, because that
        # method only works for cases where the Client object is the one that
        # submitted the jobs to the cluster hub -- and therefore has the
        # necessary data structures to perform synchronisation. This isn't the
        # case for us, as one of the main goals of epyc is to support disconnected
        # operation, which implies a different Client object retrieving results
        # than the one that submitted the jobs in the first place. This is
        # unfortunate, but understandable given the typical use cases for
        # Client objects in ipyparallel.
        #
        # Instead we have to busy-wait a little. The ClusterLab.WaitingTime
        # class variable sets the latency for waiting, and we repeatedly wait
        # for this amount of time before updating the results. The latency value
        # essentially controls how busy this process is: given that most
        # simulations are expected to be long, a latency in the tens of seconds
        # feels about right as a default.
        if self.numberOfPendingResults() > 0:
            # we've got pending results, wait for them
            timeWaited = 0
            while (timeout < 0) or (timeWaited < timeout):
                if self.numberOfPendingResults() == 0:
                    # no pending jobs left, we're complete
                    return True
                else:
                    # not done yet, calculate the waiting period
                    if timeout == -1:
                        # wait for the default waiting period
                        dt = self.WaitingTime
                    else:
                        # wait for the default waiting period or until the end of
                        # the timeout, whichever comes first
                        if (timeout - timeWaited) < self.WaitingTime:
                            dt = timeout - timeWaited
                        else:
                            dt = self.WaitingTime

                    # sleep for a while
                    time.sleep(dt)
                    timeWaited = timeWaited + dt

            # if we get here, the timeout expired, so do a final check
            # and then exit
            return (self.numberOfPendingResults() == 0)
        else:
            # no pending results, so we got them all
            return True

    # ---------- Managing pending results ----------

    def pendingResults(self):
        """Return the list of job ids for any pending results.

        :returns: a list of job ids"""
        return self.notebook().pendingResults()

    def pendingResultsFor(self, params):
        """Return a list of job ids for any results pending for experiments
        at the given point in the parameter space.

        :param params: the experimental parameters
        :returns: a list of job ids"""
        return self.notebook().pendingResultsFor(params)

    def _abortJobs(self, js):
        """Private method to abort a set of jobs.

        :param js: the job ids to be aborted"""
        self.open()
        self._client.abort(jobs=js)
        self.close()

    def cancelPendingResultsFor(self, params):
        """Cancel any results pending for experiments at the given point in the
        parameter space.

        :param params: the experimental parameters"""
        # grab the result job ids
        jobs = self.pendingResultsFor(params)

        if len(jobs) > 0:
            # abort in the cluster
            self._abortJobs(jobs)

            # cancel in the notebook
            self.notebook().cancelPendingResultsFor(params)

    def cancelAllPendingResults(self):
        """Cancel all pending results."""
        # grab all the pending job ids
        jobs = self.pendingResults()

        if len(jobs) > 0:
            # abort in the cluster
            self._abortJobs(jobs)

            # cancel in the notebook
            self.notebook().cancelAllPendingResults()
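# A short usage sketch for ClusterLab, assuming epyc's usual Lab/Experiment
# interface: an Experiment subclass whose do() returns a dict of results, and
# dictionary-style assignment of parameter ranges on the lab.  The profile and
# parameter names are placeholders; passing a persistent notebook (for example
# an epyc.JSONLabNotebook) as notebook= is what enables disconnected operation.
import epyc
import numpy


class SquareExperiment(epyc.Experiment):
    """Toy experiment reporting the square of the parameter x."""

    def do(self, params):
        return dict(y=params['x'] ** 2)


lab = ClusterLab(profile='mycluster')        # the cluster must already be running
with lab.sync_imports():
    import numpy                             # make numpy importable on the engines

lab['x'] = numpy.linspace(0.0, 1.0, 100)     # define the parameter space
lab.runExperiment(SquareExperiment())        # submits asynchronously and returns

# results can be collected later, even from a different process
lab.wait()
print(lab.readyFraction(), lab.notebook().numberOfResults())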