def map(self, parallel_task, args):
        from ipyparallel import Client, TimeoutError

        # split the work into at most self.max_tasks chunks, rounding the
        # chunk size up so that no argument is dropped
        chunksize = 1
        if self.max_tasks > 0 and len(args) > self.max_tasks:
            chunksize = len(args) // self.max_tasks
            if chunksize * self.max_tasks < len(args):
                chunksize += 1

        try:
            client = Client()
        except TimeoutError:
            raise RuntimeError(
                'Cannot connect to the ipyparallel client. Is it running?')

        r = []
        ar = None
        try:
            # cloudpickle can serialize closures and locally defined
            # functions that the default pickler rejects
            client[:].use_cloudpickle()
            lbv = client.load_balanced_view()
            ar = lbv.map_async(IppFunctionWrapper(parallel_task, self.timeout),
                               args,
                               chunksize=chunksize)
            for k, z in enumerate(
                    tqdm(ar, desc="(IPYPARALLEL)", total=len(args))):
                if z[0] == -1:
                    # the wrapper reported a remote failure
                    logger.error(z[1])
                    engine = ar.engine_id[k]
                    raise RuntimeError(
                        'remote failure (task %d of %d on engine %d)' %
                        (k + 1, len(ar), engine))
                elif z[0] == 0:
                    r.append(z[1])
        finally:
            # whether we finished normally or not, abort anything still
            # queued and release the connection
            if ar is not None:
                client.abort(ar)
            client.close()
        return r
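The loop above relies on a small protocol: each result is a (status, payload) tuple, where status 0 carries the task's return value and status -1 carries a formatted error. IppFunctionWrapper itself is not shown in this snippet; a minimal sketch that would satisfy the protocol (the constructor signature and tuple layout are inferred from how map() consumes the results, and the timeout enforcement is elided) might look like:

import traceback

class IppFunctionWrapper(object):
    # Hypothetical sketch: wrap a task so it reports failures as values
    # instead of raising, which is what the map() loop above expects.
    def __init__(self, fn, timeout=None):
        self.fn = fn
        self.timeout = timeout  # enforcement of the timeout is omitted here

    def __call__(self, arg):
        try:
            return (0, self.fn(arg))  # success: payload is the task result
        except Exception:
            return (-1, traceback.format_exc())  # failure: payload is the traceback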
Example #2
class IPClusterEnsemble(SurveyEnsemble):
    """Parallelized survey ensemble based on IPython parallel (ipcluster)

    """

    def __init__(self, **specs):
        
        SurveyEnsemble.__init__(self, **specs)

        self.verb = specs.get('verbose', True)
        
        # access the cluster
        self.rc = Client()
        self.dview = self.rc[:]
        self.dview.block = True
        with self.dview.sync_imports():
            import EXOSIMS, EXOSIMS.util.get_module, \
                os, os.path, time, random, pickle, traceback, numpy
        if 'logger' in specs:
            specs.pop('logger')
        if 'seed' in specs:
            specs.pop('seed')
        self.dview.push(dict(specs=specs))
        self.vprint("Building SurveySimulation object on all workers.")
        res = self.dview.execute("SS = EXOSIMS.util.get_module.get_module(specs['modules'] \
                ['SurveySimulation'], 'SurveySimulation')(**specs)")
        
        res2 = self.dview.execute("SS.reset_sim()")

        self.vprint("Created SurveySimulation objects on %d engines."%len(self.rc.ids))
        #for row in res.stdout:
        #    self.vprint(row)

        self.lview = self.rc.load_balanced_view()

        self.maxNumEngines = len(self.rc.ids)

    def run_ensemble(self, sim, nb_run_sim, run_one=None, genNewPlanets=True,
                     rewindPlanets=True, kwargs={}):
        """Execute the ensemble by submitting nb_run_sim runs of run_one
        to the load-balanced view.

        Args:
            sim: the parent simulation object (not used by this implementation)
            nb_run_sim (int): number of simulations to run
            run_one (callable): module-level function executed once per simulation
            genNewPlanets (bool): regenerate planets between runs
            rewindPlanets (bool): rewind planets to their initial states between runs
            kwargs (dict): additional keyword arguments passed through to run_one
        """
        hangingRunsOccurred = False  # tracks whether any runs hung and were aborted
        t1 = time.time()
        async_res = []
        for j in range(nb_run_sim):
            ar = self.lview.apply_async(run_one, genNewPlanets=genNewPlanets,
                    rewindPlanets=rewindPlanets, **kwargs)
            async_res.append(ar)
        
        print("Submitted %d tasks."%len(async_res))
        
        engine_pids = self.rc[:].apply(os.getpid).get_dict()
        #ar2 = self.lview.apply_async(os.getpid)
        #pids = ar2.get_dict()
        print('engine_pids')
        print(engine_pids)

        runStartTime = time.time()  # job starting time
        avg_time_per_run = 0.
        tmplenoutstandingset = nb_run_sim
        tLastRunFinished = time.time()
        ar = self.rc._asyncresult_from_jobs(async_res)
        while not ar.ready():
            ar.wait(10.)
            clear_output(wait=True)
            if ar.progress > 0:
                timeleft = ar.elapsed/ar.progress * (nb_run_sim - ar.progress)
                if timeleft > 3600.:
                    timeleftstr = "%2.2f hours"%(timeleft/3600.)
                elif timeleft > 60.:
                    timeleftstr = "%2.2f minutes"%(timeleft/60.)
                else:
                    timeleftstr = "%2.2f seconds"%timeleft
            else:
                timeleftstr = "who knows"

            # Terminate hanging runs
            outstandingset = self.rc.outstanding  # msg_ids submitted but whose results have not yet been received
            if len(outstandingset) > 0 and len(outstandingset) < nb_run_sim:  # at least 1 run is still going and we have not just started
                avg_time_per_run = (time.time() - runStartTime)/float(nb_run_sim - len(outstandingset))  # average time per completed run
                if len(outstandingset) < tmplenoutstandingset:  # the scheduler has finished a run
                    tmplenoutstandingset = len(outstandingset)  # should decrease by ~1 or the number of cores
                    tLastRunFinished = time.time()  # the last time a simulation finished (right now)
                    #self.vprint("tmplenoutstandingset %d, tLastRunFinished %0.6f"%(tmplenoutstandingset,tLastRunFinished))
                if time.time() - tLastRunFinished > avg_time_per_run*(1. + self.maxNumEngines*2.)*4.:
                    # nothing has finished for far longer than a run normally
                    # takes: assume the remaining jobs are hanging
                    self.vprint('Aborting %d outstanding jobs' % len(self.rc.outstanding))
                    self.vprint('queue_status')
                    self.vprint(str(self.rc.queue_status()))
                    self.rc.abort()
                    ar.wait(20)
                    # SIGTERM any engine that still reports queued tasks
                    for engineInd in [eind for eind in np.arange(self.maxNumEngines) if self.rc.queue_status()[eind]['tasks'] > 0]:
                        os.kill(engine_pids[engineInd], 15)
                        time.sleep(20)
                    # stop the cluster twice, sleeping in between, rather than
                    # waiting for ipcluster to terminate on its own
                    stopIPClusterCommand = subprocess.Popen(['ipcluster', 'stop'])
                    stopIPClusterCommand.wait()
                    time.sleep(60)
                    stopIPClusterCommand = subprocess.Popen(['ipcluster', 'stop'])
                    stopIPClusterCommand.wait()
                    time.sleep(60)
                    hangingRunsOccurred = True
                    break

            print("%4i/%i tasks finished after %4i s. About %s to go." % (ar.progress, nb_run_sim, ar.elapsed, timeleftstr), end="")
            sys.stdout.flush()
        t2 = time.time()
        print("\nCompleted in %d sec" % (t2 - t1))
        
        if hangingRunsOccurred:  # hanging runs were detected and the cluster was shut down
            res = [1]
        else:
            res = [ar.get() for ar in async_res]
        
        return res
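Note that run_one is supplied by the caller rather than defined here, and it must be a module-level (importable) function for the engines to unpickle it. A plausible sketch, assuming the EXOSIMS convention that each run executes the survey on the SS object built on every engine in __init__ above, copies out the mission timeline, and resets the simulation (the exact body is an assumption, not part of this snippet):

def run_one(genNewPlanets=True, rewindPlanets=True, **kwargs):
    # SS already exists in the engine's global namespace: __init__ built a
    # SurveySimulation object on every worker via dview.execute.
    SS.run_sim()
    res = SS.DRM[:]  # copy out the mission timeline (design reference mission)
    SS.reset_sim(genNewPlanets=genNewPlanets, rewindPlanets=rewindPlanets)
    return res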
Example #3
class ClusterLab(epyc.Lab):
    """A :class:`Lab` running on an ``ipyparallel`` compute
    cluster.

    Experiments are submitted to engines in the cluster for
    execution in parallel, with the experiments being performed
    asynchronously to allow for disconnection and subsequent retrieval
    of results. Combined with a persistent :class:`LabNotebook`, this allows
    for fully decoupled access to an on-going computational experiment
    with piecewise retrieval of results.

    This class requires a cluster to already be set up and running, configured
    for persistent access, with access to the necessary code and libraries,
    and with appropriate security information available to the client.
    """

    # Tuning parameters
    WaitingTime = 30           #: Waiting time for checking for job completion. Lower values increase network traffic.

    
    def __init__( self, notebook = None, url_file = None, profile = None,
                  profile_dir = None, ipython_dir = None, context = None,
                  debug = False, sshserver = None, sshkey = None,
                  password = None, paramiko = None, timeout = 10,
                  cluster_id = None, use_dill = False, **extra_args ):
        """Create an empty lab attached to the given cluster. Most of the arguments
        are as expected by the ``ipyparallel.Client`` class, and are used to create the
        underlying connection to the cluster. The connection is opened immediately,
        meaning the cluster must be up and accessible when creating a lab to use it.

        :param notebook: the notebook used to store results (defaults to an empty :class:`LabNotebook`)
        :param url_file: file containing connection information for accessing cluster
        :param profile: name of the IPython profile to use
        :param profile_dir: directory containing the profile's connection information
        :param ipython_dir: directory containing profile directories
        :param context: ZMQ context
        :param debug: whether to issue debugging information (defaults to False)
        :param sshserver: username and machine for ssh connections
        :param sshkey: file containing ssh key
        :param password: ssh password
        :param paramiko: True to use paramiko for ssh (defaults to False)
        :param timeout: timeout in seconds for ssh connection (defaults to 10s)
        :param cluster_id: string added to runtime files to prevent collisions
        :param use_dill: whether to use Dill as pickler (defaults to False)"""
        super(epyc.ClusterLab, self).__init__(notebook)
        
        # record all the connection arguments for later
        self._arguments = dict(url_file = url_file,
                               profile = profile,
                               profile_dir = profile_dir,
                               ipython_dir = ipython_dir,
                               context = context,
                               debug = debug,
                               sshserver = sshserver,
                               sshkey = sshkey,
                               password = password,
                               paramiko = paramiko,
                               timeout = timeout,
                               cluster_id = cluster_id,
                               **extra_args)
        self._client = None

        # connect to the cluster
        self.open()

        # use Dill if requested
        if use_dill:
            self.use_dill()
        
    def open( self ):
        """Connect to the cluster."""
        if self._client is None:
            self._client = Client(**self._arguments)
        
    def close( self ):
        """Close down the connection to the cluster."""
        if self._client is not None:
            self._client.close()
            self._client = None
        
    def numberOfEngines( self ):
        """Return the number of engines available to this lab.

        :returns: the number of engines"""
        return len(self.engines())

    def engines( self ):
        """Return a list of the available engines.

        :returns: a list of engines"""
        self.open()
        return self._client[:]

    def use_dill( self ):
        """Make the cluster use Dill as pickler for transferring results. This isn't
        generally needed, but is sometimes useful for particularly complex experiments
        such as those involving closures. (Or, to put it another way, if you find yourself
        tempted to use this method, consider re-structuring your experiment code.)"""
        self.open()
        with self.sync_imports(quiet = True):
            import dill
        self._client.direct_view().use_dill()

    def sync_imports( self, quiet = False ):
        """Return a context manager to control imports onto all the engines
        in the underlying cluster. This method is used within a ``with`` statement.

        Any imports should be done with no experiments running, otherwise the
        method will block until the cluster is quiet. Generally imports will be one
        of the first things done when connecting to a cluster. (But be careful
        not to accidentally try to re-import if re-connecting to a running
        cluster.)

        :param quiet: if True, suppresses messages (defaults to False)
        :returns: a context manager"""
        self.open()
        return self._client[:].sync_imports(quiet = quiet)
    
    def _mixup( self, ps ):
        """Private method to mix up a list of values in-place using a Fisher-Yates
        shuffle (see https://en.wikipedia.org/wiki/Fisher-Yates_shuffle).

        :param ps: the array
        :returns: the array, shuffled in-place"""
        for i in range(len(ps) - 1, 0, -1):
            # draw j uniformly from [0, i]; the upper index must be inclusive
            # for the shuffle to be unbiased
            j = int(numpy.random.random() * (i + 1))
            ps[i], ps[j] = ps[j], ps[i]
        return ps
     
    def runExperiment( self, e ):
        """Run the experiment across the parameter space in parallel using
        all the engines in the cluster. This method returns immediately.

        The experiments are run asynchronously, with the points in the parameter
        space being explored randomly so that intermediate retrievals of results
        are more representative of the overall result. Put another way, for a lot
        of experiments the results available will converge towards a final
        answer, so we can plot them and see the answer emerge.

        :param e: the experiment"""

        # create the parameter space
        space = self.parameterSpace()

        # only proceed if there's work to do
        if len(space) > 0:
            nb = self.notebook()
            
            # randomise the order of the parameter space so that we evaluate across
            # the space as we go along to try to make intermediate (incomplete) result
            # sets more representative of the overall result set
            ps = self._mixup(space)

            try:
                # connect to the cluster
                self.open()

                # submit an experiment at each point in the parameter space to the cluster
                view = self._client.load_balanced_view()
                jobs = []
                for p in ps:
                    jobs.extend((view.apply_async((lambda p: e.set(p).run()), p)).msg_ids)

                    # there seems to be a race condition in submitting jobs,
                    # whereby jobs get dropped if they're submitted too quickly
                    time.sleep(0.01)
                
                # record the message ids of all the jobs as submitted but not yet completed
                psjs = zip(ps, jobs)
                for (p, j) in psjs:
                    nb.addPendingResult(p, j)
            finally:
                # commit our pending results in the notebook
                nb.commit()
                self.close()

    def updateResults( self ):
        """Update our results within any pending results that have completed since we
        last retrieved results from the cluster.

        :returns: the number of pending results completed at this call"""

        # we do all the tests for pending results against the notebook directly,
        # as the corresponding methods on self call this method themselves
        nb = self.notebook()

        # look for pending results if we're waiting for any
        n = 0
        if nb.numberOfPendingResults() > 0:
            # we have results to get
            self.open()
            for j in set(nb.pendingResults()):
                # query the status of a job
                status = self._client.result_status(j, status_only = False)
                    
                # add all completed jobs to the notebook
                if j in status['completed']:
                    r = status[j]
                        
                    # update the result in the notebook, cancelling
                    # the pending result as well
                    # values come back from Client.result_status() in
                    # varying degrees of list-nesting, which LabNotebook.addResult()
                    # handles itself
                    nb.addResult(r, j)

                    # commit changes to the notebook
                    nb.commit()

                    # purge the completed job from the cluster
                    self._client.purge_hub_results(j)
                         
                    # record that we retrieved the results for the given job
                    n = n + 1
        return n

    def numberOfResults( self ):
        """Return the number of results we have available at the moment.

        :returns: the number of results"""
        self.updateResults()
        return self.notebook().numberOfResults()

    def numberOfPendingResults( self ):
        """Return the number of resultswe are waiting for.

        :returns: the number of pending results"""
        self.updateResults()
        return self.notebook().numberOfPendingResults()
    
    def _availableResultsFraction( self ):
        """Private method to return the fraction of results available, as a real number
        between 0 and 1. This does not update the results fetched from the cluster.

        :returns: the fraction of available results"""
        tr = self.notebook().numberOfResults() + self.notebook().numberOfPendingResults()
        if tr == 0:
            return 0
        else:
            return (self.notebook().numberOfResults() + 0.0) / tr
    
    def readyFraction( self ):
        """Test what fraction of results are available. This will change over
        time as the results come in.

        :returns: the fraction from 0 to 1"""
        self.updateResults()
        return self._availableResultsFraction()
    
    def ready( self ):
        """Test whether all the results are available. This will change over
        time as the results come in.

        :returns: True if all the results are available"""
        return (self.readyFraction() == 1)

    def wait( self, timeout = -1 ):
        """Wait for all pending results to be finished. If timeout is set,
        return after this many seconds regardless.

        :param timeout: timeout period in seconds (defaults to forever)
        :returns: True if all the results completed"""

        # we can't use ipyparallel.Client.wait() for this, because that
        # method only works for cases where the Client object is the one that
        # submitted the jobs to the cluster hub -- and therefore has the
        # necessary data structures to perform synchronisation. This isn't the
        # case for us, as one of the main goals of epyc is to support disconnected
        # operation, which implies a different Client object retrieving results
        # than the one that submitted the jobs in the first place. This is
        # unfortunate, but understandable given the typical use cases for
        # Client objects.
        #
        # Instead, we have to code around this with a little busy-waiting. The ClusterLab.WaitingTime
        # global sets the latency for waiting, and we repeatedly wait for this amount
        # of time before updating the results. The latency value essentially controls
        # how busy this process is: given that most simulations are expected to
        # be long, a latency in the tens of seconds feels about right as a default
        if self.numberOfPendingResults() > 0:
            # we've got pending results, wait for them
            timeWaited = 0
            while (timeout < 0) or (timeWaited < timeout):
                if self.numberOfPendingResults() == 0:
                    # no pending jobs left, we're complete
                    return True
                else:
                    # not done yet, calculate the waiting period
                    if timeout == -1:
                        # wait for the default waiting period
                        dt = self.WaitingTime
                    else:
                        # wait for the default waiting period or until the end of the timeout.
                        # whichever comes first
                        if (timeout - timeWaited) < self.WaitingTime:
                            dt = timeout - timeWaited
                        else:
                            dt = self.WaitingTime
                            
                    # sleep for a while
                    time.sleep(dt)
                    timeWaited = timeWaited + dt

            # if we get here, the timeout expired, so do a final check
            # and then exit
            return (self.numberOfPendingResults() == 0)

        else:
            # no results, so we got them all
            return True
        
    def pendingResults( self ):
        """Return the list of job iods for any pending results.

        :returns: a list of job ids"""
        return self.notebook().pendingResults()
    
    def pendingResultsFor( self, params ):
        """Return a list of job ids for any results pending for experiments
        at the given point in the parameter space.

        :param params: the experimental parameters
        :returns: a list of job ids"""
        return self.notebook().pendingResultsFor(params)
    
    def _abortJobs( self, js ):
        """Private method to abort a set of jobs.

        :param js: the job ids to be aborted"""
        self.open()
        self._client.abort(jobs = js)
        self.close()
        
    def cancelPendingResultsFor( self, params ):
        """Cancel any results pending for experiments at the given point
        in the parameter space.

        :param params: the experimental parameters"""
        
        # grab the result job ids
        jobs = self.pendingResultsFor(params)
        
        if len(jobs) > 0:
            # abort in the cluster
            self._abortJobs(jobs)
            
            # cancel in the notebook                  
            self.notebook().cancelPendingResultsFor(params)
        
    def cancelAllPendingResults( self ):
        """Cancel all pending results."""

        # grab all the pending job ids
        jobs = self.pendingResults()
        
        if len(jobs) > 0:
            # abort in the cluster
            self._abortJobs(jobs)
            
            # cancel in the notebook                  
            self.notebook().cancelAllPendingResults()
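Pulling the pieces together, a typical ClusterLab session might look like the sketch below. The experiment class, profile name, and notebook file are illustrative assumptions (an epyc experiment subclasses epyc.Experiment and implements do()); the ipyparallel cluster itself must already be running:

import epyc
import numpy

class MyExperiment(epyc.Experiment):
    # Hypothetical experiment: square the single parameter x.
    def do(self, params):
        return dict(y=params['x'] ** 2)

# connect to a running cluster via its IPython profile (assumed name),
# backing the lab with a persistent notebook so results survive disconnection
lab = epyc.ClusterLab(profile='mycluster',
                      notebook=epyc.JSONLabNotebook('results.json'))

lab['x'] = numpy.linspace(0.0, 1.0, 100)  # define the parameter space
lab.runExperiment(MyExperiment())         # submits jobs and returns at once
print(lab.readyFraction())                # fraction of results available so far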
Example #4
class IPClusterEnsemble(SurveyEnsemble):
    """Parallelized survey ensemble based on IPython parallel (ipcluster)

    """
    def __init__(self, **specs):

        SurveyEnsemble.__init__(self, **specs)

        self.verb = specs.get('verbose', True)

        # access the cluster
        self.rc = Client()
        self.dview = self.rc[:]
        self.dview.block = True
        with self.dview.sync_imports():
            import EXOSIMS, EXOSIMS.util.get_module, \
                os, os.path, time, random, cPickle, traceback
        if 'logger' in specs:
            specs.pop('logger')
        if 'seed' in specs:
            specs.pop('seed')
        self.dview.push(dict(specs=specs))
        res = self.dview.execute(
            "SS = EXOSIMS.util.get_module.get_module(specs['modules'] \
                ['SurveySimulation'], 'SurveySimulation')(**specs)")

        res2 = self.dview.execute("SS.reset_sim()")

        self.vprint("Created SurveySimulation objects on %d engines." %
                    len(self.rc.ids))
        #for row in res.stdout:
        #    self.vprint(row)

        self.lview = self.rc.load_balanced_view()

        self.maxNumEngines = len(self.rc.ids)

    def run_ensemble(self,
                     sim,
                     nb_run_sim,
                     run_one=None,
                     genNewPlanets=True,
                     rewindPlanets=True,
                     kwargs={}):
        """
        Args:
            sim:

        """

        t1 = time.time()
        async_res = []
        for j in range(nb_run_sim):
            ar = self.lview.apply_async(run_one,
                                        genNewPlanets=genNewPlanets,
                                        rewindPlanets=rewindPlanets,
                                        **kwargs)
            async_res.append(ar)

        print("Submitted %d tasks." % len(async_res))

        runStartTime = time.time()  # job starting time
        avg_time_per_run = 0.
        tmplenoutstandingset = nb_run_sim
        tLastRunFinished = time.time()
        ar = self.rc._asyncresult_from_jobs(async_res)
        while not ar.ready():
            ar.wait(10.)
            clear_output(wait=True)
            if ar.progress > 0:
                timeleft = ar.elapsed / ar.progress * (nb_run_sim -
                                                       ar.progress)
                if timeleft > 3600.:
                    timeleftstr = "%2.2f hours" % (timeleft / 3600.)
                elif timeleft > 60.:
                    timeleftstr = "%2.2f minutes" % (timeleft / 60.)
                else:
                    timeleftstr = "%2.2f seconds" % timeleft
            else:
                timeleftstr = "who knows"

            # Terminate hanging runs
            outstandingset = self.rc.outstanding  # msg_ids submitted but whose results have not yet been received
            if len(outstandingset) > 0 and len(outstandingset) < nb_run_sim:
                # at least 1 run is still going and we have not just started
                avg_time_per_run = (time.time() - runStartTime) / float(
                    nb_run_sim - len(outstandingset))  # average time per completed run
                if len(outstandingset) < tmplenoutstandingset:
                    # the scheduler has finished a run
                    tmplenoutstandingset = len(outstandingset)  # should decrease by ~1 or the number of cores
                    tLastRunFinished = time.time()  # the last time a simulation finished (right now)
                    #self.vprint("tmplenoutstandingset %d, tLastRunFinished %0.6f"%(tmplenoutstandingset,tLastRunFinished))
                if time.time() - tLastRunFinished > avg_time_per_run * (
                        1 + self.maxNumEngines * 2):
                    self.vprint('Aborting %d outstanding jobs' % len(self.rc.outstanding))
                    # abort all outstanding jobs; note this may not stop
                    # tasks already executing on an engine
                    self.rc.abort()

            print("%4i/%i tasks finished after %4i s. About %s to go." %
                  (ar.progress, nb_run_sim, ar.elapsed, timeleftstr),
                  end="")
            sys.stdout.flush()

        t2 = time.time()
        print("\nCompleted in %d sec" % (t2 - t1))

        res = [ar.get() for ar in async_res]

        return res
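The watchdog in the IPClusterEnsemble variants above is a heuristic: if no run has finished for much longer than the average run takes, scaled by the engine count (with an extra safety factor of 4 in the longer variant), the remaining jobs are presumed hung. The threshold logic is small enough to check in isolation; a self-contained restatement (names shortened, not part of either class):

import time

def runs_look_hung(t_last_finish, avg_time_per_run, n_engines, safety_factor=4.0):
    # Declare a hang when the gap since the last completion exceeds
    # avg_time_per_run * (1 + 2 * n_engines) * safety_factor: with more
    # engines, completions should arrive more often, so the tolerance
    # grows with the engine count rather than the job count.
    threshold = avg_time_per_run * (1.0 + 2.0 * n_engines) * safety_factor
    return (time.time() - t_last_finish) > threshold

# e.g. 16 engines and 100 s per run: the watchdog fires only after
# 100 * (1 + 32) * 4 = 13200 s (about 3.7 hours) without a single completion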
Example #5
class ClusterLab(epyc.Lab):
    """A :class:`Lab` running on an ``pyparallel`` compute
    cluster.

    Experiments are submitted to engines in the cluster for
    execution in parallel, with the experiments being performed
    asynchronously to allow for disconnection and subsequent retrieval
    of results. Combined with a persistent :class:`LabNotebook`, this allows
    for fully decoupled access to an on-going computational experiment
    with piecewise retrieval of results.

    This class requires a cluster to already be set up and running, configured
    for persistent access, with access to the necessary code and libraries,
    and with appropriate security information available to the client.
    """

    # Tuning parameters
    WaitingTime = 30  #: Waiting time for checking for job completion. Lower values increase network traffic.

    def __init__(self,
                 notebook=None,
                 url_file=None,
                 profile=None,
                 profile_dir=None,
                 ipython_dir=None,
                 context=None,
                 debug=False,
                 sshserver=None,
                 sshkey=None,
                 password=None,
                 paramiko=None,
                 timeout=10,
                 cluster_id=None,
                 use_dill=False,
                 **extra_args):
        """Create an empty lab attached to the given cluster. Most of the arguments
        are as expected by the ``pyparallel.Client`` class, and are used to create the
        underlying connection to the cluster. The connection is opened immediately,
        meaning the cluster must be up and accessible when creating a lab to use it.

        :param notebook: the notebook used to results (defaults to an empty :class:`LabNotebook`)
        :param url_file: file containing connection information for accessing cluster
        :param profile: name of the IPython profile to use
        :param profile_dir: directory containing the profile's connection information
        :param ipython_dir: directory containing profile directories
        :param context: ZMQ context
        :param debug: whether to issue debugging information (defaults to False)
        :param sshserver: username and machine for ssh connections
        :param sshkey: file containing ssh key
        :param password: ssh password
        :param paramiko: True to use paramiko for ssh (defaults to False)
        :param timeout: timeout in seconds for ssh connection (defaults to 10s)
        :param cluster_id: string added to runtime files to prevent collisions
        :param use_dill: whether to use Dill as pickler (defaults to False)"""
        super(epyc.ClusterLab, self).__init__(notebook)

        # record all the connection arguments for later
        self._arguments = dict(url_file=url_file,
                               profile=profile,
                               profile_dir=profile_dir,
                               ipython_dir=ipython_dir,
                               context=context,
                               debug=debug,
                               sshserver=sshserver,
                               sshkey=sshkey,
                               password=password,
                               paramiko=paramiko,
                               timeout=timeout,
                               cluster_id=cluster_id,
                               use_dill=use_dill,
                               **extra_args)
        self._client = None

        # connect to the cluster
        self.open()

        # use Dill if requested
        if use_dill:
            self.use_dill()

    # ---------- Protocol ----------

    def open(self):
        """Connect to the cluster."""
        if self._client is None:
            self._client = Client(**self._arguments)

    def close(self):
        """Close down the connection to the cluster."""
        if self._client is not None:
            self._client.close()
            self._client = None

    def recreate(self):
        '''Save the arguments needed to re-connect to the cluster we use.

        :returns: a (classname, args) pair'''
        (cn, args) = super(ClusterLab, self).recreate()
        nargs = args.copy()
        nargs.update(self._arguments)
        return (cn, nargs)

    # ---------- Remote control of the compute engines ----------

    def numberOfEngines(self):
        """Return the number of engines available to this lab.

        :returns: the number of engines"""
        return len(self.engines())

    def engines(self):
        """Return a list of the available engines.

        :returns: a list of engines"""
        self.open()
        return self._client[:]

    def use_dill(self):
        """Make the cluster use Dill as pickler for transferring results. This isn't
        generally needed, but is sometimes useful for particularly complex experiments
        such as those involving closures. (Or, to put it another way, if you find yourself
        tempted to use this method, consider re-structuring your experiment code.)"""
        self.open()
        with self.sync_imports(quiet=True):
            import dill
        self._client.direct_view().use_dill()

    def sync_imports(self, quiet=False):
        """Return a context manager to control imports onto all the engines
        in the underlying cluster. This method is used within a ``with`` statement.

        Any imports should be done with no experiments running, otherwise the
        method will block until the cluster is quiet. Generally imports will be one
        of the first things done when connecting to a cluster. (But be careful
        not to accidentally try to re-import if re-connecting to a running
        cluster.)

        :param quiet: if True, suppresses messages (defaults to False)
        :returns: a context manager"""
        self.open()
        return self._client[:].sync_imports(quiet=quiet)

    # ---------- Running experiments ----------

    def _mixup(self, ps):
        """Private method to mix up a list of values in-place using a Fisher-Yates
        shuffle (see https://en.wikipedia.org/wiki/Fisher-Yates_shuffle).

        :param ps: the array
        :returns: the array, shuffled in-place"""
        for i in range(len(ps) - 1, 0, -1):
            # draw j uniformly from [0, i]; the upper index must be inclusive
            # for the shuffle to be unbiased
            j = int(numpy.random.random() * (i + 1))
            ps[i], ps[j] = ps[j], ps[i]
        return ps

    def runExperiment(self, e):
        """Run the experiment across the parameter space in parallel using
        all the engines in the cluster. This method returns immediately.

        The experiments are run asynchronously, with the points in the parameter
        space being explored randomly so that intermediate retrievals of results
        are more representative of the overall result. Put another way, for a lot
        of experiments the results available will converge towards a final
        answer, so we can plot them and see the answer emerge.

        :param e: the experiment"""

        # create the parameter space
        space = self.parameterSpace()

        # only proceed if there's work to do
        if len(space) > 0:
            nb = self.notebook()

            # randomise the order of the parameter space so that we evaluate across
            # the space as we go along to try to make intermediate (incomplete) result
            # sets more representative of the overall result set
            ps = self._mixup(space)

            try:
                # connect to the cluster
                self.open()

                # submit an experiment at each point in the parameter space to the cluster
                view = self._client.load_balanced_view()
                jobs = []
                for p in ps:
                    jobs.extend((view.apply_async((lambda p: e.set(p).run()),
                                                  p)).msg_ids)

                    # there seems to be a race condition in submitting jobs,
                    # whereby jobs get dropped if they're submitted too quickly
                    time.sleep(0.01)

                # record the message ids of all the jobs as submitted but not yet completed
                psjs = zip(ps, jobs)
                for (p, j) in psjs:
                    nb.addPendingResult(p, j)
            finally:
                # commit our pending results in the notebook
                nb.commit()
                self.close()

    def updateResults(self):
        """Update our results within any pending results that have completed since we
        last retrieved results from the cluster.

        :returns: the number of pending results completed at this call"""

        # we do all the tests for pending results against the notebook directly,
        # as the corresponding methods on self call this method themselves
        nb = self.notebook()

        # look for pending results if we're waiting for any
        n = 0
        if nb.numberOfPendingResults() > 0:
            # we have results to get
            self.open()
            for j in set(nb.pendingResults()):
                # query the status of a job
                status = self._client.result_status(j, status_only=False)

                # add all completed jobs to the notebook
                if j in status['completed']:
                    r = status[j]

                    # update the result in the notebook, cancelling
                    # the pending result as well
                    # values come back from Client.result_status() in
                    # varying degrees of list-nesting, which LabNotebook.addResult()
                    # handles itself
                    nb.addResult(r, j)

                    # commit changes to the notebook
                    nb.commit()

                    # purge the completed job from the cluster
                    self._client.purge_hub_results(j)

                    # record that we retrieved the results for the given job
                    n = n + 1
        return n

    # ---------- Accessing results ----------

    def numberOfResults(self):
        """Return the number of results we have available at the moment.

        :returns: the number of results"""
        self.updateResults()
        return self.notebook().numberOfResults()

    def numberOfPendingResults(self):
        """Return the number of resultswe are waiting for.

        :returns: the number of pending results"""
        self.updateResults()
        return self.notebook().numberOfPendingResults()

    def _availableResultsFraction(self):
        """Private method to return the fraction of results available, as a real number
        between 0 and 1. This does not update the results fetched from the cluster.

        :returns: the fraction of available results"""
        tr = self.notebook().numberOfResults() + self.notebook().numberOfPendingResults()
        if tr == 0:
            return 0
        else:
            return (self.notebook().numberOfResults() + 0.0) / tr

    def readyFraction(self):
        """Test what fraction of results are available. This will change over
        time as the results come in.

        :returns: the fraction from 0 to 1"""
        self.updateResults()
        return self._availableResultsFraction()

    def ready(self):
        """Test whether all the results are available. This will change over
        time as the results come in.

        :returns: True if all the results are available"""
        return (self.readyFraction() == 1)

    def wait(self, timeout=-1):
        """Wait for all pending results to be finished. If timeout is set,
        return after this many seconds regardless.

        :param timeout: timeout period in seconds (defaults to forever)
        :returns: True if all the results completed"""

        # we can't use ipyparallel.Client.wait() for this, because that
        # method only works for cases where the Client object is the one that
        # submitted the jobs to the cluster hub -- and therefore has the
        # necessary data structures to perform synchronisation. This isn't the
        # case for us, as one of the main goals of epyc is to support disconnected
        # operation, which implies a different Client object retrieving results
        # than the one that submitted the jobs in the first place. This is
        # unfortunate, but understandable given the typical use cases for
        # Client objects in ipyparallel.
        #
        # Instead, we have to code around this with a little busy-waiting. The ClusterLab.WaitingTime
        # global sets the latency for waiting, and we repeatedly wait for this amount
        # of time before updating the results. The latency value essentially controls
        # how busy this process is: given that most simulations are expected to
        # be long, a latency in the tens of seconds feels about right as a default
        if self.numberOfPendingResults() > 0:
            # we've got pending results, wait for them
            timeWaited = 0
            while (timeout < 0) or (timeWaited < timeout):
                if self.numberOfPendingResults() == 0:
                    # no pending jobs left, we're complete
                    return True
                else:
                    # not done yet, calculate the waiting period
                    if timeout == -1:
                        # wait for the default waiting period
                        dt = self.WaitingTime
                    else:
                        # wait for the default waiting period or until the end of the timeout.
                        # whichever comes first
                        if (timeout - timeWaited) < self.WaitingTime:
                            dt = timeout - timeWaited
                        else:
                            dt = self.WaitingTime

                    # sleep for a while
                    time.sleep(dt)
                    timeWaited = timeWaited + dt

            # if we get here, the timeout expired, so do a final check
            # and then exit
            return (self.numberOfPendingResults() == 0)

        else:
            # no results, so we got them all
            return True

    # ---------- Managing pending results ----------

    def pendingResults(self):
        """Return the list of job iods for any pending results.

        :returns: a list of job ids"""
        return self.notebook().pendingResults()

    def pendingResultsFor(self, params):
        """Return a list of job ids for any results pending for experiments
        at the given point in the parameter space.

        :param params: the experimental parameters
        :returns: a list of job ids"""
        return self.notebook().pendingResultsFor(params)

    def _abortJobs(self, js):
        """Private method to abort a set of jobs.

        :param js: the job ids to be aborted"""
        self.open()
        self._client.abort(jobs=js)
        self.close()

    def cancelPendingResultsFor(self, params):
        """Cancel any results pending for experiments at the given point
        in the parameter space.

        :param params: the experimental parameters"""

        # grab the result job ids
        jobs = self.pendingResultsFor(params)

        if len(jobs) > 0:
            # abort in the cluster
            self._abortJobs(jobs)

            # cancel in the notebook
            self.notebook().cancelPendingResultsFor(params)

    def cancelAllPendingResults(self):
        """Cancel all pending results."""

        # grab all the pending job ids
        jobs = self.pendingResults()

        if len(jobs) > 0:
            # abort in the cluster
            self._abortJobs(jobs)

            # cancel in the notebook
            self.notebook().cancelAllPendingResults()
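The comments in wait() above stress that epyc supports disconnected operation: a different Client than the one that submitted the jobs can retrieve the results, provided both labs share the same persistent notebook. A sketch of that pattern, under the same assumed profile and notebook names as the earlier usage example:

import epyc

# later, possibly in a fresh process: reattach to the same persistent notebook
lab = epyc.ClusterLab(profile='mycluster',
                      notebook=epyc.JSONLabNotebook('results.json'))

lab.updateResults()  # pull in whatever has completed since submission
print('%.0f%% complete' % (100 * lab.readyFraction()))

if lab.ready():
    results = lab.notebook().results()  # every point now has a result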