Example #1
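# Imports assumed by this example (added for completeness): the cluster client
# comes from ipyparallel (IPython.parallel in older releases), and SurveyEnsemble
# is the EXOSIMS prototype this class extends.
import os
import sys
import time
import subprocess

import numpy as np
from IPython.display import clear_output
from ipyparallel import Client

from EXOSIMS.Prototypes.SurveyEnsemble import SurveyEnsemble
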
def cluster_status():
    """Return (nworkers, queued, working, idle) for the currently running ipcluster."""
    try:
        rcl = Client()
        nworkers = len(rcl[:])  # number of connected engines
        qstat = rcl.queue_status()
        queued = qstat[u'unassigned']  # tasks not yet assigned to an engine
        working = sum([qstat[w][u'tasks'] for w in rcl.ids])  # tasks held by the engines
        idle = nworkers - working  # approximate count of idle engines
        rcl.close()
    except Exception:
        # No cluster reachable (or the query failed): report everything as zero.
        nworkers, queued, working, idle = 0, 0, 0, 0
    return nworkers, queued, working, idle
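
# Illustrative helper (added, not part of the original example): print a
# one-line summary of the values returned by cluster_status().
def print_cluster_status():
    nworkers, queued, working, idle = cluster_status()
    print("engines=%d  queued=%d  working=%d  idle=%d"
          % (nworkers, queued, working, idle))
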
class IPClusterEnsemble(SurveyEnsemble):
    """Parallelized suvey ensemble based on IPython parallel (ipcluster)
    
    """

    def __init__(self, **specs):
        
        SurveyEnsemble.__init__(self, **specs)

        self.verb = specs.get('verbose', True)
        
        # access the cluster
        self.rc = Client()
        self.dview = self.rc[:]
        self.dview.block = True
        with self.dview.sync_imports():
            import EXOSIMS, EXOSIMS.util.get_module, \
                os, os.path, time, random, pickle, traceback, numpy
        if 'logger' in specs:
            specs.pop('logger')
        if 'seed' in specs:
            specs.pop('seed')
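        # Note (added): 'logger' and 'seed' are stripped before pushing specs so
        # that each engine builds its own logger and draws an independent random
        # seed, rather than every worker repeating an identical simulation.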
        self.dview.push(dict(specs=specs))
        self.vprint("Building SurveySimulation object on all workers.")
        res = self.dview.execute(
            "SS = EXOSIMS.util.get_module.get_module("
            "specs['modules']['SurveySimulation'], 'SurveySimulation')(**specs)")
        
        res2 = self.dview.execute("SS.reset_sim()")

        self.vprint("Created SurveySimulation objects on %d engines."%len(self.rc.ids))
        #for row in res.stdout:
        #    self.vprint(row)

        self.lview = self.rc.load_balanced_view()

        self.maxNumEngines = len(self.rc.ids)

    def run_ensemble(self, sim, nb_run_sim, run_one=None, genNewPlanets=True,
                     rewindPlanets=True, kwargs={}):
        """
        Args:
            sim:

        """
        hangingRunsOccurred = False  # tracks whether any hanging runs occurred
        t1 = time.time()
        async_res = []
        for j in range(nb_run_sim):
            ar = self.lview.apply_async(run_one, genNewPlanets=genNewPlanets,
                    rewindPlanets=rewindPlanets, **kwargs)
            async_res.append(ar)
        
        print("Submitted %d tasks."%len(async_res))
        
        engine_pids = self.rc[:].apply(os.getpid).get_dict()
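        # (added note) engine_pids maps each engine id to its OS process id so that
        # hung engines can be signalled directly in the hang-handling branch below.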
        #ar2 = self.lview.apply_async(os.getpid)
        #pids = ar2.get_dict()
        print('engine_pids')
        print(engine_pids)

        runStartTime = time.time()  # record the job start time
        avg_time_per_run = 0.
        tmplenoutstandingset = nb_run_sim
        tLastRunFinished = time.time()
        ar = self.rc._asyncresult_from_jobs(async_res)
        while not ar.ready():
            ar.wait(10.)
            clear_output(wait=True)
            if ar.progress > 0:
                timeleft = ar.elapsed/ar.progress * (nb_run_sim - ar.progress)
                if timeleft > 3600.:
                    timeleftstr = "%2.2f hours"%(timeleft/3600.)
                elif timeleft > 60.:
                    timeleftstr = "%2.2f minutes"%(timeleft/60.)
                else:
                    timeleftstr = "%2.2f seconds"%timeleft
            else:
                timeleftstr = "who knows"

            # Terminate hanging runs
            outstandingset = self.rc.outstanding  # msg_ids submitted but whose results have not yet been received
            if len(outstandingset) > 0 and len(outstandingset) < nb_run_sim:  # at least one run still going and at least one finished
                avg_time_per_run = (time.time() - runStartTime)/float(nb_run_sim - len(outstandingset))  # average time per completed run
                if len(outstandingset) < tmplenoutstandingset:  # the scheduler has finished a run
                    tmplenoutstandingset = len(outstandingset)  # update this; should decrease by ~1 or the number of cores
                    tLastRunFinished = time.time()  # record the time the last simulation finished (right now)
                    #self.vprint("tmplenoutstandingset %d, tLastRunFinished %0.6f"%(tmplenoutstandingset,tLastRunFinished))
                if time.time() - tLastRunFinished > avg_time_per_run*(1. + self.maxNumEngines*2.)*4.:
                    #nb_run_sim = len(self.rc.outstanding)
                    #restartRuns = True
                    self.vprint('Aborting %d outstanding jobs'%len(self.rc.outstanding))
                    #runningPIDS = os.listdir('/proc') # get all running pids
                    self.vprint('queue_status')
                    self.vprint(str(self.rc.queue_status()))
                    self.rc.abort()
                    ar.wait(20)
                    runningPIDS = [int(tpid) for tpid in os.listdir('/proc') if tpid.isdigit()]
                    #[self.rc.queue_status()[eind] for eind in np.arange(self.maxNumEngines) if self.rc.queue_status()[eind]['tasks']>0]
                    for engineInd in [eind for eind in np.arange(self.maxNumEngines) if self.rc.queue_status()[eind]['tasks']>0]:
                        os.kill(engine_pids[engineInd], 15)  # send SIGTERM to the busy engine
                        time.sleep(20)
                    # for pid in [engine_pids[eind] for eind in np.arange(len(engine_pids))]:
                    #     if pid in runningPIDS:
                    #         os.kill(pid,9) # send kill command to stop this worker
                    stopIPClusterCommand = subprocess.Popen(['ipcluster','stop'])
                    stopIPClusterCommand.wait()
                    time.sleep(60)  # give ipcluster time to shut down instead of polling for termination
                    stopIPClusterCommand = subprocess.Popen(['ipcluster','stop'])
                    stopIPClusterCommand.wait()
                    time.sleep(60)  # give ipcluster time to shut down instead of polling for termination
                    hangingRunsOccurred = True  # record that hanging runs occurred
                    break
                    #stopIPClusterCommand.wait() # waits for process to terminate
                    #call(["ipcluster","stop"]) # send command to stop ipcluster
                    #self.rc.abort(jobs=self.rc.outstanding.copy().pop())
                    #self.rc.abort()#by default should abort all outstanding jobs... #it is possible that this will not stop the jobs running
                    #ar.wait(100)
                    #self.rc.purge_everything() # purge all results if outstanding *because rc.abort() didn't seem to do the job right
                    tLastRunFinished = time.time()  # update tLastRunFinished to the restart time (unreachable after the break above)

            print("%4i/%i tasks finished after %4i s. About %s to go." % (ar.progress, nb_run_sim, ar.elapsed, timeleftstr), end="")
            sys.stdout.flush()
        #numRunStarts += 1 # increment number of run restarts



        t2 = time.time()
        print("\nCompleted in %d sec" % (t2 - t1))
        
        if hangingRunsOccurred:  # hanging runs occurred; individual results were not collected
            res = [1]
        else:
            res = [ar.get() for ar in async_res]
        
        return res
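

# -----------------------------------------------------------------------------
# Usage sketch (added; illustrative assumptions only). In EXOSIMS this ensemble
# is normally constructed indirectly from a MissionSim specification; the JSON
# script path and the run_one helper below are hypothetical stand-ins.
def run_one(genNewPlanets=True, rewindPlanets=True, **kwargs):
    # Executed on each engine, where __init__ above has already created a global
    # SurveySimulation object named SS in the engine namespace.
    SS.run_sim()
    res = SS.DRM[:]
    SS.reset_sim(genNewPlanets=genNewPlanets, rewindPlanets=rewindPlanets)
    return res


if __name__ == "__main__":
    import json

    with open("sample_script.json") as f:  # hypothetical EXOSIMS input script
        specs = json.load(f)

    ens = IPClusterEnsemble(**specs)
    # The first (sim) argument is not used by this implementation.
    results = ens.run_ensemble(None, nb_run_sim=8, run_one=run_one)
    print("Collected %d simulation results." % len(results))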