def test22(self):
    pool = AsyncRequestsPool()
    x = self.ForTestingInterface()
    y = self.ForTestingInterface()
    request1 = x.sleep.asynchronous(0.5)
    request2 = y.sleep.asynchronous(1.5)
    finished_requests = []

    def handle_result(request, index):
        self.assertTrue(request.is_result_available())
        finished_requests.append(index)

    pool.add_request(request1, handle_result, [1])
    pool.add_request(request2, handle_result, [2])

    pool.wait()
    self.assertEqual(len(finished_requests), 1)
    self.assertEqual(len(pool), 1)

    pool.wait()
    self.assertEqual(len(finished_requests), 2)
    self.assertEqual(len(pool), 0)

    self.assertTrue(request1.is_result_available())
    self.assertTrue(request2.is_result_available())
    self.assertEqual(request1.result(), 0)
    self.assertEqual(request2.result(), 0)

    y.stop()
    x.stop()
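# The pattern exercised by test22, reduced to its essentials: add_request()
# optionally takes a callback plus a list of extra arguments, and each
# pool.wait() blocks until at least one pooled request has completed.
# A minimal sketch, assuming `worker` is any code interface whose calls
# support the .asynchronous() form:
pool = AsyncRequestsPool()
done = []

def on_done(request, tag):
    # invoked when the request finishes; `tag` comes from the extra-args list
    done.append((tag, request.result()))

pool.add_request(worker.sleep.asynchronous(0.5), on_done, ["first"])
while len(pool) > 0:  # each wait() completes at least one request
    pool.wait()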
# module-level context assumed: les_models, les_queue_threads, async_evolve,
# spio, log, step_les, finalize, errorFlag, les_spinup, plus the threading,
# sys, units and AsyncRequestsPool imports
def step_les_models(model_time, work_queue, offset=les_spinup):
    global errorFlag
    les_wall_times = []
    if not any(les_models):
        return les_wall_times
    if les_queue_threads >= len(les_models):
        # Step all dales models in parallel
        if async_evolve:
            # evolve all dales models with asynchronous Amuse calls
            reqs = []
            pool = AsyncRequestsPool()
            for les in les_models:
                req = les.evolve_model.asynchronous(model_time + (offset | units.s), exactEnd=True)
                reqs.append(req)
                pool.add_request(req)
            # now while the dales threads are working, sync the netcdf to disk
            spio.sync_root()
            # wait for all threads
            pool.waitall()
            try:
                les_wall_times = [r.result().value_in(units.s) for r in reqs]
                log.info("async step_les_models() done. Elapsed times: "
                         + str(['%5.1f' % t for t in les_wall_times]))
            except Exception as e:
                log.error("Exception caught while gathering results: %s" % e)
        else:
            # evolve all dales models using python threads
            threads = []
            for les in les_models:
                t = threading.Thread(target=step_les, args=(les, model_time, offset),
                                     name=str(les.grid_index))
                # t.setDaemon(True)
                threads.append(t)
                t.start()
            # now while the dales threads are working, sync the netcdf to disk
            spio.sync_root()
            # wait for all threads
            for t in threads:
                # log.info("Waiting to join thread %s..." % t.name)
                t.join()
                # log.info("joined thread %s" % t.name)
    elif les_queue_threads > 1:
        for les in les_models:
            work_queue.put((les, model_time))  # enqueue all dales instances
        # now while the dales threads are working, sync the netcdf to disk
        spio.sync_root()
        work_queue.join()  # wait for all dales work to be completed
        if errorFlag:
            log.info("One thread failed - exiting ...")
            # stop_worker_threads(work_queue)  # signal worker threads to quit -
            # now an atexit function, should not need it here
            finalize()
            sys.exit(1)
    else:
        # sequential version
        for les in les_models:
            step_les(les, model_time, offset)
    return les_wall_times
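# The core of the async branch above, extracted as a reference sketch:
# issue all evolve calls up front, overlap unrelated I/O with the model
# stepping, then collect the per-model wall times. `les_models`, `spio`
# and `t_end` stand in for the module-level objects used above.
pool = AsyncRequestsPool()
reqs = [les.evolve_model.asynchronous(t_end, exactEnd=True) for les in les_models]
for req in reqs:
    pool.add_request(req)
spio.sync_root()  # useful work that overlaps the model stepping
pool.waitall()    # block until every pooled request has finished
wall_times = [r.result().value_in(units.s) for r in reqs]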
def test29(self):
    pool = AsyncRequestsPool()
    x = self.ForTestingInterface()
    y = self.ForTestingInterface()
    sequenced_requests_indices = []

    def next_request(index):
        if index < 4:
            sequenced_requests_indices.append(index)
            return x.sleep.asynchronous(0.5)
        else:
            return None

    request1 = ASyncRequestSequence(next_request)
    request2 = y.sleep.asynchronous(1.0)
    finished_requests = []

    def handle_result(request, index):
        self.assertTrue(request.is_result_available())
        self.assertTrue(request.is_finished)
        finished_requests.append(index)

    pool.add_request(request1, handle_result, [1])
    pool.add_request(request2, handle_result, [2])

    pool.wait()
    self.assertEqual(len(finished_requests), 1)
    self.assertEqual(len(pool), 1)
    self.assertEqual(finished_requests, [2])
    self.assertTrue(len(sequenced_requests_indices) > 0)

    pool.wait()
    self.assertEqual(len(finished_requests), 2)
    self.assertEqual(len(pool), 0)

    x.sleep(0.1)
    self.assertEqual(sequenced_requests_indices, [0, 1, 2, 3])
    self.assertTrue(request1.is_result_available())
    self.assertTrue(request2.is_result_available())
    self.assertEqual(request1.result(), [0, 0, 0, 0])
    self.assertEqual(request2.result(), 0)

    y.stop()
    x.stop()
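# ASyncRequestSequence chains dependent calls: the factory is invoked with a
# running index, returns the next request to issue, and ends the sequence by
# returning None; the sequence is pooled like any single request and its
# result() is the list of the individual results. A minimal sketch (the
# import path is an assumption, matching that of AsyncRequestsPool):
from amuse.rfi.async_request import ASyncRequestSequence

def make_next(index):
    # issue three sleep calls one after another, then stop
    return x.sleep.asynchronous(0.1) if index < 3 else None

seq = ASyncRequestSequence(make_next)
pool.add_request(seq)
pool.waitall()
print(seq.result())  # one entry per issued call, e.g. [0, 0, 0]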
def test23(self):
    pool = AsyncRequestsPool()
    x = ForTestingInterface(channel_type='sockets')
    y = ForTestingInterface(channel_type='sockets')
    request1 = x.sleep.asynchronous(0.2)
    request2 = y.sleep.asynchronous(0.2)
    finished_requests = []

    def handle_result(request, index):
        self.assertTrue(request.is_result_available())
        finished_requests.append(index)

    pool.add_request(request1, handle_result, [1])
    pool.add_request(request2, handle_result, [2])

    time.sleep(1.0)

    pool.wait()
    pool.wait()
    self.assertEqual(len(finished_requests), 2)
    self.assertEqual(len(pool), 0)

    self.assertTrue(request1.is_result_available())
    self.assertTrue(request2.is_result_available())
    self.assertEqual(request1.result(), 0)
    self.assertEqual(request2.result(), 0)

    pool.wait()
    self.assertEqual(len(pool), 0)

    y.stop()
    x.stop()
def test25(self):
    """more tests of the pool: several calls on the same code instance"""
    from amuse.rfi.async_request import AsyncRequestsPool
    instance1 = ForTesting(self.exefile)
    r1 = instance1.do_sleep(1, return_request=True)
    r2 = instance1.echo_int(2, return_request=True)
    p1 = AsyncRequestsPool()
    r1.wait()
    r2.wait()
    p1.add_request(r1)
    p1.add_request(r2)
    #~ p1 = r1.join(r2)
    p1.waitall()
    self.assertEqual(r2.result(), 2)
    instance1.stop()
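# Note the return_request=True form used above: instead of blocking, the call
# returns an asynchronous request object that can be waited on or pooled, so
# several outstanding calls on a single code instance can be managed together.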
# stdlib imports needed by the class below; AsyncRequestsPool comes from
# amuse.rfi.async_request (RemoteCodeInterface, RemoteCodeException and Job
# are assumed to be defined elsewhere in this module)
import threading
import warnings
from collections import deque
from time import sleep

from amuse.rfi.async_request import AsyncRequestsPool


class JobServer(object):
    def __init__(self, hosts=[], channel_type="mpi", preamble=None,
                 retry_jobs=True, no_wait=True, verbose=True, max_retries=2,
                 use_threading=False):
        self.hosts = []
        self.job_list = deque()
        self.idle_codes = []
        self.retry_jobs = retry_jobs
        self.max_retries = max_retries
        self._finished_jobs = deque()
        self.preamble = preamble
        self.pool = AsyncRequestsPool()
        self.number_available_codes = 0
        self.number_starting_codes = 0
        self.no_wait = no_wait
        self.last_finished_job = None
        self.use_threading = use_threading
        self.verbose = verbose
        if self.verbose:
            print("AMUSE JobServer launching")
        self.add_hosts(hosts=hosts, channel_type=channel_type)

    def no_hosts(self):
        return self.number_available_codes == 0 and self.number_starting_codes == 0

    def add_hosts(self, hosts=[], channel_type="mpi"):
        self.hosts.extend(hosts)  # keep a flat list of host names
        if self.verbose:
            print("JobServer: connecting %i hosts" % len(hosts))
        if not self.use_threading:
            for host in hosts:
                self.number_starting_codes += 1
                self._startup(channel_type=channel_type, hostname=host, label=host,
                              copy_worker_code=True, redirection="none")
        else:
            threads = []
            for host in hosts:
                kwargs = dict(channel_type=channel_type, hostname=host, label=host,
                              copy_worker_code=True, redirection="none")
                threads.append(threading.Thread(target=self._startup, kwargs=kwargs))
            for thread in threads:
                self.number_starting_codes += 1
                thread.daemon = True
                thread.start()
            if not self.no_wait:
                if self.verbose:
                    print("... waiting")
                for thread in threads:
                    thread.join()
            else:
                if self.verbose:
                    print("... waiting for first available host")
                while self.number_available_codes == 0 and self.number_starting_codes > 0:
                    sleep(0.1)
        if self.no_wait:
            if self.verbose:
                print("JobServer: launched")
        else:
            if self.verbose:
                print("JobServer: launched with", len(self.idle_codes), "hosts")

    def _startup(self, *args, **kwargs):
        try:
            code = RemoteCodeInterface(*args, **kwargs)
        except Exception as ex:
            self.number_starting_codes -= 1
            print("JobServer: startup failed on", kwargs.get('hostname') or "default")
            print(ex)
        else:
            if self.preamble is not None:
                code.execute(self.preamble)
            self.number_available_codes += 1
            self.number_starting_codes -= 1
            if self.no_wait:
                # report progress whenever the count reaches a power of two
                if (self.number_available_codes & (self.number_available_codes - 1)) == 0:
                    if self.verbose:
                        print("JobServer: hosts now available:", self.number_available_codes)
                if self.number_starting_codes == 0:
                    if self.verbose:
                        print("JobServer: hosts in total:", self.number_available_codes)
            if self.job_list:
                self._add_job(self.job_list.popleft(), code)
            else:
                self.idle_codes.append(code)

    def exec_(self, arg):
        # wait until all hosts have started and all jobs have finished,
        # then run `arg` on every idle worker
        while self.number_starting_codes > 0:
            sleep(0.1)
        self.waitall()
        for code in self.idle_codes:
            code.execute(arg)

    def submit_job(self, f, args=(), kwargs={}):
        if len(self.pool) == 0 and not self.job_list:
            if self.verbose:
                print("JobServer: submitting first job on queue")
        job = Job(f, args, kwargs)
        self.job_list.append(job)
        if self.idle_codes:
            self._add_job(self.job_list.popleft(), self.idle_codes.pop())
        return job

    def wait(self):
        if self._finished_jobs:
            self.last_finished_job = self._finished_jobs.popleft()
            return True
        elif len(self.pool) == 0 and not self.job_list:
            if self.verbose:
                print("JobServer: no more jobs on queue or running")
            return False
        else:
            while len(self.pool) == 0 and self.job_list:
                if self.number_available_codes > 0:
                    raise Exception("JobServer: this should not happen")
                if self.number_starting_codes == 0:
                    raise Exception("JobServer: no codes available")
                sleep(0.1)  # a host is still starting; avoid a busy spin
            self.pool.wait()
            self.last_finished_job = self._finished_jobs.popleft()
            return True

    def waitall(self):
        while len(self.pool) == 0 and self.job_list:
            if self.number_available_codes > 0:
                raise Exception("JobServer: this should not happen")
            if self.number_starting_codes == 0:
                raise Exception("JobServer: no codes available")
            sleep(0.1)  # a host is still starting; avoid a busy spin
        while len(self.pool) > 0 or self.job_list:
            self.pool.wait()
            self.last_finished_job = self._finished_jobs[-1]

    @property
    def finished_jobs(self):
        while self._finished_jobs:
            yield self._finished_jobs.popleft()

    def _finalize_job(self, request, job, code):
        try:
            job.result = request.result()
            job.err = None
        except Exception as ex:
            job.result = None
            job.err = ex
        if job.err and not isinstance(job.err, RemoteCodeException):
            # the worker itself failed: drop it and requeue the job if allowed
            del code
            self.number_available_codes -= 1
            if self.retry_jobs and job.retries < self.max_retries:
                retry = Job(job.f, job.args, job.kwargs, job.retries + 1)
                self.job_list.append(retry)
        else:
            self.idle_codes.append(code)
        if self.job_list and self.idle_codes:
            self._add_job(self.job_list.popleft(), self.idle_codes.pop())
            if not self.job_list:
                if self.verbose:
                    print("JobServer: last job dispatched")
        self._finished_jobs.append(job)

    def _add_job(self, job, code):
        job.request = code.async_func(job.f, *job.args, **job.kwargs)
        self.pool.add_request(job.request, self._finalize_job, [job, code])

    def __del__(self):
        if not self.no_hosts():
            self.waitall()
        if self.job_list:
            warnings.warn("JobServer: Warning: shutting down with unfinished jobs")
        for code in self.idle_codes:
            code.stop()
        if self.number_starting_codes > 0:
            warnings.warn("JobServer: Warning: some hosts startup threads possibly blocking")
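# A usage sketch for the JobServer class above: the submitted function runs on
# a remote worker, wait() returns False once nothing is queued or running, and
# finished jobs expose .result and .err. The host names, the channel, and the
# import path are assumptions.
from amuse.ext.job_server import JobServer  # import path assumed

def square(x):
    return x * x

server = JobServer(hosts=["node1", "node2"], channel_type="sockets")
jobs = [server.submit_job(square, (i,)) for i in range(8)]
while server.wait():
    job = server.last_finished_job
    print(job.args, "->", job.err if job.err else job.result)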
import time

from amuse.units import units
from amuse.rfi.async_request import AsyncRequestsPool
# this fragment originally began mid-way through the construction of the two
# model instances; the interface class and its import path are assumed to be
# the OMUSE DALES wrapper
from omuse.community.dales.interface import Dales

d1 = Dales(channel_type='sockets', number_of_workers=1, case='bomex')
d2 = Dales(channel_type='sockets', number_of_workers=1, case='bomex')

# explicitly initialize the codes
# otherwise implicitly done when calling evolve_model
d1.commit_parameters()
d2.commit_parameters()
# add parameter redirection='none' to see DALES diagnostics output

target_time = 120 | units.s  # target time

# create a pool for managing asynchronous requests
t = time.time()
pool = AsyncRequestsPool()

# add requests to the two codes to the pool
request1 = d1.evolve_model.asynchronous(target_time, exactEnd=True)
pool.add_request(request1)
request2 = d2.evolve_model.asynchronous(target_time, exactEnd=True)
pool.add_request(request2)
print('Generating asynchronous requests %f s' % (time.time() - t))

# wait for the requests to finish
print('Calling pool.waitall()')
t = time.time()
pool.waitall()
print('pool.waitall() returned %f s' % (time.time() - t))
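# Hedged follow-up: after waitall() both requests hold their results. What
# evolve_model() returns depends on the interface, so only availability is
# asserted here before printing.
assert request1.is_result_available() and request2.is_result_available()
print('d1 result:', request1.result())
print('d2 result:', request2.result())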