def test_join_timeout(self):
    """ Verify cleanup() copes with a worker still busy on a task. """
    logging.debug('')
    logging.debug('test_join_timeout')
    # Hand a worker a long-running task and clean up immediately,
    # without waiting for the reply; cleanup() must handle the join.
    busy_q = WorkerPool.get()
    busy_q.put((time.sleep, (3,), {}, self.reply_q))
    WorkerPool.cleanup()
def test_never_released(self):
    """ Verify cleanup() handles a worker that was never released. """
    logging.debug('')
    logging.debug('test_never_released')
    # Acquire a worker and deliberately skip WorkerPool.release().
    worker = WorkerPool.get()
    worker.put((self.add, (1,), {}, self.reply_q))
    WorkerPool.cleanup()
def test_never_released(self):
    """ cleanup() must reclaim a worker its user forgot to release. """
    logging.debug('')
    logging.debug('test_never_released')
    request = (self.add, (1,), {}, self.reply_q)
    unreleased_q = WorkerPool.get()
    unreleased_q.put(request)
    WorkerPool.cleanup()
def test_join_timeout(self):
    """ cleanup() must not hang when a worker is mid-task. """
    logging.debug('')
    logging.debug('test_join_timeout')
    task = (time.sleep, (3,), {}, self.reply_q)
    sleeper_q = WorkerPool.get()
    sleeper_q.put(task)
    WorkerPool.cleanup()
def max_servers(self, resource_desc):
    """
    Returns the total of :meth:`max_servers` across all
    :class:`LocalAllocator` in the cluster.

    resource_desc: dict
        Description of required resources.
    """
    credentials = get_credentials()

    key = 'allocator'
    value = resource_desc.get(key, '')
    if value:
        if self.name != value:
            # Request is pinned to some other allocator.
            return 0
        # Any host in our cluster is OK.
        resource_desc = resource_desc.copy()
        del resource_desc[key]

    with self._lock:
        # Discard any stale replies left over from earlier operations.
        while True:
            try:
                self._reply_q.get_nowait()
            except Queue.Empty:
                break

        # Fan out count requests to at most max_workers worker threads;
        # the remaining allocators wait in 'backlog'.
        backlog = []
        max_workers = 10
        for index, allocator in enumerate(self._allocators.values()):
            if index < max_workers:
                worker_q = WorkerPool.get()
                worker_q.put((self._get_count,
                              (allocator, resource_desc, credentials),
                              {}, self._reply_q))
            else:
                backlog.append(allocator)

        # Collect one reply per allocator, feeding the backlog to each
        # worker as it frees up, and sum the returned counts.
        total = 0
        for _ in range(len(self._allocators)):
            worker_q, retval, exc, trace = self._reply_q.get()
            if exc:
                self._logger.error(trace)
                raise exc
            if backlog:
                next_allocator = backlog.pop(0)
                worker_q.put((self._get_count,
                              (next_allocator, resource_desc, credentials),
                              {}, self._reply_q))
            else:
                WorkerPool.release(worker_q)
            if retval:
                total += retval
    return total
def test_basic(self):
    """ Round-trip a single request through a pooled worker. """
    logging.debug('')
    logging.debug('test_basic')
    worker = WorkerPool.get()
    worker.put((self.add, (1,), {}, self.reply_q))
    # The reply must identify the same worker queue, carry the expected
    # return value with no exception, and the side effect must be visible.
    done_q, retval, exc, trace = self.reply_q.get()
    self.assertEqual(done_q, worker)
    self.assertEqual(retval, -1)
    self.assertEqual(exc, None)
    self.assertEqual(trace, None)
    self.assertEqual(self.total, 1)
    WorkerPool.release(worker)
    WorkerPool.cleanup()
def test_basic(self):
    """ Submit one task and check every field of the reply tuple. """
    logging.debug('')
    logging.debug('test_basic')
    request_q = WorkerPool.get()
    request = (self.add, (1,), {}, self.reply_q)
    request_q.put(request)
    reply = self.reply_q.get()
    done_q, retval, exc, trace = reply
    self.assertEqual(done_q, request_q)
    self.assertEqual(retval, -1)
    self.assertEqual(exc, None)
    self.assertEqual(trace, None)
    # add() ran exactly once with argument 1.
    self.assertEqual(self.total, 1)
    WorkerPool.release(request_q)
    WorkerPool.cleanup()
def test_exception(self):
    """ A task that raises must be reported via the reply tuple. """
    logging.debug('')
    logging.debug('test_exception')
    tail = "TypeError: unsupported operand type(s) for +=: 'int' and 'NoneType'\n"
    worker = WorkerPool.get()
    # Passing None makes add() raise inside the worker thread.
    worker.put((self.add, (None,), {}, self.reply_q))
    done_q, retval, exc, trace = self.reply_q.get()
    self.assertEqual(done_q, worker)
    self.assertEqual(retval, None)
    self.assertEqual(type(exc), TypeError)
    self.assertTrue(trace.endswith(tail))
    # The failed call must not have mutated self.total.
    self.assertEqual(self.total, 0)
    WorkerPool.release(worker)
    WorkerPool.cleanup()
def test_exception(self):
    """ Worker-side exceptions come back as (None, exc, traceback). """
    logging.debug('')
    logging.debug('test_exception')
    tail = "TypeError: unsupported operand type(s) for +=: 'int' and 'NoneType'\n"
    failing_q = WorkerPool.get()
    failing_q.put((self.add, (None,), {}, self.reply_q))
    reply_worker, result, error, traceback_text = self.reply_q.get()
    self.assertEqual(reply_worker, failing_q)
    self.assertEqual(result, None)
    self.assertEqual(type(error), TypeError)
    self.assertTrue(traceback_text.endswith(tail))
    self.assertEqual(self.total, 0)
    WorkerPool.release(failing_q)
    WorkerPool.cleanup()
def _start_hosts(self, address, credentials):
    """
    Start host managers. Sequence for each host is:
    1. Check connectivity via simple 'ssh' call.
    2. Send startup files.
    3. Invoke remote Python process. (state 'started')
    4. Receive remote connection information. (state 'up')
    """
    # Fan out the first batch of hosts; the remainder wait in 'pending'.
    pending = []
    max_workers = 5  # Somewhat related to listener backlog.
    for index, host in enumerate(self._hostlist):
        if index >= max_workers:
            pending.append(host)
            continue
        worker_q = WorkerPool.get()
        _LOGGER.info('Starting host %s...', host.hostname)
        worker_q.put((self._start_manager,
                      (host, index, address, credentials),
                      {}, self._reply_q))

    # As each worker reports back, hand it the next pending host or
    # return it to the pool once the backlog is empty.
    for index in range(len(self._hostlist)):
        worker_q, host, exc, trace = self._reply_q.get()
        if exc:
            _LOGGER.error(trace)
            raise exc
        _LOGGER.debug('Host %r state %s', host.hostname, host.state)
        if pending:
            next_host = pending.pop(0)
            _LOGGER.info('Starting host %s...', next_host.hostname)
            worker_q.put((self._start_manager,
                          (next_host, index + max_workers, address, credentials),
                          {}, self._reply_q))
        else:
            WorkerPool.release(worker_q)
def _start_hosts(self, address, credentials):
    """
    Start host managers. Sequence for each host is:
    1. Check connectivity via simple 'ssh' call.
    2. Send startup files.
    3. Invoke remote Python process. (state 'started')
    4. Receive remote connection information. (state 'up')
    """
    max_workers = 5  # Somewhat related to listener backlog.

    # Launch up to max_workers hosts immediately; queue the rest.
    deferred = []
    for slot, host in enumerate(self._hostlist):
        if slot < max_workers:
            queue = WorkerPool.get()
            _LOGGER.info('Starting host %s...', host.hostname)
            queue.put((self._start_manager,
                       (host, slot, address, credentials),
                       {}, self._reply_q))
        else:
            deferred.append(host)

    # One reply arrives per host; recycle each finishing worker onto a
    # deferred host until none remain, then release the worker.
    for reply_num in range(len(self._hostlist)):
        queue, host, exc, trace = self._reply_q.get()
        if exc:
            _LOGGER.error(trace)
            raise exc
        _LOGGER.debug('Host %r state %s', host.hostname, host.state)
        try:
            upcoming = deferred.pop(0)
        except IndexError:
            WorkerPool.release(queue)
        else:
            _LOGGER.info('Starting host %s...', upcoming.hostname)
            queue.put((self._start_manager,
                       (upcoming, reply_num + max_workers, address, credentials),
                       {}, self._reply_q))
def time_estimate(self, resource_desc):
    """
    Returns ``(estimate, criteria)`` indicating how well this allocator
    can satisfy the `resource_desc` request. The estimate will be:

    - >0 for an estimate of walltime (seconds).
    - 0 for no estimate.
    - -1 for no resource at this time.
    - -2 for no support for `resource_desc`.

    The returned criteria is a dictionary containing information related
    to the estimate, such as hostnames, load averages, unsupported
    resources, etc.

    This allocator polls each :class:`LocalAllocator` in the cluster to
    find the best match and returns that. The best allocator is saved in
    the returned criteria for a subsequent :meth:`deploy`.

    resource_desc: dict
        Description of required resources.
    """
    credentials = get_credentials()

    key = "allocator"
    value = resource_desc.get(key, "")
    if value:
        if self.name != value:
            # Request is pinned to some other allocator: unsupported.
            return (-2, {key: value})
        else:
            # Any host in our cluster is OK.
            resource_desc = resource_desc.copy()
            del resource_desc[key]

    n_cpus = resource_desc.get("n_cpus", 0)
    if n_cpus:
        # Spread across LocalAllocators.
        resource_desc = resource_desc.copy()
        resource_desc["n_cpus"] = 1

    with self._lock:
        best_estimate = -2
        best_criteria = None
        best_allocator = None

        # Prefer not to repeat use of just-used allocator.
        prev_estimate = -2
        prev_criteria = None
        prev_allocator = self._last_deployed
        self._last_deployed = None

        # Drain _reply_q.
        while True:
            try:
                self._reply_q.get_nowait()
            except Queue.Empty:
                break

        # Get estimates via worker threads.
        todo = []
        max_workers = 10
        for i, allocator in enumerate(self._allocators.values()):
            if i < max_workers:
                worker_q = WorkerPool.get()
                worker_q.put((self._get_estimate,
                              (allocator, resource_desc, credentials),
                              {}, self._reply_q))
            else:
                # Deferred until a worker frees up below.
                todo.append(allocator)

        # Process estimates.
        host_loads = []  # Sorted list of (hostname, load)
        for i in range(len(self._allocators)):
            worker_q, retval, exc, trace = self._reply_q.get()
            if exc:
                # Log and skip this allocator; keep collecting replies.
                self._logger.error(trace)
                retval = None

            # Recycle the worker onto the next pending allocator,
            # or release it if none remain.
            try:
                next_allocator = todo.pop(0)
            except IndexError:
                WorkerPool.release(worker_q)
            else:
                worker_q.put((self._get_estimate,
                              (next_allocator, resource_desc, credentials),
                              {}, self._reply_q))

            if retval is None:
                continue
            allocator, estimate, criteria = retval
            if estimate is None:
                continue

            # Update loads.
            if estimate >= 0 and n_cpus:
                load = criteria["loadavgs"][0]
                new_info = (criteria["hostnames"][0], load)
                if host_loads:
                    # Insertion keeps host_loads sorted by ascending load.
                    # NOTE(review): loop variable 'i' shadows the outer
                    # reply counter; harmless since the outer 'i' is unused
                    # after this point.
                    for i, info in enumerate(host_loads):
                        if load < info[1]:
                            host_loads.insert(i, new_info)
                            break
                    else:
                        host_loads.append(new_info)
                else:
                    host_loads.append(new_info)

            # Update best estimate.
            if allocator is prev_allocator:
                prev_estimate = estimate
                prev_criteria = criteria
            elif (best_estimate <= 0 and estimate > best_estimate) or \
                 (best_estimate > 0 and estimate < best_estimate):
                best_estimate = estimate
                best_criteria = criteria
                best_allocator = allocator
            elif best_estimate == 0 and estimate == 0:
                # Tie between two 'no estimate' answers: prefer the
                # lighter-loaded host.
                best_load = best_criteria["loadavgs"][0]
                load = criteria["loadavgs"][0]
                if load < best_load:
                    best_estimate = estimate
                    best_criteria = criteria
                    best_allocator = allocator

        # If no alternative, repeat use of previous allocator.
        if best_estimate < 0 and prev_estimate >= 0:
            best_estimate = prev_estimate
            best_criteria = prev_criteria
            best_allocator = prev_allocator

        # Save best allocator in criteria in case we're asked to deploy.
        if best_criteria is not None:
            best_criteria["allocator"] = best_allocator
            # Save n_cpus hostnames in criteria.
            best_criteria["hostnames"] = [host_loads[i][0]
                                          for i in range(min(n_cpus, len(host_loads)))]

        return (best_estimate, best_criteria)