def start(self):
    """
    Start this manager and all remote managers.

    If some managers fail to start, errors are logged and the
    corresponding host's state is set to ``failed``. You can use
    ``len(cluster)`` to determine how many remote managers are
    available. A :class:`RuntimeError` is raised if no managers were
    successfully started.
    """
    super(Cluster, self).start()
    listener = connection.Listener(address=(self._hostname, 0),
                                   authkey=self._authkey,
                                   backlog=5)  # Default is 1.

    # Start managers in a separate thread to avoid losing connections.
    starter = threading.Thread(target=self._start_hosts,
                               args=(listener.address, get_credentials()))
    starter.daemon = True
    starter.start()

    # Accept callback connections from started managers.
    waiting = ['']
    retry = 0
    while waiting:
        host_processed = False
        for host in self._hostlist:
            host.poll()
            if host.state == 'started':
                # Accept connection from *any* host.
                _LOGGER.debug('waiting for a connection, host %s',
                              host.hostname)
                # A normal accept() can hang, so accept in a daemon
                # thread and give up after 30 seconds.
                retval = []
                accepter = threading.Thread(target=self._accept,
                                            args=(listener, retval),
                                            name='ClusterAccepter')
                accepter.daemon = True
                accepter.start()
                accepter.join(30)
                if accepter.is_alive():
                    msg = 'timeout waiting for reply from %s' \
                          % [host.hostname for host in self._hostlist
                             if host.state == 'started']
                    _LOGGER.error(msg)
                    for host in self._hostlist:
                        if host.state == 'started':
                            if host.proc is not None:
                                host.proc.terminate()
                            if host.reverse_cleanup is not None:
                                host.reverse_cleanup[0](
                                    *host.reverse_cleanup[1:])
                            host.state = 'failed'
                    continue

                conn = retval[0]
                i, address, pubkey_text = conn.recv()
                conn.close()
                other_host = self._hostlist[i]
                if address is None:
                    _LOGGER.error('Host %s died: %s', other_host.hostname,
                                  pubkey_text)  # Exception text.
                    other_host.state = 'failed'
                    continue

                try:
                    other_host.manager = \
                        HostManager.from_address(address, self._authkey,
                                                 other_host)
                except Exception as exc:
                    _LOGGER.error("Can't start manager for %s: %s",
                                  other_host.hostname,
                                  str(exc) or repr(exc))
                    if other_host.proc is not None:
                        other_host.proc.terminate()
                    other_host.state = 'failed'
                    continue
                else:
                    other_host.state = 'up'
                    if pubkey_text:
                        other_host.manager._pubkey = \
                            decode_public_key(pubkey_text)
                    host_processed = True
                    _LOGGER.debug('Host %s is now up', other_host.hostname)
                    self._up.append(other_host)

        # See if there are still hosts to wait for.
        waiting = []
        for host in self._hostlist:
            host.poll()
            if host.state == 'init' or host.state == 'started':
                waiting.append(host)

        if waiting:
            if not host_processed:
                retry += 1
                if retry < 300:  # ~60 seconds total.
                    time.sleep(0.2)
                else:
                    _LOGGER.warning('Cluster startup timeout,'
                                    ' hosts not started:')
                    for host in waiting:
                        _LOGGER.warning('    %s (%s) in dir %s',
                                        host.hostname, host.state,
                                        host.tempdir)
                    break
        else:
            break

    self._up = sorted(self._up, key=lambda host: host.hostname)

    # So our class-defined shutdown() is called before the shutdown()
    # installed by the superclass.
    self._base_shutdown = self.shutdown
    del self.shutdown

    if len(self._up) < 1:
        raise RuntimeError('No hosts successfully started')
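
# The _accept() helper used above is not shown in this section. A minimal
# sketch, assuming its only job is to hand the accepted connection back to
# the main thread via the mutable `retval` list so it survives the timed
# join() (the actual implementation may differ):

def _accept(self, listener, retval):
    """ Accept one connection on `listener` and pass it back via `retval`. """
    retval.append(listener.accept())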
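
# For reference, the callback protocol that start() expects: each remote
# host connects back to `listener.address` with the shared authkey and
# sends one (index, manager_address, pubkey_text) tuple; on failure,
# manager_address is None and pubkey_text carries the exception text.
# A sketch of the sending side follows; the name _send_callback and its
# signature are illustrative, not the actual remote bootstrap code.

from multiprocessing import connection

def _send_callback(callback_address, authkey, index, manager_address,
                   pubkey_text):
    """ Report one host's startup result back to the cluster listener. """
    conn = connection.Client(callback_address, authkey=authkey)
    try:
        conn.send((index, manager_address, pubkey_text))
    finally:
        conn.close()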