def _gather_node_ip_addresses(self, nodes, lapse, ssh_timeout, remake=False):
    """
    Connect via SSH to each node and record its host keys.

    Every node still in `nodes` is tried once per round; between
    rounds the method sleeps ``self.polling_interval`` seconds.  The
    host keys of each node that answers are added to the keys loaded
    from the cluster's "known hosts" file and saved through
    ``self._save_keys_to_known_hosts_file``.

    :param nodes: mutable collection of nodes to contact.  Nodes that
        could be reached are removed from it *in place*.
    :param lapse: overall time budget in seconds, handed to `timeout`.
    :param ssh_timeout: passed as ``timeout=`` to ``node.connect``.
    :param bool remake: if true, an existing "known hosts" file is
        deleted first so that it gets recreated; prevents
        "Invalid host key" errors.

    :return: the same `nodes` collection, now holding only the nodes
        that could not be reached within `lapse` seconds.
    """
    # for convenience, we might set this to ``None`` if the file cannot
    # be opened -- but we do not want to forget the cluster-wide
    # setting in case the error is transient
    known_hosts_path = self.known_hosts_file

    # If run with remake=True, deletes known_hosts_file so that it will
    # be recreated. Prevents "Invalid host key" errors
    if remake and os.path.isfile(known_hosts_path):
        os.remove(known_hosts_path)

    # Create the file if it's not present, otherwise loading the host
    # keys below will raise an error
    try:
        with open(known_hosts_path, 'a'):
            pass
    except IOError as err:
        log.warning("Error opening SSH 'known hosts' file `%s`: %s",
                    known_hosts_path, err)
        known_hosts_path = None

    keys = paramiko.hostkeys.HostKeys(known_hosts_path)

    with timeout(lapse, raise_timeout_error):
        try:
            while nodes:
                # iterate over a copy: reachable nodes are removed
                # from `nodes` while looping
                for node in copy(nodes):
                    ssh = node.connect(
                        keyfile=known_hosts_path,
                        timeout=ssh_timeout)
                    if not ssh:
                        continue
                    try:
                        log.info("Connection to node `%s` successful,"
                                 " using IP address %s to connect.",
                                 node.name, node.connection_ip())
                        # Add host keys to the keys object.
                        for host, key in ssh.get_host_keys().items():
                            for keytype, keydata in key.items():
                                keys.add(host, keytype, keydata)
                        self._save_keys_to_known_hosts_file(keys)
                        nodes.remove(node)
                    finally:
                        # The connection is only needed to harvest the
                        # host keys; close it so that sockets do not
                        # pile up, one per node.
                        # NOTE(review): assumes `node.connect` returns a
                        # fresh paramiko ``SSHClient`` that nobody else
                        # keeps using -- confirm against `Node.connect`.
                        ssh.close()
                if nodes:
                    time.sleep(self.polling_interval)

        except TimeoutError:
            log.error(
                "Some nodes of the cluster were unreachable"
                " within the given %d-seconds timeout: %s",
                lapse, ', '.join(node.name for node in nodes))

    # return the nodes that are still unreachable
    return nodes
def _gather_node_ip_addresses(self, nodes, lapse):
    """
    Connect via SSH to each node and record its host keys.

    Every node still in `nodes` is tried once per round; between
    rounds the method sleeps ``self.polling_interval`` seconds.  The
    host keys of each node that answers are added to the keys loaded
    from the cluster's "known hosts" file and saved through
    ``self._save_keys_to_known_hosts_file``.

    :param nodes: mutable collection of nodes to contact.  Nodes that
        could be reached are removed from it *in place*.
    :param lapse: overall time budget in seconds, handed to `timeout`.

    :return: the same `nodes` collection, now holding only the nodes
        that could not be reached within `lapse` seconds.
    """
    # for convenience, we might set this to ``None`` if the file cannot
    # be opened -- but we do not want to forget the cluster-wide
    # setting in case the error is transient
    known_hosts_path = self.known_hosts_file

    # Create the file if it's not present, otherwise loading the host
    # keys below will raise an error
    try:
        with open(known_hosts_path, 'a'):
            pass
    except IOError as err:
        log.warning("Error opening SSH 'known hosts' file `%s`: %s",
                    known_hosts_path, err)
        known_hosts_path = None

    keys = paramiko.hostkeys.HostKeys(known_hosts_path)

    with timeout(lapse, raise_timeout_error):
        try:
            while nodes:
                # iterate over a copy: reachable nodes are removed
                # from `nodes` while looping
                for node in copy(nodes):
                    ssh = node.connect(keyfile=known_hosts_path)
                    if not ssh:
                        continue
                    try:
                        log.info("Connection to node `%s` successful,"
                                 " using IP address %s to connect.",
                                 node.name, node.connection_ip())
                        # Add host keys to the keys object.
                        for host, key in ssh.get_host_keys().items():
                            for keytype, keydata in key.items():
                                keys.add(host, keytype, keydata)
                        self._save_keys_to_known_hosts_file(keys)
                        nodes.remove(node)
                    finally:
                        # The connection is only needed to harvest the
                        # host keys; close it so that sockets do not
                        # pile up, one per node.
                        # NOTE(review): assumes `node.connect` returns a
                        # fresh paramiko ``SSHClient`` that nobody else
                        # keeps using -- confirm against `Node.connect`.
                        ssh.close()
                if nodes:
                    time.sleep(self.polling_interval)

        except TimeoutError:
            log.error(
                "Some nodes of the cluster were unreachable"
                " within the given %d-seconds timeout: %s",
                lapse, ', '.join(node.name for node in nodes))

    # return the nodes that are still unreachable
    return nodes
def _check_starting_nodes(self, nodes, lapse):
    """
    Poll the given nodes until all are alive, for at most `lapse` seconds.

    On each round the nodes whose ``is_alive()`` is false are kept as
    the set still to wait for; the method then sleeps
    ``self.polling_interval`` seconds before asking again.

    :param nodes: collection of nodes to wait for.
    :param lapse: overall time budget in seconds, handed to `timeout`.

    :return: the nodes that were not yet alive when polling stopped
        (empty when every node came up), so callers can exclude them
        from coming rounds.
    """
    pending = nodes
    with timeout(lapse, raise_timeout_error):
        try:
            while pending:
                # keep only the nodes that still do not answer
                pending = set(
                    candidate for candidate in pending
                    if not candidate.is_alive())
                if not pending:
                    break
                log.debug("Waiting for %d more nodes to come up ...",
                          len(pending))
                time.sleep(self.polling_interval)
        except TimeoutError:
            laggards = ', '.join(straggler.name for straggler in pending)
            log.error("Some nodes did not start correctly"
                      " within the given %d-seconds timeout: %s",
                      lapse, laggards)

    # hand back the not-yet-started nodes
    return pending