def test_temporarysshkey_create(self):
    """
    Verify create of TemporarySSHKey creates a new key.
    """
    key = TemporarySSHKey(TEST_HOST, logging.getLogger())
    key.create()
    try:
        self.assertTrue(os.path.isfile(key.path))
    finally:
        # Clean up even when the assertion above fails so a failing run
        # does not leave the key file behind on disk.
        if key.path and os.path.isfile(key.path):
            os.unlink(key.path)
def test_temporarysshkey_remove(self):
    """
    Verify a created TemporarySSHKey is deleted from disk by remove().
    """
    log = logging.getLogger()
    ssh_key = TemporarySSHKey(TEST_HOST, log)
    ssh_key.create()

    # The key file has to be present before it is removed ...
    present_before = os.path.isfile(ssh_key.path)
    self.assertTrue(present_before)

    ssh_key.remove()

    # ... and must no longer be a file afterwards.
    present_after = os.path.isfile(ssh_key.path)
    self.assertFalse(present_after)
def test_temporarysshkey_contextmanager(self):
    """
    Verify the key file exists inside a ``with`` block and is gone after it.
    """
    manager = TemporarySSHKey(TEST_HOST, logging.getLogger())
    with manager as ssh_key:
        inside = os.path.isfile(ssh_key.path)
        self.assertTrue(inside)
    # Leaving the block must have removed the file.
    outside = os.path.isfile(ssh_key.path)
    self.assertFalse(outside)
def test_temporarysshkey__init(self):
    """
    Verify init of TemporarySSHKey sets up the instances.
    """
    key = TemporarySSHKey(TEST_HOST, logging.getLogger())
    # There should be no path yet.
    # assertIsNone replaces the deprecated assertEquals alias, which was
    # removed from unittest in Python 3.12.
    self.assertIsNone(key.path)
def _check(self, address):
    """
    Initiates a check on the requested host.

    The host is loaded through a ``storage.get`` request, checked for
    availability over the transport using a temporary SSH key, and then
    written back through ``storage.save`` whether or not the check
    succeeded.

    :param address: Host address to investigate
    :type address: str
    :raises: Exception -- re-raised after logging when the host can not
        be loaded or when the availability check fails
    """
    # Statuses follow:
    # http://commissaire.readthedocs.org/en/latest/enums.html#host-statuses
    self.logger.info('Checking host "{}".'.format(address))
    try:
        response = self.request('storage.get', params={
            'model_type_name': 'Host',
            'model_json_data': Host.new(address=address).to_json(),
            'secure': True,
        })
        host = Host.new(**response['result'])
    except Exception as error:
        self.logger.warn('Unable to continue for host "{}" due to '
                         '{}: {}. Returning...'.format(
                             address, type(error), error))
        raise

    transport = ansibleapi.Transport(host.remote_user)

    with TemporarySSHKey(host, self.logger) as key:
        try:
            self.logger.debug(
                'Starting watcher run for host "{}"'.format(address))
            result = transport.check_host_availability(host, key.path)
            host.last_check = datetime.utcnow().isoformat()
            self.logger.debug('Watcher result for host {}: {}'.format(
                address, result))
        except Exception as error:
            self.logger.warn(
                'Failed to connect to host node "{}"'.format(address))
            # Report the exception type and its message; previously the
            # message was formatted twice and the type was never shown.
            self.logger.debug(
                'Watcher failed for host node "{}" with {}: {}'.format(
                    address, type(error), error))
            host.status = 'failed'
            raise
        finally:
            # Save the model (runs on success and on failure)
            self.request('storage.save', params={
                'model_type_name': host.__class__.__name__,
                'model_json_data': host.to_json(),
            })
        self.logger.info(
            'Finished watcher run for host "{}"'.format(address))
def test_temporarysshkey_remove_failure(self):
    """
    Verify a failing unlink leaves the key file in place and logs a warning.
    """
    log_double = mock.MagicMock(logging.Logger('test'))
    ssh_key = TemporarySSHKey(TEST_HOST, log_double)
    ssh_key.create()

    # Make every os.unlink call raise while the patch is active.
    with mock.patch('os.unlink', side_effect=Exception):
        exists_before = os.path.isfile(ssh_key.path)
        self.assertTrue(exists_before)
        ssh_key.remove()
        exists_after = os.path.isfile(ssh_key.path)
        self.assertTrue(exists_after)
        # We should have a warning in the log
        log_double.warn.assert_called_once()

    # Clean up the file now that os.unlink is real again
    ssh_key.remove()
def test_temporarysshkey_remove_failure(self):
    """
    Verify remove() survives an unlink error, keeps the file and warns once.
    """
    fake_logger = mock.MagicMock(logging.Logger('test'))
    temp_key = TemporarySSHKey(TEST_HOST, fake_logger)
    temp_key.create()

    unlink_patch = mock.patch('os.unlink')
    with unlink_patch as broken_unlink:
        # Any attempt to delete the file raises while patched.
        broken_unlink.side_effect = Exception
        self.assertTrue(os.path.isfile(temp_key.path))
        temp_key.remove()
        # The file must still be there because the delete failed.
        self.assertTrue(os.path.isfile(temp_key.path))
        # We should have a warning in the log
        fake_logger.warn.assert_called_once_with(mock.ANY)

    # Clean up the file with the real os.unlink
    temp_key.remove()
def _check(self, address):
    """
    Initiates a check on the requested host.

    The host and its credentials are loaded from storage, the host is
    checked for availability over the transport using a temporary SSH
    key, and the host model is saved whether or not the check succeeded.

    :param address: Host address to investigate
    :type address: str
    :raises: Exception -- re-raised after logging when the availability
        check fails
    """
    # Statuses follow:
    # http://commissaire.readthedocs.org/en/latest/enums.html#host-statuses
    self.logger.info('Checking host "{}".'.format(address))

    host = self.storage.get_host(address)
    host_creds = self.storage.get(HostCreds.new(address=host.address))
    transport = ansibleapi.Transport(host_creds.remote_user)

    with TemporarySSHKey(host_creds, self.logger) as key:
        try:
            self.logger.debug(
                'Starting watcher run for host "{}"'.format(address))
            result = transport.check_host_availability(host, key.path)
            host.last_check = formatted_dt()
            self.logger.debug('Watcher result for host {}: {}'.format(
                address, result))
        except Exception as error:
            self.logger.warn(
                'Failed to connect to host node "{}"'.format(address))
            # Report the exception type and its message; previously the
            # message was formatted twice and the type was never shown.
            self.logger.debug(
                'Watcher failed for host node "{}" with {}: {}'.format(
                    address, type(error), error))
            host.status = C.HOST_STATUS_FAILED
            raise
        finally:
            # Save the model (runs on success and on failure)
            self.storage.save(host)
        self.logger.info(
            'Finished watcher run for host "{}"'.format(address))
def on_investigate(self, message, address, cluster_data=None):
    """
    Initiates an investigation of the requested host.

    Runs three stages, saving the host model after each one: gather
    facts from the host, bootstrap it, and register it with the cluster's
    container manager when one is configured. The temporary SSH key is
    removed when the method exits, on success or failure.

    :param message: A message instance
    :type message: kombu.message.Message
    :param address: Host address to investigate
    :type address: str
    :param cluster_data: Optional data for the associated cluster
    :type cluster_data: dict
    :returns: JSON representation of the host after bootstrapping
    :rtype: str
    :raises: Exception -- any stage failure is logged and re-raised
    """
    # Statuses follow:
    # http://commissaire.readthedocs.org/en/latest/enums.html#host-statuses
    if cluster_data is None:
        # Fresh dict per call rather than a shared mutable default.
        cluster_data = {}

    self.logger.info('{} is now in investigating.'.format(address))
    self.logger.debug('Investigating: {}'.format(address))
    if cluster_data:
        self.logger.debug('Related cluster: {}'.format(cluster_data))

    host = self.storage.get_host(address)
    host_creds = self.storage.get(HostCreds.new(address=host.address))
    # Use the credentials record for the remote user, matching the SSH
    # key below which is also built from host_creds.
    transport = ansibleapi.Transport(host_creds.remote_user)

    key = TemporarySSHKey(host_creds, self.logger)
    try:
        key.create()
    except Exception as error:
        self.logger.warn('Unable to continue for {} due to '
                         '{}: {}. Returning...'.format(
                             address, type(error), error))
        raise

    # The key now exists; the outer ``finally`` guarantees its removal no
    # matter which stage (or which intermediate save) fails.
    try:
        try:
            facts = transport.get_info(address, key.path)
            # recreate the host instance with new data
            data = json.loads(host.to_json())
            data.update(facts)
            host = Host.new(**data)
            host.last_check = formatted_dt()
            host.status = C.HOST_STATUS_BOOTSTRAPPING
            self.logger.info('Facts for {} retrieved'.format(address))
            self.logger.debug('Data: {}'.format(host.to_json()))
        except Exception as error:
            self.logger.warn('Getting info failed for {}: {}'.format(
                address, str(error)))
            host.status = C.HOST_STATUS_FAILED
            raise
        finally:
            # Save the updated host model.
            self.storage.save(host)

        self.logger.info(
            'Finished and stored investigation data for {}'.format(address))
        self.logger.debug('Finished investigation update for {}: {}'.format(
            address, host.to_json()))

        self.logger.info('{} is now in bootstrapping'.format(address))
        oscmd = get_oscmd(host.os)
        container_manager = None
        try:
            etcd_config = self._get_etcd_config()
            cluster, network = self._get_cluster_and_network_models(
                cluster_data)

            if cluster:
                if cluster.container_manager:
                    container_manager = cluster.container_manager
                    self.logger.info(
                        'Using cluster "{}" managed by "{}"'.format(
                            cluster.name, container_manager))
                else:
                    self.logger.info('Using unmanaged cluster "{}"'.format(
                        cluster.name))

            self.logger.info('Using network "{}" of type "{}"'.format(
                network.name, network.type))

            transport.bootstrap(
                address, key.path, oscmd, etcd_config, network)
            host.status = C.HOST_STATUS_DISASSOCIATED
        except Exception as error:
            self.logger.warn('Unable to start bootstraping for {}: {}'.format(
                address, str(error)))
            host.status = C.HOST_STATUS_FAILED
            raise
        finally:
            # Save the updated host model.
            self.storage.save(host)

        # Register with container manager (if applicable).
        try:
            if container_manager:
                self.request('container.register_node',
                             container_manager, address)
                host.status = C.HOST_STATUS_ACTIVE
        except Exception as error:
            # Format the exception itself: ``error.args`` may be empty,
            # which would raise IndexError inside this handler.
            self.logger.warn(
                'Unable to register {} to container manager "{}": {}'.format(
                    address, container_manager, error))
            raise
        finally:
            # Save the updated host model.
            self.storage.save(host)

        self.logger.info('Finished bootstrapping for {}'.format(address))
        self.logger.debug('Finished bootstrapping for {}: {}'.format(
            address, host.to_json()))

        # XXX TEMPORARILY DISABLED
        # WATCHER_QUEUE.put_nowait((host, datetime.datetime.utcnow()))

        return host.to_json()
    finally:
        key.remove()
def watcher(queue, store_manager, run_once=False):
    """
    Attempts to connect and check hosts for status.

    Each queue item is a ``(host, last_check)`` tuple. A host whose last
    check is younger than the check interval is requeued untouched;
    otherwise it is checked for availability, its status is updated,
    it is saved through the store manager and requeued.

    :param queue: Queue to pull work from.
    :type queue: Queue.Queue
    :param store_manager: Proxy object for remote stores
    :type store_manager: commissaire.store.StoreHandlerManager
    :param run_once: If only one run should occur.
    :type run_once: bool
    """
    logger = logging.getLogger('watcher')
    logger.info('Watcher started')
    # TODO: should be configurable
    delta = datetime.timedelta(seconds=20)
    # TODO: should be configurable
    throttle = 60  # 1 minute

    # If the queue is empty attempt to populate it with known hosts
    if queue.qsize() == 0:
        logger.info('The WATCHER_QUEUE is empty. '
                    'Attempting to populate it from the store.')
        try:
            hosts = store_manager.list(Hosts(hosts=[]))
            for host in hosts.hosts:
                last_check = datetime.datetime.strptime(
                    host.last_check, "%Y-%m-%dT%H:%M:%S.%f")
                queue.put_nowait((host, last_check))
                logger.debug('Inserted {0} into WATCHER_QUEUE'.format(
                    host.address))
        except Exception as error:
            # Catch Exception rather than everything, so that
            # KeyboardInterrupt/SystemExit still stop the watcher, and
            # record the real cause for debugging.
            logger.info('No hosts found in the store.')
            logger.debug('Populating the queue failed with {0}: {1}'.format(
                type(error), error))

    while True:
        try:
            host, last_run = queue.get_nowait()
        except Empty:
            time.sleep(throttle)
            continue

        logger.debug('Retrieved {0} from queue. Last check was {1}'.format(
            host.address, last_run))
        now = datetime.datetime.utcnow()
        if last_run > now - delta:
            logger.debug('{0} not ready to check. {1}'.format(
                host.address, last_run))
            # Requeue the host with the same last_run
            queue.put_nowait((host, last_run))
        else:
            logger.info('Checking {0} for availability'.format(
                host.address))
            transport = ansibleapi.Transport(host.remote_user)
            # The key is only needed for the availability check itself.
            with TemporarySSHKey(host, logger) as key:
                results = transport.check_host_availability(host, key.path)

            host.last_check = now.isoformat()
            if results[0] == 0:  # This means the host is available
                # Only flip the bit on failed only
                if host.status == 'failed':
                    try:
                        cluster_type = util.cluster_for_host(
                            host.address, store_manager).type
                    except Exception:
                        logger.debug(
                            '{0} has no cluster type. Assuming {1}'.format(
                                host.address, C.CLUSTER_TYPE_HOST))
                        cluster_type = C.CLUSTER_TYPE_HOST

                    # A recovered host in a host-only cluster becomes
                    # disassociated; in any other cluster it is active.
                    if cluster_type == C.CLUSTER_TYPE_HOST:
                        host.status = 'disassociated'
                    else:
                        host.status = 'active'
            else:
                # If we can not access the host at all throw it to failed
                host.status = 'failed'

            host = store_manager.save(host)
            # Requeue the host
            queue.put_nowait((host, now))
            logger.debug('{0} has been requeued for next check run'.format(
                host.address))

        if run_once:
            logger.info('Exiting watcher due to run_once request.')
            break

        logger.debug('Sleeping for {0} seconds.'.format(throttle))
        time.sleep(throttle)

    logger.info('Watcher stopping')
def _execute(self, message, model_instance, command_args, finished_hosts_key): """ Remotely executes OS-specific shell commands across a cluster. :param message: A message instance :type message: kombu.message.Message :param model_instance: Initial model for the async operation :type model_instance: commissaire.models.Model :param command_args: Command name + arguments as a tuple :type command_args: tuple :param finished_hosts_key: Model attribute name for finished hosts :type finished_hosts_key: str """ # Split out the command name. command_name = command_args[0] command_args = command_args[1:] end_status = 'finished' # XXX We assume the model instance names a cluster. # Note, cluster_name is used in the except clause, # so it must be reliably defined. cluster_name = getattr(model_instance, 'name', None) try: assert cluster_name is not None model_json_data = model_instance.to_dict() # Set the initial status in the store. self.logger.info('Setting initial status.') self.logger.debug('Status={}'.format(model_json_data)) self.storage.save(model_instance) # Respond to the caller with the initial status. if message.properties.get('reply_to'): # XXX Have to dig up the message ID again. # CommissaireService.on_message() already # does this, but doesn't pass it to us. body = message.body if isinstance(body, bytes): body = json.loads(body.decode()) self.respond(message.properties['reply_to'], body.get('id', -1), model_json_data) except Exception as error: self.logger.error( 'Unable to save initial state for "{}" clusterexec due to ' '{}: {}'.format(cluster_name, type(error), error)) raise error # Collect all host addresses in the cluster. 
cluster = self.storage.get_cluster(cluster_name) n_hosts = len(cluster.hostset) if n_hosts: self.logger.debug('{} hosts in cluster "{}"'.format( n_hosts, cluster_name)) else: self.logger.warn('No hosts in cluster "{}"'.format(cluster_name)) for address in cluster.hostset: host = self.storage.get_host(address) oscmd = get_oscmd(host.os) # os_command is only used for logging os_command = getattr(oscmd, command_name)(*command_args) self.logger.info('Executing {} on {}...'.format( os_command, host.address)) model_instance.in_process.append(host.address) self.storage.save(model_instance) with TemporarySSHKey(host, self.logger) as key: try: transport = ansibleapi.Transport(host.remote_user) method = getattr(transport, command_name) method(host.address, key.path, oscmd, command_args) except Exception as error: # If there was a failure, set the end_status and break. end_status = C.HOST_STATUS_FAILED self.logger.error( 'Clusterexec {} for {} failed: {}: {}'.format( command_name, host.address, type(error), error)) break # Set the finished hosts. finished_hosts = getattr(model_instance, finished_hosts_key) finished_hosts.append(host.address) try: index = model_instance.in_process.index(host.address) model_instance.in_process.pop(index) except ValueError: self.logger.warn('Host {} was not in_process for {} {}'.format( host.address, command_name, cluster_name)) self.storage.save(model_instance) self.logger.info('Finished executing {} for {} in {}'.format( command_name, host.address, cluster_name)) # Final set of command result. model_instance.finished_at = formatted_dt() model_instance.status = end_status self.logger.info('Cluster {} final {} status: {}'.format( cluster_name, command_name, model_instance.to_json())) self.storage.save(model_instance)
def on_investigate(self, message, address, cluster_data=None):
    """
    Initiates an investigation of the requested host.

    Loads the host through a ``storage.get`` request, gathers facts from
    it, bootstraps it and finally asks storage whether the node is
    registered. The host model is saved after the fact-gathering and
    bootstrap stages, and the temporary SSH key is removed when the
    method exits, on success or failure.

    :param message: A message instance
    :type message: kombu.message.Message
    :param address: Host address to investigate
    :type address: str
    :param cluster_data: Optional data for the associated cluster
    :type cluster_data: dict
    :returns: JSON representation of the host after bootstrapping
    :rtype: str
    :raises: Exception -- any stage failure is logged and re-raised
    """
    # Statuses follow:
    # http://commissaire.readthedocs.org/en/latest/enums.html#host-statuses
    if cluster_data is None:
        # Fresh dict per call rather than a shared mutable default.
        cluster_data = {}

    self.logger.info('{0} is now in investigating.'.format(address))
    self.logger.debug('Investigating: {0}'.format(address))
    if cluster_data:
        self.logger.debug('Related cluster: {0}'.format(cluster_data))

    try:
        params = {
            'model_type_name': 'Host',
            'model_json_data': Host.new(address=address).to_json(),
            'secure': True
        }
        response = self.request('storage.get', params=params)
        host = Host.new(**response['result'])
    except Exception as error:
        self.logger.warn(
            'Unable to continue for {0} due to '
            '{1}: {2}. Returning...'.format(address, type(error), error))
        raise

    transport = ansibleapi.Transport(host.remote_user)

    key = TemporarySSHKey(host, self.logger)
    try:
        key.create()
    except Exception as error:
        self.logger.warn(
            'Unable to continue for {0} due to '
            '{1}: {2}. Returning...'.format(address, type(error), error))
        raise

    # The key now exists; the outer ``finally`` guarantees its removal even
    # when a step outside the per-stage handlers raises.
    try:
        try:
            facts = transport.get_info(address, key.path)
            # recreate the host instance with new data
            data = json.loads(host.to_json(secure=True))
            data.update(facts)
            host = Host.new(**data)
            host.last_check = datetime.datetime.utcnow().isoformat()
            host.status = 'bootstrapping'
            self.logger.info('Facts for {0} retrieved'.format(address))
            self.logger.debug('Data: {0}'.format(host.to_json()))
        except Exception as error:
            self.logger.warn('Getting info failed for {0}: {1}'.format(
                address, str(error)))
            host.status = 'failed'
            raise
        finally:
            # Save the updated host model.
            params = {
                'model_type_name': host.__class__.__name__,
                'model_json_data': host.to_json()
            }
            self.request('storage.save', params=params)

        self.logger.info(
            'Finished and stored investigation data for {0}'.format(address))
        self.logger.debug(
            'Finished investigation update for {0}: {1}'.format(
                address, host.to_json()))

        self.logger.info('{0} is now in bootstrapping'.format(address))
        oscmd = get_oscmd(host.os)
        try:
            etcd_config = self._get_etcd_config()
            cluster, network = self._get_cluster_and_network_models(
                cluster_data)

            self.logger.info(
                'Using cluster "{0}" of type "{1}"'.format(
                    cluster.name, cluster.type))
            self.logger.info(
                'Using network "{0}" of type "{1}"'.format(
                    network.name, network.type))

            transport.bootstrap(
                address, key.path, oscmd, etcd_config, cluster, network)
            host.status = 'inactive'
        except Exception as error:
            self.logger.warn(
                'Unable to start bootstraping for {0}: {1}'.format(
                    address, str(error)))
            host.status = 'disassociated'
            raise
        finally:
            # Save the updated host model.
            params = {
                'model_type_name': host.__class__.__name__,
                'model_json_data': host.to_json()
            }
            self.request('storage.save', params=params)

        # Verify association with relevant container managers
        params = {
            'cluster_type': cluster.type,
            'address': address
        }
        response = self.request('storage.node_registered', params=params)
        if response['result']:
            # NOTE(review): this status is only reflected in the returned
            # JSON; no storage.save follows it here -- confirm whether the
            # caller is expected to persist it.
            host.status = 'active'

        self.logger.info(
            'Finished bootstrapping for {0}'.format(address))
        self.logger.debug('Finished bootstrapping for {0}: {1}'.format(
            address, host.to_json()))

        # XXX TEMPORARILY DISABLED
        # WATCHER_QUEUE.put_nowait((host, datetime.datetime.utcnow()))

        return host.to_json()
    finally:
        key.remove()
def investigator(queue, config, run_once=False):
    """
    Investigates new hosts to retrieve and store facts.

    For every queue item the host is loaded from the store, its facts
    are gathered, it is bootstrapped and, for Kubernetes clusters, its
    registration with the container manager is verified. The host is
    saved after each stage.

    :param queue: Queue to pull work from.
    :type queue: Queue.Queue
    :param config: Configuration information.
    :type config: commissaire.config.Config
    :param run_once: If the loop should stop after one queue item.
    :type run_once: bool
    """
    logger = logging.getLogger('investigator')
    logger.info('Investigator started')

    while True:
        # Statuses follow:
        # http://commissaire.readthedocs.org/en/latest/enums.html#host-statuses
        store_manager, to_investigate, ssh_priv_key, remote_user = queue.get()
        address = to_investigate['address']
        logger.info('{0} is now in investigating.'.format(address))
        # The private key is deliberately left out of the log output.
        logger.debug(
            'Investigation details: data={0}, remote_user={1}'.format(
                to_investigate, remote_user))

        transport = ansibleapi.Transport(remote_user)

        # Reset per item so a failed lookup never touches the key object
        # of a previous iteration (or an unbound name on the first one).
        key = None
        try:
            host = store_manager.get(
                Host(
                    address=address,
                    status='',
                    os='',
                    cpus=0,
                    memory=0,
                    space=0,
                    last_check='',
                    ssh_priv_key='',
                    remote_user=''))
            key = TemporarySSHKey(host, logger)
            key.create()
        except Exception as error:
            logger.warn(
                'Unable to continue for {0} due to '
                '{1}: {2}. Returning...'.format(address, type(error), error))
            if key is not None:
                key.remove()
            if run_once:
                break
            continue

        try:
            result, facts = transport.get_info(address, key.path)
            # recreate the host instance with new data
            data = json.loads(host.to_json(secure=True))
            data.update(facts)
            host = Host(**data)
            host.last_check = datetime.datetime.utcnow().isoformat()
            host.status = 'bootstrapping'
            logger.info('Facts for {0} retrieved'.format(address))
            logger.debug('Data: {0}'.format(host.to_json()))
        except Exception as error:
            logger.warn('Getting info failed for {0}: {1}'.format(
                address, error))
            host.status = 'failed'
            store_manager.save(host)
            key.remove()
            if run_once:
                break
            continue

        store_manager.save(host)
        logger.info(
            'Finished and stored investigation data for {0}'.format(address))
        logger.debug('Finished investigation update for {0}: {1}'.format(
            address, host.to_json()))

        logger.info('{0} is now in bootstrapping'.format(address))
        oscmd = get_oscmd(host.os)
        try:
            result, facts = transport.bootstrap(
                address, key.path, config, oscmd, store_manager)
            host.status = 'inactive'
            store_manager.save(host)
        except Exception as error:
            logger.warn('Unable to start bootstraping for {0}: {1}'.format(
                address, error))
            host.status = 'disassociated'
            store_manager.save(host)
            key.remove()
            if run_once:
                break
            continue

        # Only the cluster type defaults here; the host status set by the
        # bootstrap stage must not be overwritten with a cluster type.
        cluster_type = C.CLUSTER_TYPE_HOST
        try:
            cluster = util.cluster_for_host(address, store_manager)
            cluster_type = cluster.type
        except KeyError:
            # Not part of a cluster
            pass

        # Verify association with the container manager
        if cluster_type == C.CLUSTER_TYPE_KUBERNETES:
            try:
                container_mgr = KubeContainerManager(config)
                # Try 3 times waiting 5 seconds each time before giving up
                attempts = 3
                for attempt in range(attempts):
                    if container_mgr.node_registered(address):
                        logger.info(
                            '{0} has been registered with the '
                            'container manager.'.format(address))
                        host.status = 'active'
                        break
                    if attempt == attempts - 1:
                        # Last attempt failed: give up instead of
                        # sleeping once more.
                        msg = 'Could not register with the container manager'
                        logger.warn(msg)
                        raise Exception(msg)
                    logger.debug(
                        '{0} has not been registered with the container '
                        ' manager. Checking again in 5 seconds...'.format(
                            address))
                    sleep(5)
            except Exception as error:
                logger.warn(
                    'Unable to finish bootstrap for {0} while associating '
                    'with the container manager: {1}'.format(
                        address, error))
                host.status = 'inactive'

        store_manager.save(host)
        logger.info(
            'Finished bootstrapping for {0}'.format(address))
        logger.debug('Finished bootstrapping for {0}: {1}'.format(
            address, host.to_json()))

        key.remove()
        if run_once:
            logger.info('Exiting due to run_once request.')
            break

    logger.info('Investigator stopping')
def clusterexec(store_manager, cluster_name, command, kwargs=None):
    """
    Remote executes a shell commands across a cluster.

    Supported commands are ``upgrade``, ``restart`` and ``deploy``. A
    status model is saved before the run, updated after every host and
    saved with the final status at the end. Processing stops at the
    first host whose command fails. Store errors are logged and end the
    run early; they are not raised to the caller.

    :param store_manager: Proxy object for remote stores
    :type store_manager: commissaire.store.StoreHandlerManager
    :param cluster_name: Name of the cluster to act on
    :type cluster_name: str
    :param command: Top-level command to execute
    :type command: str
    :param kwargs: Keyword arguments for the command
    :type kwargs: dict
    """
    logger = logging.getLogger('clusterexec')
    if kwargs is None:
        # Fresh dict per call rather than a shared mutable default.
        kwargs = {}

    # TODO: This is a hack and should really be done elsewhere
    command_args = ()
    if command == 'upgrade':
        finished_hosts_key = 'upgraded'
        model_instance = ClusterUpgrade.new(
            name=cluster_name,
            status='in_process',
            started_at=datetime.datetime.utcnow().isoformat(),
            upgraded=[],
            in_process=[],
        )
    elif command == 'restart':
        finished_hosts_key = 'restarted'
        model_instance = ClusterRestart.new(
            name=cluster_name,
            status='in_process',
            started_at=datetime.datetime.utcnow().isoformat(),
            restarted=[],
            in_process=[],
        )
    elif command == 'deploy':
        finished_hosts_key = 'deployed'
        version = kwargs.get('version', '')
        command_args = (version,)
        model_instance = ClusterDeploy.new(
            name=cluster_name,
            status='in_process',
            started_at=datetime.datetime.utcnow().isoformat(),
            version=version,
            deployed=[],
            in_process=[],
        )
    else:
        # Without this branch an unknown command surfaced only as an
        # unbound local reported as a failure to save the initial state.
        logger.error(
            'Unknown clusterexec command "{0}" for cluster "{1}"'.format(
                command, cluster_name))
        return

    end_status = 'finished'

    try:
        # Set the initial status in the store
        logger.info('Setting initial status.')
        logger.debug('Status={0}'.format(model_instance.to_json()))
        store_manager.save(model_instance)
    except Exception as error:
        logger.error(
            'Unable to save initial state for "{0}" clusterexec due to '
            '{1}: {2}'.format(cluster_name, type(error), error))
        return

    # Collect all host addresses in the cluster
    try:
        cluster = store_manager.get(Cluster.new(
            name=cluster_name, status='', hostset=[]))
    except Exception as error:
        logger.warn(
            'Unable to continue for cluster "{0}" due to '
            '{1}: {2}. Returning...'.format(cluster_name, type(error), error))
        return

    if cluster.hostset:
        logger.debug(
            '{0} hosts in cluster "{1}"'.format(
                len(cluster.hostset), cluster_name))
    else:
        logger.warn('No hosts in cluster "{0}"'.format(cluster_name))

    # TODO: Find better way to do this
    try:
        hosts = store_manager.list(Hosts(hosts=[]))
    except Exception as error:
        logger.warn(
            'No hosts in the cluster. Error: {0}. Exiting clusterexec'.format(
                error))
        return

    for host in hosts.hosts:
        if host.address not in cluster.hostset:
            logger.debug(
                'Skipping {0} as it is not in this cluster.'.format(
                    host.address))
            continue  # Move on to the next one

        oscmd = get_oscmd(host.os)

        # command_list is only used for logging
        command_list = getattr(oscmd, command)(*command_args)
        logger.info('Executing {0} on {1}...'.format(
            command_list, host.address))

        model_instance.in_process.append(host.address)
        try:
            store_manager.save(model_instance)
        except Exception as error:
            logger.error(
                'Unable to save in_process state for "{0}" clusterexec due to '
                '{1}: {2}'.format(cluster_name, type(error), error))
            return

        key = TemporarySSHKey(host, logger)
        key.create()

        try:
            transport = ansibleapi.Transport(host.remote_user)
            exe = getattr(transport, command)
            # The two-value unpacking is kept on purpose: a result of any
            # other shape is treated as a failure by the handler below.
            result, facts = exe(
                host.address, key.path, oscmd, kwargs)
        # XXX: ansibleapi explicitly raises Exception()
        except Exception as ex:
            # If there was a failure set the end_status and break out
            end_status = 'failed'
            logger.error('Clusterexec {0} for {1} failed: {2}: {3}'.format(
                command, host.address, type(ex), ex))
            break
        finally:
            try:
                key.remove()
                logger.debug('Removed temporary key file {0}'.format(key.path))
            except Exception:
                logger.warn(
                    'Unable to remove the temporary key file: {0}'.format(
                        key.path))

        # Set the finished hosts
        new_finished_hosts = getattr(
            model_instance, finished_hosts_key) + [host.address]
        setattr(
            model_instance,
            finished_hosts_key,
            new_finished_hosts)
        try:
            idx = model_instance.in_process.index(host.address)
            model_instance.in_process.pop(idx)
        except ValueError:
            # ``host`` is a model object, so read the attribute; indexing
            # it like a dict would raise inside this handler.
            logger.warn('Host {0} was not in_process for {1} {2}'.format(
                host.address, command, cluster_name))
        try:
            store_manager.save(model_instance)
            logger.info('Finished executing {0} for {1} in {2}'.format(
                command, host.address, cluster_name))
        except Exception as error:
            logger.error(
                'Unable to save cluster state for "{0}" clusterexec due to '
                '{1}: {2}'.format(cluster_name, type(error), error))
            return

    # Final set of command result
    model_instance.finished_at = datetime.datetime.utcnow().isoformat()
    model_instance.status = end_status

    logger.info('Cluster {0} final {1} status: {2}'.format(
        cluster_name, command, model_instance.to_json()))

    try:
        store_manager.save(model_instance)
    except Exception as error:
        logger.error(
            'Unable to save final state for "{0}" clusterexec due to '
            '{1}: {2}'.format(cluster_name, type(error), error))

    logger.info('Clusterexec stopping')