def test_get_oscmd_with_valid_os_types(self):
    """
    Verify get_oscmd returns proper modules.
    """
    for os_type in ('atomic', 'fedora', 'rhel'):
        self.assertEquals(
            'commissaire.oscmd.{0}'.format(os_type),
            oscmd.get_oscmd(os_type).__module__)
def test_get_oscmd_with_valid_os_types(self):
    """
    Verify get_oscmd returns proper modules.
    """
    for os_type in available_os_types:
        self.assertEquals(
            'commissaire.oscmd.{0}'.format(os_type),
            oscmd.get_oscmd(os_type).__module__)
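# For context, a minimal sketch of a get_oscmd that would satisfy the tests
# above, assuming it resolves os_type to a submodule of commissaire.oscmd and
# returns a class defined there. This is a hypothetical illustration, not the
# project's actual code; the tests only require that the returned object's
# __module__ matches 'commissaire.oscmd.<os_type>'.
import importlib


def get_oscmd(os_type):
    try:
        # Assumes each supported os_type has a commissaire.oscmd submodule
        # exposing an OSCmd class.
        module = importlib.import_module(
            'commissaire.oscmd.{0}'.format(os_type))
        return module.OSCmd
    except ImportError:
        raise Exception('No OSCmd implementation for {0}'.format(os_type))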
def test_bootstrap(self):
    """
    Verify Transport().bootstrap works as expected.
    """
    with patch(
            'commissaire.transport.ansibleapi.TaskQueueManager') as _tqm:
        _tqm().run.return_value = 0
        transport = ansibleapi.Transport()
        transport.variable_manager._fact_cache = {}
        oscmd = MagicMock(OSCmdBase)
        config = Config(
            etcd={
                'uri': urlparse('http://127.0.0.1:2379'),
            },
            kubernetes={
                'uri': urlparse('http://127.0.0.1:8080'),
                'token': 'token',
            })
        result, facts = transport.bootstrap(
            '10.2.0.2', 'test/fake_key', config, oscmd)

        # We should have a successful response
        self.assertEquals(0, result)

        # We should see expected calls
        self.assertEquals(1, oscmd.install_docker.call_count)
        self.assertEquals(1, oscmd.install_kube.call_count)

        # Check 'commissaire_enable_pkg_repos' playbook variable
        # for various operating systems.
        transport = ansibleapi.Transport()
        transport._run = MagicMock()
        transport._run.return_value = (0, {})
        needs_enable_repos = ('redhat', 'rhel')
        for os_type in available_os_types:
            oscmd = get_oscmd(os_type)
            result, facts = transport.bootstrap(
                '10.2.0.2', 'test/fake_key', config, oscmd)
            play_vars = transport._run.call_args[0][4]
            command = play_vars['commissaire_enable_pkg_repos']
            if os_type in needs_enable_repos:
                self.assertIn('subscription-manager repos', command)
            else:
                self.assertEqual('true', command)  # no-op command
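# The loop above recovers the playbook variables from the mocked _run via
# Mock.call_args: call_args is an (args, kwargs) pair for the most recent
# call, so call_args[0][4] is the fifth positional argument, which these
# tests treat as the play-variables dict. A standalone illustration; the
# positional layout is assumed from the tests, not from ansibleapi itself:
from mock import MagicMock

_run = MagicMock(return_value=(0, {}))
_run('10.2.0.2', 'test/fake_key', 'playbook.yaml', None,
     {'commissaire_enable_pkg_repos': 'true'})

args, kwargs = _run.call_args   # (positional args, keyword args)
assert args[4]['commissaire_enable_pkg_repos'] == 'true'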
def investigator(queue, config, run_once=False):
    """
    Investigates new hosts to retrieve and store facts.

    :param queue: Queue to pull work from.
    :type queue: Queue.Queue
    :param config: Configuration information.
    :type config: commissaire.config.Config
    """
    logger = logging.getLogger('investigator')
    logger.info('Investigator started')

    while True:
        # Statuses follow:
        # http://commissaire.readthedocs.org/en/latest/enums.html#host-statuses
        transport = ansibleapi.Transport()
        to_investigate, ssh_priv_key = queue.get()
        address = to_investigate['address']
        logger.info('{0} is now in investigating.'.format(address))
        logger.debug('Investigation details: key={0}, data={1}'.format(
            to_investigate, ssh_priv_key))

        f = tempfile.NamedTemporaryFile(prefix='key', delete=False)
        key_file = f.name
        logger.debug('Using {0} as the temporary key location for {1}'.format(
            key_file, address))
        f.write(base64.decodestring(ssh_priv_key))
        logger.info('Wrote key for {0}'.format(address))
        f.close()

        key = '/commissaire/hosts/{0}'.format(address)
        etcd_resp, error = cherrypy.engine.publish('store-get', key)[0]

        if error:
            logger.warn('Unable to continue for {0} due to '
                        '{1}: {2}. Returning...'.format(
                            address, type(error), error))
            clean_up_key(key_file)
            continue

        data = json.loads(etcd_resp.value)

        try:
            result, facts = transport.get_info(address, key_file)
            data.update(facts)
            data['last_check'] = datetime.datetime.utcnow().isoformat()
            data['status'] = 'bootstrapping'
            logger.info('Facts for {0} retrieved'.format(address))
            logger.debug('Data: {0}'.format(data))
        except:
            exc_type, exc_msg, tb = sys.exc_info()
            logger.warn('Getting info failed for {0}: {1}'.format(
                address, exc_msg))
            data['status'] = 'failed'
            cherrypy.engine.publish('store-save', key, json.dumps(data))[0]
            clean_up_key(key_file)
            if run_once:
                break
            continue

        cherrypy.engine.publish('store-save', key, json.dumps(data))[0]
        logger.info(
            'Finished and stored investigation data for {0}'.format(address))
        logger.debug('Finished investigation update for {0}: {1}'.format(
            address, data))

        logger.info('{0} is now in bootstrapping'.format(address))
        oscmd = get_oscmd(data['os'])
        try:
            result, facts = transport.bootstrap(
                address, key_file, config, oscmd)
            data['status'] = 'inactive'
            cherrypy.engine.publish('store-save', key, json.dumps(data))[0]
        except:
            exc_type, exc_msg, tb = sys.exc_info()
            logger.warn('Unable to start bootstrapping for {0}: {1}'.format(
                address, exc_msg))
            data['status'] = 'disassociated'
            cherrypy.engine.publish('store-save', key, json.dumps(data))[0]
            clean_up_key(key_file)
            if run_once:
                break
            continue

        # Verify association with the container manager
        try:
            container_mgr = KubeContainerManager(config)
            # Try 3 times, waiting 5 seconds each time, before giving up
            for cnt in range(0, 3):
                if container_mgr.node_registered(address):
                    logger.info(
                        '{0} has been registered with the '
                        'container manager.'.format(address))
                    data['status'] = 'active'
                    break
                logger.debug(
                    '{0} has not been registered with the container manager. '
                    'Checking again in 5 seconds...'.format(address))
                sleep(5)
            else:
                # All attempts failed
                msg = 'Could not register with the container manager'
                logger.warn(msg)
                raise Exception(msg)
        except:
            _, exc_msg, _ = sys.exc_info()
            logger.warn(
                'Unable to finish bootstrap for {0} while associating with '
                'the container manager: {1}'.format(address, exc_msg))
            data['status'] = 'inactive'

        cherrypy.engine.publish('store-save', key, json.dumps(data))[0]
        logger.info('Finished bootstrapping for {0}'.format(address))
        logger.debug('Finished bootstrapping for {0}: {1}'.format(
            address, data))

        clean_up_key(key_file)
        if run_once:
            logger.info('Exiting due to run_once request.')
            break

    logger.info('Investigator stopping')
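# clean_up_key is called throughout but never defined in this section. A
# minimal sketch under the assumption that it only unlinks the temporary
# key file and logs any failure (hypothetical helper, inferred from usage):
import logging
import os


def clean_up_key(key_file):
    logger = logging.getLogger('investigator')
    try:
        os.unlink(key_file)
        logger.debug('Removed temporary key file {0}'.format(key_file))
    except OSError as error:
        logger.warn('Unable to remove temporary key file {0}: {1}'.format(
            key_file, error))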
def test_bootstrap(self):
    """
    Verify Transport().bootstrap works as expected.
    """
    with patch(
            'commissaire.transport.ansibleapi.TaskQueueManager') as _tqm:
        _tqm().run.return_value = 0
        transport = ansibleapi.Transport()
        transport.variable_manager._fact_cache = {}
        oscmd = MagicMock(OSCmdBase)
        result, facts = transport.bootstrap(
            '10.2.0.2', Cluster.new().__dict__, 'test/fake_key',
            MagicMock(), oscmd)

        # We should have a successful response
        self.assertEquals(0, result)

        # We should see expected calls
        self.assertEquals(1, oscmd.install_docker.call_count)
        self.assertEquals(1, oscmd.install_kube.call_count)

        # Check user-config to playbook-variable translation.
        etcd_config = {
            'server_url': 'https://192.168.1.1:1234',
            'certificate_ca_path': '/path/to/etcd/ca/cert',
            'certificate_path': '/path/to/etcd/client/cert',
            'certificate_key_path': '/path/to/etcd/client/key'
        }
        kube_config = {
            'server_url': 'https://192.168.2.2:4567',
            'certificate_path': '/path/to/kube/client/cert',
            'certificate_key_path': '/path/to/kube/client/key'
        }
        store_manager = MagicMock(StoreHandlerManager)
        store_manager.list_store_handlers.return_value = [
            (EtcdStoreHandler, etcd_config, ()),
            (KubernetesStoreHandler, kube_config, ())
        ]
        store_manager.get.return_value = Network.new(
            name='default', type='flannel_etcd')
        cluster_data = Cluster.new(
            name='default', network='default').__dict__
        transport = ansibleapi.Transport()
        transport._run = MagicMock()
        transport._run.return_value = (0, {})
        result, facts = transport.bootstrap(
            '10.2.0.2', cluster_data, 'test/fake_key',
            store_manager, oscmd)
        play_vars = transport._run.call_args[0][4]
        self.assertEqual(
            play_vars['commissaire_etcd_server_url'],
            'https://192.168.1.1:1234')
        self.assertEqual(
            play_vars['commissaire_etcd_ca_path_local'],
            '/path/to/etcd/ca/cert')
        self.assertEqual(
            play_vars['commissaire_etcd_client_cert_path_local'],
            '/path/to/etcd/client/cert')
        self.assertEqual(
            play_vars['commissaire_etcd_client_key_path_local'],
            '/path/to/etcd/client/key')

        # Check 'commissaire_enable_pkg_repos' playbook variable
        # for various operating systems.
        transport = ansibleapi.Transport()
        transport._run = MagicMock()
        transport._run.return_value = (0, {})
        needs_enable_repos = ('redhat', 'rhel')
        for os_type in available_os_types:
            oscmd = get_oscmd(os_type)
            result, facts = transport.bootstrap(
                '10.2.0.2', cluster_data, 'test/fake_key',
                MagicMock(), oscmd)
            play_vars = transport._run.call_args[0][4]
            command = play_vars['commissaire_enable_pkg_repos']
            if os_type in needs_enable_repos:
                self.assertIn('subscription-manager repos', command)
            else:
                self.assertEqual('true', command)  # no-op command
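# The middle block of the test checks user-config to playbook-variable
# translation. A sketch of the etcd side of that mapping, with variable
# names taken directly from the assertions above; the real translation
# logic inside ansibleapi is not shown in this section and may differ:
def etcd_play_vars(etcd_config):
    return {
        'commissaire_etcd_server_url':
            etcd_config['server_url'],
        'commissaire_etcd_ca_path_local':
            etcd_config['certificate_ca_path'],
        'commissaire_etcd_client_cert_path_local':
            etcd_config['certificate_path'],
        'commissaire_etcd_client_key_path_local':
            etcd_config['certificate_key_path'],
    }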
def investigator(queue, config, store_kwargs={}, run_once=False):
    """
    Investigates new hosts to retrieve and store facts.

    :param queue: Queue to pull work from.
    :type queue: Queue.Queue
    :param config: Configuration information.
    :type config: commissaire.config.Config
    :param store_kwargs: Keyword arguments used to make the etcd client.
    :type store_kwargs: dict
    """
    logger = logging.getLogger('investigator')
    logger.info('Investigator started')

    store = etcd.Client(**store_kwargs)

    while True:
        # Statuses follow:
        # http://commissaire.readthedocs.org/en/latest/enums.html#host-statuses
        to_investigate, ssh_priv_key, remote_user = queue.get()
        address = to_investigate['address']
        logger.info('{0} is now in investigating.'.format(address))
        logger.debug(
            'Investigation details: key={0}, data={1}, '
            'remote_user={2}'.format(
                to_investigate, ssh_priv_key, remote_user))

        transport = ansibleapi.Transport(remote_user)

        f = tempfile.NamedTemporaryFile(prefix='key', delete=False)
        key_file = f.name
        logger.debug(
            'Using {0} as the temporary key location for {1}'.format(
                key_file, address))
        f.write(base64.decodestring(ssh_priv_key))
        logger.info('Wrote key for {0}'.format(address))
        f.close()

        try:
            key = '/commissaire/hosts/{0}'.format(address)
            etcd_resp = store.get(key)
        except Exception as error:
            logger.warn(
                'Unable to continue for {0} due to '
                '{1}: {2}. Returning...'.format(address, type(error), error))
            clean_up_key(key_file)
            continue

        data = json.loads(etcd_resp.value)

        try:
            result, facts = transport.get_info(address, key_file)
            data.update(facts)
            data['last_check'] = datetime.datetime.utcnow().isoformat()
            data['status'] = 'bootstrapping'
            logger.info('Facts for {0} retrieved'.format(address))
            logger.debug('Data: {0}'.format(data))
        except:
            exc_type, exc_msg, tb = sys.exc_info()
            logger.warn('Getting info failed for {0}: {1}'.format(
                address, exc_msg))
            data['status'] = 'failed'
            store.write(key, json.dumps(data))
            clean_up_key(key_file)
            if run_once:
                break
            continue

        store.write(key, json.dumps(data))
        logger.info(
            'Finished and stored investigation data for {0}'.format(address))
        logger.debug('Finished investigation update for {0}: {1}'.format(
            address, data))

        logger.info('{0} is now in bootstrapping'.format(address))
        oscmd = get_oscmd(data['os'])
        try:
            result, facts = transport.bootstrap(
                address, key_file, config, oscmd)
            data['status'] = 'inactive'
            store.write(key, json.dumps(data))
        except:
            exc_type, exc_msg, tb = sys.exc_info()
            logger.warn('Unable to start bootstrapping for {0}: {1}'.format(
                address, exc_msg))
            data['status'] = 'disassociated'
            store.write(key, json.dumps(data))
            clean_up_key(key_file)
            if run_once:
                break
            continue

        # Verify association with the container manager
        try:
            container_mgr = KubeContainerManager(config)
            # Try 3 times, waiting 5 seconds each time, before giving up
            for cnt in range(0, 3):
                if container_mgr.node_registered(address):
                    logger.info(
                        '{0} has been registered with the '
                        'container manager.'.format(address))
                    data['status'] = 'active'
                    break
                logger.debug(
                    '{0} has not been registered with the container manager. '
                    'Checking again in 5 seconds...'.format(address))
                sleep(5)
            else:
                # All attempts failed
                msg = 'Could not register with the container manager'
                logger.warn(msg)
                raise Exception(msg)
        except:
            _, exc_msg, _ = sys.exc_info()
            logger.warn(
                'Unable to finish bootstrap for {0} while associating with '
                'the container manager: {1}'.format(address, exc_msg))
            data['status'] = 'inactive'

        store.write(key, json.dumps(data))
        logger.info('Finished bootstrapping for {0}'.format(address))
        logger.debug('Finished bootstrapping for {0}: {1}'.format(
            address, data))

        clean_up_key(key_file)
        if run_once:
            logger.info('Exiting due to run_once request.')
            break

    logger.info('Investigator stopping')
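# For reference, this version of investigator consumes three-tuples from
# the queue. A hypothetical producer side (the actual enqueuing code is not
# part of this section); note the key is base64-encoded, matching the
# base64.decodestring() call above:
import base64
try:
    import Queue as queue_mod  # Python 2
except ImportError:
    import queue as queue_mod  # Python 3

work_queue = queue_mod.Queue()
work_queue.put((
    {'address': '10.2.0.2'},               # host record to investigate
    base64.encodestring('...ssh key...'),  # base64-encoded private key
    'root',                                # remote_user for the Transport
))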
def clusterexec(cluster_name, command):
    """
    Remotely executes a shell command across a cluster.

    :param cluster_name: Name of the cluster to act on.
    :type cluster_name: str
    :param command: Top-level command to execute.
    :type command: str
    """
    logger = logging.getLogger('clusterexec')

    # TODO: This is a hack and should really be done elsewhere
    if command == 'upgrade':
        finished_hosts_key = 'upgraded'
        cluster_status = {
            "status": 'in_process',
            "upgrade_to": 'latest',
            "upgraded": [],
            "in_process": [],
            "started_at": datetime.datetime.utcnow().isoformat(),
            "finished_at": None,
        }
    elif command == 'restart':
        finished_hosts_key = 'restarted'
        cluster_status = {
            "status": 'in_process',
            "restarted": [],
            "in_process": [],
            "started_at": datetime.datetime.utcnow().isoformat(),
            "finished_at": None
        }

    end_status = 'finished'

    # Set the initial status in the store
    logger.info('Setting initial status.')
    logger.debug('Status={0}'.format(cluster_status))
    cherrypy.engine.publish(
        'store-save',
        '/commissaire/cluster/{0}/{1}'.format(cluster_name, command),
        json.dumps(cluster_status))

    # Collect all host addresses in the cluster
    etcd_resp, error = cherrypy.engine.publish(
        'store-get', '/commissaire/clusters/{0}'.format(cluster_name))[0]

    if error:
        logger.warn(
            'Unable to continue for {0} due to '
            '{1}: {2}. Returning...'.format(cluster_name, type(error), error))
        return

    cluster_hosts = set(json.loads(etcd_resp.value).get('hostset', []))
    if cluster_hosts:
        logger.debug(
            '{0} hosts in cluster {1}'.format(
                len(cluster_hosts), cluster_name))
    else:
        logger.warn('No hosts in cluster {0}'.format(cluster_name))

    # TODO: Find better way to do this
    a_hosts, error = cherrypy.engine.publish(
        'store-get', '/commissaire/hosts')[0]
    if error:
        logger.warn(
            'No hosts in the cluster. Error: {0}. Exiting clusterexec'.format(
                error))
        return

    for a_host_dict in a_hosts._children:
        a_host = json.loads(a_host_dict['value'])
        if a_host['address'] not in cluster_hosts:
            logger.debug(
                'Skipping {0} as it is not in this cluster.'.format(
                    a_host['address']))
            continue  # Move on to the next one

        oscmd = get_oscmd(a_host['os'])

        command_list = getattr(oscmd, command)()  # Only used for logging
        logger.info('Executing {0} on {1}...'.format(
            command_list, a_host['address']))

        cluster_status['in_process'].append(a_host['address'])
        cherrypy.engine.publish(
            'store-save',
            '/commissaire/cluster/{0}/{1}'.format(cluster_name, command),
            json.dumps(cluster_status))

        # TODO: This is reused, make it reusable
        f = tempfile.NamedTemporaryFile(prefix='key', delete=False)
        key_file = f.name
        logger.debug(
            'Using {0} as the temporary key location for {1}'.format(
                key_file, a_host['address']))
        f.write(base64.decodestring(a_host['ssh_priv_key']))
        logger.debug('Wrote key for {0}'.format(a_host['address']))
        f.close()

        try:
            transport = ansibleapi.Transport()
            exe = getattr(transport, command)
            result, facts = exe(
                a_host['address'], key_file, oscmd)
        # XXX: ansibleapi explicitly raises Exception()
        except Exception:
            # If there was a failure set the end_status and break out
            end_status = 'failed'
            break
        finally:
            try:
                f.unlink(key_file)
                logger.debug('Removed temporary key file {0}'.format(
                    key_file))
            except:
                logger.warn(
                    'Unable to remove the temporary key file: {0}'.format(
                        key_file))

        cluster_status[finished_hosts_key].append(a_host['address'])
        try:
            idx = cluster_status['in_process'].index(a_host['address'])
            cluster_status['in_process'].pop(idx)
        except ValueError:
            logger.warn('Host {0} was not in_process for {1} {2}'.format(
                a_host['address'], command, cluster_name))
        cherrypy.engine.publish(
            'store-save',
            '/commissaire/cluster/{0}/{1}'.format(cluster_name, command),
            json.dumps(cluster_status))
        logger.info('Finished executing {0} for {1} in {2}'.format(
            command, a_host['address'], cluster_name))

    # Final set of command result
    cluster_status['finished_at'] = datetime.datetime.utcnow().isoformat()
    cluster_status['status'] = end_status
    logger.debug('Cluster {0} final {1} status: {2}'.format(
        cluster_name, command, cluster_status))
    cherrypy.engine.publish(
        'store-save',
        '/commissaire/cluster/{0}/{1}'.format(cluster_name, command),
        json.dumps(cluster_status))

    logger.info('Clusterexec stopping')
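# This version reaches the store through the CherryPy bus:
# cherrypy.engine.publish('store-get', key)[0] returns the first
# subscriber's (response, error) tuple. A minimal sketch of a plugin
# satisfying that contract; the shape is assumed from the call sites
# above, not taken from the project's actual store plugin:
from cherrypy.process import plugins


class StorePlugin(plugins.SimplePlugin):

    def __init__(self, bus, client):
        plugins.SimplePlugin.__init__(self, bus)
        self.client = client  # e.g. an etcd.Client

    def start(self):
        self.bus.subscribe('store-get', self.store_get)
        self.bus.subscribe('store-save', self.store_save)

    def stop(self):
        self.bus.unsubscribe('store-get', self.store_get)
        self.bus.unsubscribe('store-save', self.store_save)

    def store_get(self, key):
        # Callers expect a (response, error) tuple.
        try:
            return self.client.get(key), None
        except Exception as error:
            return None, error

    def store_save(self, key, json_value):
        try:
            return self.client.write(key, json_value), None
        except Exception as error:
            return None, error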
def investigator(queue, config, store, run_once=False):
    """
    Investigates new hosts to retrieve and store facts.

    :param queue: Queue to pull work from.
    :type queue: gevent.queue.Queue
    :param config: Configuration information.
    :type config: commissaire.config.Config
    :param store: Data store to place results.
    :type store: etcd.Client
    """
    # TODO: Change this to watch an etcd "queue" and kick off a function
    # similar to clusterpoolexec
    logger = logging.getLogger("investigator")
    logger.info("Investigator started")

    transport = ansibleapi.Transport()

    while True:
        # Statuses follow:
        # http://commissaire.readthedocs.org/en/latest/enums.html#host-statuses
        to_investigate, ssh_priv_key = queue.get()
        address = to_investigate["address"]
        logger.info("{0} is now in investigating.".format(address))
        logger.debug("Investigation details: key={0}, data={1}".format(
            to_investigate, ssh_priv_key))

        f = tempfile.NamedTemporaryFile(prefix="key", delete=False)
        key_file = f.name
        logger.debug("Using {0} as the temporary key location for {1}".format(
            key_file, address))
        f.write(base64.decodestring(ssh_priv_key))
        logger.debug("Wrote key for {0}".format(address))
        f.close()

        key = "/commissaire/hosts/{0}".format(address)
        data = json.loads(store.get(key).value)

        try:
            result, facts = transport.get_info(address, key_file)
            data.update(facts)
            data["last_check"] = datetime.datetime.utcnow().isoformat()
            data["status"] = "bootstrapping"
            logger.info("Facts for {0} retrieved".format(address))
        except:
            logger.warn("Getting info failed for {0}".format(address))
            data["status"] = "failed"
            store.set(key, json.dumps(data))
            exc_type, exc_msg, tb = sys.exc_info()
            logger.debug("{0} Exception: {1}".format(address, exc_msg))
            clean_up_key(key_file)
            if run_once:
                break
            continue

        store.set(key, json.dumps(data))
        logger.info("Finished and stored investigation data for {0}".format(
            address))
        logger.debug("Finished investigation update for {0}: {1}".format(
            address, data))

        # --
        logger.info("{0} is now in bootstrapping".format(address))
        oscmd = get_oscmd(data["os"])()
        try:
            result, facts = transport.bootstrap(
                address, key_file, config, oscmd)
            data["status"] = "inactive"
            store.set(key, json.dumps(data))
        except:
            logger.warn("Unable to bootstrap {0}".format(address))
            exc_type, exc_msg, tb = sys.exc_info()
            logger.debug("{0} Exception: {1}".format(address, exc_msg))
            data["status"] = "disassociated"
            store.set(key, json.dumps(data))
            clean_up_key(key_file)
            if run_once:
                break
            continue

        # Verify association with the container manager
        try:
            container_mgr = KubeContainerManager(config)
            # Try 3 times, waiting 5 seconds each time, before giving up
            for cnt in range(0, 3):
                if container_mgr.node_registered(address):
                    logger.info(
                        "{0} has been registered with the "
                        "container manager.".format(address))
                    data["status"] = "active"
                    break
                logger.debug(
                    "{0} has not been registered with the container manager. "
                    "Checking again in 5 seconds...".format(address))
                gevent.sleep(5)
            else:
                # All attempts failed
                raise Exception(
                    "Could not register with the container manager")
        except:
            logger.warn("Unable to bootstrap {0}".format(address))
            exc = sys.exc_info()[0]
            logger.debug("{0} Exception: {1}".format(address, exc))
            data["status"] = "inactive"

        store.set(key, json.dumps(data))
        logger.info("Finished bootstrapping for {0}".format(address))
        logger.debug("Finished bootstrapping for {0}: {1}".format(
            address, data))

        clean_up_key(key_file)
        if run_once:
            logger.info("Exiting due to run_once request.")
            break

    logger.info("Investigator stopping")
def clusterexec(cluster_name, command, store):
    """
    Remotely executes a shell command across a cluster.

    :param cluster_name: Name of the cluster to act on.
    :type cluster_name: str
    :param command: Top-level command to execute.
    :type command: str
    :param store: Data store to place results.
    :type store: etcd.Client
    """
    logger = logging.getLogger('clusterexec')

    # TODO: This is a hack and should really be done elsewhere
    if command == 'upgrade':
        finished_hosts_key = 'upgraded'
        cluster_status = {
            "status": 'in_process',
            "upgrade_to": 'latest',
            "upgraded": [],
            "in_process": [],
            "started_at": datetime.datetime.utcnow().isoformat(),
            "finished_at": None,
        }
    elif command == 'restart':
        finished_hosts_key = 'restarted'
        cluster_status = {
            "status": 'in_process',
            "restarted": [],
            "in_process": [],
            "started_at": datetime.datetime.utcnow().isoformat(),
            "finished_at": None
        }

    end_status = 'finished'

    # Set the initial status in the store
    logger.info('Setting initial status.')
    logger.debug('Status={0}'.format(cluster_status))
    store.set(
        '/commissaire/cluster/{0}/{1}'.format(cluster_name, command),
        json.dumps(cluster_status))

    # Collect all host addresses in the cluster
    etcd_resp = store.get('/commissaire/clusters/{0}'.format(cluster_name))
    cluster_hosts = set(json.loads(etcd_resp.value).get('hostset', []))
    if cluster_hosts:
        logger.debug('{0} hosts in cluster {1}'.format(
            len(cluster_hosts), cluster_name))
    else:
        logger.warn('No hosts in cluster {0}'.format(cluster_name))

    # TODO: Find better way to do this
    for a_host_dict in store.get('/commissaire/hosts')._children:
        a_host = json.loads(a_host_dict['value'])
        if a_host['address'] not in cluster_hosts:
            logger.debug('Skipping {0} as it is not in this cluster.'.format(
                a_host['address']))
            continue  # Move on to the next one

        oscmd = get_oscmd(a_host['os'])

        command_list = getattr(oscmd(), command)()  # Only used for logging
        logger.info('Executing {0} on {1}...'.format(
            command_list, a_host['address']))

        cluster_status['in_process'].append(a_host['address'])
        store.set(
            '/commissaire/cluster/{0}/{1}'.format(cluster_name, command),
            json.dumps(cluster_status))

        # TODO: This is reused, make it reusable
        f = tempfile.NamedTemporaryFile(prefix='key', delete=False)
        key_file = f.name
        logger.debug('Using {0} as the temporary key location for {1}'.format(
            key_file, a_host['address']))
        f.write(base64.decodestring(a_host['ssh_priv_key']))
        logger.debug('Wrote key for {0}'.format(a_host['address']))
        f.close()

        transport = ansibleapi.Transport()
        result, facts = getattr(transport, command)(
            a_host['address'], key_file, oscmd())

        try:
            f.unlink(key_file)
            logger.debug('Removed temporary key file {0}'.format(key_file))
        except:
            logger.warn('Unable to remove the temporary key file: {0}'.format(
                key_file))

        # If there was a failure set the end_status and break out
        if result != 0:
            end_status = 'failed'
            break

        cluster_status[finished_hosts_key].append(a_host['address'])
        try:
            idx = cluster_status['in_process'].index(a_host['address'])
            cluster_status['in_process'].pop(idx)
        except ValueError:
            logger.warn('Host {0} was not in_process for {1} {2}'.format(
                a_host['address'], command, cluster_name))
        store.set(
            '/commissaire/cluster/{0}/{1}'.format(cluster_name, command),
            json.dumps(cluster_status))
        logger.info('Finished executing {0} for {1} in {2}'.format(
            command, a_host['address'], cluster_name))

    # Final set of command result
    cluster_status['finished_at'] = datetime.datetime.utcnow().isoformat()
    cluster_status['status'] = end_status
    store.set(
        '/commissaire/cluster/{0}/{1}'.format(cluster_name, command),
        json.dumps(cluster_status))

    logger.info('Clusterexec stopping')
def investigator(queue, config, run_once=False):
    """
    Investigates new hosts to retrieve and store facts.

    :param queue: Queue to pull work from.
    :type queue: Queue.Queue
    :param config: Configuration information.
    :type config: commissaire.config.Config
    """
    logger = logging.getLogger('investigator')
    logger.info('Investigator started')

    while True:
        # Statuses follow:
        # http://commissaire.readthedocs.org/en/latest/enums.html#host-statuses
        store_manager, to_investigate, ssh_priv_key, remote_user = queue.get()
        address = to_investigate['address']
        logger.info('{0} is now in investigating.'.format(address))
        logger.debug(
            'Investigation details: key={0}, data={1}, '
            'remote_user={2}'.format(
                to_investigate, ssh_priv_key, remote_user))

        transport = ansibleapi.Transport(remote_user)

        key = None
        try:
            host = store_manager.get(
                Host(
                    address=address,
                    status='',
                    os='',
                    cpus=0,
                    memory=0,
                    space=0,
                    last_check='',
                    ssh_priv_key='',
                    remote_user=''))
            key = TemporarySSHKey(host, logger)
            key.create()
        except Exception as error:
            logger.warn(
                'Unable to continue for {0} due to '
                '{1}: {2}. Returning...'.format(address, type(error), error))
            # key is still None if store_manager.get() raised above.
            if key:
                key.remove()
            continue

        try:
            result, facts = transport.get_info(address, key.path)
            # recreate the host instance with new data
            data = json.loads(host.to_json(secure=True))
            data.update(facts)
            host = Host(**data)
            host.last_check = datetime.datetime.utcnow().isoformat()
            host.status = 'bootstrapping'
            logger.info('Facts for {0} retrieved'.format(address))
            logger.debug('Data: {0}'.format(host.to_json()))
        except:
            exc_type, exc_msg, tb = sys.exc_info()
            logger.warn('Getting info failed for {0}: {1}'.format(
                address, exc_msg))
            host.status = 'failed'
            store_manager.save(host)
            key.remove()
            if run_once:
                break
            continue

        store_manager.save(host)
        logger.info(
            'Finished and stored investigation data for {0}'.format(address))
        logger.debug('Finished investigation update for {0}: {1}'.format(
            address, host.to_json()))

        logger.info('{0} is now in bootstrapping'.format(address))
        oscmd = get_oscmd(host.os)
        try:
            result, facts = transport.bootstrap(
                address, key.path, config, oscmd, store_manager)
            host.status = 'inactive'
            store_manager.save(host)
        except:
            exc_type, exc_msg, tb = sys.exc_info()
            logger.warn('Unable to start bootstrapping for {0}: {1}'.format(
                address, exc_msg))
            host.status = 'disassociated'
            store_manager.save(host)
            key.remove()
            if run_once:
                break
            continue

        host.status = cluster_type = C.CLUSTER_TYPE_HOST
        try:
            cluster = util.cluster_for_host(address, store_manager)
            cluster_type = cluster.type
        except KeyError:
            # Not part of a cluster
            pass

        # Verify association with the container manager
        if cluster_type == C.CLUSTER_TYPE_KUBERNETES:
            try:
                container_mgr = KubeContainerManager(config)
                # Try 3 times, waiting 5 seconds each time, before giving up
                for cnt in range(0, 3):
                    if container_mgr.node_registered(address):
                        logger.info(
                            '{0} has been registered with the '
                            'container manager.'.format(address))
                        host.status = 'active'
                        break
                    logger.debug(
                        '{0} has not been registered with the container '
                        'manager. Checking again in 5 seconds...'.format(
                            address))
                    sleep(5)
                else:
                    # All attempts failed
                    msg = 'Could not register with the container manager'
                    logger.warn(msg)
                    raise Exception(msg)
            except:
                _, exc_msg, _ = sys.exc_info()
                logger.warn(
                    'Unable to finish bootstrap for {0} while associating '
                    'with the container manager: {1}'.format(
                        address, exc_msg))
                host.status = 'inactive'

        store_manager.save(host)
        logger.info('Finished bootstrapping for {0}'.format(address))
        logger.debug('Finished bootstrapping for {0}: {1}'.format(
            address, host.to_json()))

        key.remove()
        if run_once:
            logger.info('Exiting due to run_once request.')
            break

    logger.info('Investigator stopping')
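# TemporarySSHKey replaces the inline NamedTemporaryFile handling of the
# earlier versions. A minimal sketch, assuming it wraps the same
# decode/write/unlink steps; the create()/remove()/path interface is
# inferred from the call sites above (hypothetical implementation):
import base64
import os
import tempfile


class TemporarySSHKey(object):

    def __init__(self, host, logger):
        self.host = host
        self.logger = logger
        self.path = None

    def create(self):
        # Write the host's base64-encoded private key to a temp file.
        f = tempfile.NamedTemporaryFile(prefix='key', delete=False)
        self.path = f.name
        f.write(base64.decodestring(self.host.ssh_priv_key))
        f.close()
        self.logger.debug('Wrote key for {0} to {1}'.format(
            self.host.address, self.path))

    def remove(self):
        if self.path and os.path.exists(self.path):
            os.unlink(self.path)
            self.logger.debug('Removed temporary key file {0}'.format(
                self.path))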
def clusterexec(cluster_name, command, store):
    """
    Remotely executes a shell command across a cluster.

    :param cluster_name: Name of the cluster to act on.
    :type cluster_name: str
    :param command: Top-level command to execute.
    :type command: str
    :param store: Data store to place results.
    :type store: etcd.Client
    """
    logger = logging.getLogger('clusterexec')

    # TODO: This is a hack and should really be done elsewhere
    if command == 'upgrade':
        finished_hosts_key = 'upgraded'
        cluster_status = {
            "status": 'inprocess',
            "upgrade_to": 'latest',
            "upgraded": [],
            "in_process": [],
            "started_at": datetime.datetime.utcnow().isoformat(),
            "finished_at": None,
        }
    elif command == 'restart':
        finished_hosts_key = 'restarted'
        cluster_status = {
            "status": 'inprocess',
            "restarted": [],
            "in_process": [],
            "started_at": datetime.datetime.utcnow().isoformat(),
            "finished_at": None
        }

    end_status = 'finished'

    # Set the initial status in the store
    logger.info('Setting initial status.')
    logger.debug('Status={0}'.format(cluster_status))
    store.set(
        '/commissaire/cluster/{0}/{1}'.format(cluster_name, command),
        json.dumps(cluster_status))

    # TODO: Find better way to do this
    for a_host_dict in store.get('/commissaire/hosts')._children:
        a_host = json.loads(a_host_dict['value'])
        if a_host['cluster'] != cluster_name:
            logger.debug('Skipping {0} as it is not in this cluster.'.format(
                a_host['address']))
            continue  # Move on to the next one

        oscmd = get_oscmd(a_host['os'])

        command_list = getattr(oscmd(), command)()  # Only used for logging
        logger.info('Executing {0} on {1}...'.format(
            command_list, a_host['address']))

        cluster_status['in_process'].append(a_host['address'])
        store.set(
            '/commissaire/cluster/{0}/{1}'.format(cluster_name, command),
            json.dumps(cluster_status))

        # TODO: This is reused, make it reusable
        f = tempfile.NamedTemporaryFile(prefix='key', delete=False)
        key_file = f.name
        logger.debug(
            'Using {0} as the temporary key location for {1}'.format(
                key_file, a_host['address']))
        f.write(base64.decodestring(a_host['ssh_priv_key']))
        logger.debug('Wrote key for {0}'.format(a_host['address']))
        f.close()

        transport = ansibleapi.Transport()
        result, facts = getattr(transport, command)(
            a_host['address'], key_file, oscmd())

        try:
            f.unlink(key_file)
            logger.debug('Removed temporary key file {0}'.format(key_file))
        except:
            logger.warn(
                'Unable to remove the temporary key file: {0}'.format(
                    key_file))

        # If there was a failure set the end_status and break out
        if result != 0:
            end_status = 'failed'
            break

        cluster_status[finished_hosts_key].append(a_host['address'])
        try:
            idx = cluster_status['in_process'].index(a_host['address'])
            cluster_status['in_process'].pop(idx)
        except ValueError:
            logger.warn('Host {0} was not in_process for {1} {2}'.format(
                a_host['address'], command, cluster_name))
        store.set(
            '/commissaire/cluster/{0}/{1}'.format(cluster_name, command),
            json.dumps(cluster_status))
        logger.info('Finished executing {0} for {1} in {2}'.format(
            command, a_host['address'], cluster_name))

    # Final set of command result
    cluster_status['finished_at'] = datetime.datetime.utcnow().isoformat()
    cluster_status['status'] = end_status
    store.set(
        '/commissaire/cluster/{0}/{1}'.format(cluster_name, command),
        json.dumps(cluster_status))

    logger.info('Clusterexec stopping')
def clusterexec(store_manager, cluster_name, command, kwargs={}):
    """
    Remotely executes a shell command across a cluster.

    :param store_manager: Proxy object for remote stores
    :type store_manager: commissaire.store.StoreHandlerManager
    :param cluster_name: Name of the cluster to act on
    :type cluster_name: str
    :param command: Top-level command to execute
    :type command: str
    :param kwargs: Keyword arguments for the command
    :type kwargs: dict
    """
    logger = logging.getLogger('clusterexec')

    # TODO: This is a hack and should really be done elsewhere
    command_args = ()
    if command == 'upgrade':
        finished_hosts_key = 'upgraded'
        model_instance = ClusterUpgrade.new(
            name=cluster_name,
            status='in_process',
            started_at=datetime.datetime.utcnow().isoformat(),
            upgraded=[],
            in_process=[],
        )
    elif command == 'restart':
        finished_hosts_key = 'restarted'
        model_instance = ClusterRestart.new(
            name=cluster_name,
            status='in_process',
            started_at=datetime.datetime.utcnow().isoformat(),
            restarted=[],
            in_process=[],
        )
    elif command == 'deploy':
        finished_hosts_key = 'deployed'
        version = kwargs.get('version', '')
        command_args = (version,)
        model_instance = ClusterDeploy.new(
            name=cluster_name,
            status='in_process',
            started_at=datetime.datetime.utcnow().isoformat(),
            version=version,
            deployed=[],
            in_process=[],
        )

    end_status = 'finished'

    try:
        # Set the initial status in the store
        logger.info('Setting initial status.')
        logger.debug('Status={0}'.format(model_instance.to_json()))
        store_manager.save(model_instance)
    except Exception as error:
        logger.error(
            'Unable to save initial state for "{0}" clusterexec due to '
            '{1}: {2}'.format(cluster_name, type(error), error))
        return

    # Collect all host addresses in the cluster
    try:
        cluster = store_manager.get(Cluster.new(
            name=cluster_name, status='', hostset=[]))
    except Exception as error:
        logger.warn(
            'Unable to continue for cluster "{0}" due to '
            '{1}: {2}. Returning...'.format(cluster_name, type(error), error))
        return

    if cluster.hostset:
        logger.debug(
            '{0} hosts in cluster "{1}"'.format(
                len(cluster.hostset), cluster_name))
    else:
        logger.warn('No hosts in cluster "{0}"'.format(cluster_name))

    # TODO: Find better way to do this
    try:
        hosts = store_manager.list(Hosts(hosts=[]))
    except Exception as error:
        logger.warn(
            'No hosts in the cluster. Error: {0}. Exiting clusterexec'.format(
                error))
        return

    for host in hosts.hosts:
        if host.address not in cluster.hostset:
            logger.debug(
                'Skipping {0} as it is not in this cluster.'.format(
                    host.address))
            continue  # Move on to the next one

        oscmd = get_oscmd(host.os)

        # command_list is only used for logging
        command_list = getattr(oscmd, command)(*command_args)
        logger.info('Executing {0} on {1}...'.format(
            command_list, host.address))

        model_instance.in_process.append(host.address)
        try:
            store_manager.save(model_instance)
        except Exception as error:
            logger.error(
                'Unable to save in_process state for "{0}" clusterexec '
                'due to {1}: {2}'.format(cluster_name, type(error), error))
            return

        key = TemporarySSHKey(host, logger)
        key.create()

        try:
            transport = ansibleapi.Transport(host.remote_user)
            exe = getattr(transport, command)
            result, facts = exe(
                host.address, key.path, oscmd, kwargs)
        # XXX: ansibleapi explicitly raises Exception()
        except Exception as ex:
            # If there was a failure set the end_status and break out
            end_status = 'failed'
            logger.error('Clusterexec {0} for {1} failed: {2}: {3}'.format(
                command, host.address, type(ex), ex))
            break
        finally:
            try:
                key.remove()
                logger.debug('Removed temporary key file {0}'.format(
                    key.path))
            except:
                logger.warn(
                    'Unable to remove the temporary key file: {0}'.format(
                        key.path))

        # Set the finished hosts
        new_finished_hosts = getattr(
            model_instance, finished_hosts_key) + [host.address]
        setattr(
            model_instance,
            finished_hosts_key,
            new_finished_hosts)
        try:
            idx = model_instance.in_process.index(host.address)
            model_instance.in_process.pop(idx)
        except ValueError:
            logger.warn('Host {0} was not in_process for {1} {2}'.format(
                host.address, command, cluster_name))
        try:
            store_manager.save(model_instance)
            logger.info('Finished executing {0} for {1} in {2}'.format(
                command, host.address, cluster_name))
        except Exception as error:
            logger.error(
                'Unable to save cluster state for "{0}" clusterexec due to '
                '{1}: {2}'.format(cluster_name, type(error), error))
            return

    # Final set of command result
    model_instance.finished_at = datetime.datetime.utcnow().isoformat()
    model_instance.status = end_status
    logger.info('Cluster {0} final {1} status: {2}'.format(
        cluster_name, command, model_instance.to_json()))
    try:
        store_manager.save(model_instance)
    except Exception as error:
        logger.error(
            'Unable to save final state for "{0}" clusterexec due to '
            '{1}: {2}'.format(cluster_name, type(error), error))

    logger.info('Clusterexec stopping')
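# Hypothetical invocations of the final clusterexec; the actual callers
# (e.g. REST handlers dispatching cluster operations) are not shown in this
# section. Only 'deploy' consumes kwargs, via kwargs.get('version', ''):
clusterexec(store_manager, 'datacenter-1', 'restart')
clusterexec(store_manager, 'datacenter-1', 'upgrade')
clusterexec(store_manager, 'datacenter-1', 'deploy', kwargs={'version': '7.2.6'})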