def _configure_arakoon_to_volumedriver(offline_node_ips=None): print 'Update existing vPools' logger.info('Update existing vPools') if offline_node_ips is None: offline_node_ips = [] for storagerouter in StorageRouterList.get_storagerouters(): config = ArakoonClusterConfig('voldrv') config.load_config() arakoon_nodes = [] for node in config.nodes: arakoon_nodes.append({'host': node.ip, 'port': node.client_port, 'node_id': node.name}) with Remote(storagerouter.ip, [os, RawConfigParser, EtcdConfiguration, StorageDriverConfiguration], 'ovs') as remote: configuration_dir = '{0}/storagedriver/storagedriver'.format(EtcdConfiguration.get('/ovs/framework/paths|cfgdir')) if not remote.os.path.exists(configuration_dir): remote.os.makedirs(configuration_dir) for json_file in remote.os.listdir(configuration_dir): vpool_name = json_file.replace('.json', '') if json_file.endswith('.json'): if remote.os.path.exists('{0}/{1}.cfg'.format(configuration_dir, vpool_name)): continue # There's also a .cfg file, so this is an alba_proxy configuration file storagedriver_config = remote.StorageDriverConfiguration('storagedriver', vpool_name) storagedriver_config.load() storagedriver_config.configure_volume_registry(vregistry_arakoon_cluster_id='voldrv', vregistry_arakoon_cluster_nodes=arakoon_nodes) storagedriver_config.configure_distributed_lock_store(dls_type='Arakoon', dls_arakoon_cluster_id='voldrv', dls_arakoon_cluster_nodes=arakoon_nodes) storagedriver_config.save(reload_config=True)
def _configure_arakoon_to_volumedriver(): print "Update existing vPools" logger.info("Update existing vPools") for storagerouter in StorageRouterList.get_storagerouters(): config = ArakoonClusterConfig("voldrv") config.load_config() arakoon_nodes = [] for node in config.nodes: arakoon_nodes.append({"host": node.ip, "port": node.client_port, "node_id": node.name}) with Remote( storagerouter.ip, [os, RawConfigParser, EtcdConfiguration, StorageDriverConfiguration], "ovs" ) as remote: configuration_dir = "{0}/storagedriver/storagedriver".format( EtcdConfiguration.get("/ovs/framework/paths|cfgdir") ) if not remote.os.path.exists(configuration_dir): remote.os.makedirs(configuration_dir) for json_file in remote.os.listdir(configuration_dir): vpool_name = json_file.replace(".json", "") if json_file.endswith(".json"): if remote.os.path.exists("{0}/{1}.cfg".format(configuration_dir, vpool_name)): continue # There's also a .cfg file, so this is an alba_proxy configuration file storagedriver_config = remote.StorageDriverConfiguration("storagedriver", vpool_name) storagedriver_config.load() storagedriver_config.configure_volume_registry( vregistry_arakoon_cluster_id="voldrv", vregistry_arakoon_cluster_nodes=arakoon_nodes ) storagedriver_config.configure_distributed_lock_store( dls_type="Arakoon", dls_arakoon_cluster_id="voldrv", dls_arakoon_cluster_nodes=arakoon_nodes ) storagedriver_config.save(reload_config=True)
def _configure_arakoon_to_volumedriver(): print 'Update existing vPools' logger.info('Update existing vPools') config = ArakoonClusterConfig('voldrv') config.load_config() arakoon_nodes = [] for node in config.nodes: arakoon_nodes.append({ 'host': node.ip, 'port': node.client_port, 'node_id': node.name }) if EtcdConfiguration.dir_exists('/ovs/vpools'): for vpool_guid in EtcdConfiguration.list('/ovs/vpools'): for storagedriver_id in EtcdConfiguration.list( '/ovs/vpools/{0}/hosts'.format(vpool_guid)): storagedriver_config = StorageDriverConfiguration( 'storagedriver', vpool_guid, storagedriver_id) storagedriver_config.load() storagedriver_config.configure_volume_registry( vregistry_arakoon_cluster_id='voldrv', vregistry_arakoon_cluster_nodes=arakoon_nodes) storagedriver_config.configure_distributed_lock_store( dls_type='Arakoon', dls_arakoon_cluster_id='voldrv', dls_arakoon_cluster_nodes=arakoon_nodes) storagedriver_config.save(reload_config=True)
def _get_nsm_state(abm): state = {} data = VirtualAlbaBackend.data[abm] for nsm in data['nsms']: contents = Configuration.get(nsm['_config_key'], raw=True) config = ArakoonClusterConfig(nsm['id'], filesystem=False) config.read_config(contents) state[nsm['id']] = [node.name for node in config.nodes] return state
def collapse_arakoon(): """ Collapse Arakoon's Tlogs :return: None """ ScheduledTaskController._logger.info('Starting arakoon collapse') storagerouters = StorageRouterList.get_storagerouters() cluster_info = [('cacc', storagerouters[0], True)] cluster_names = [] for service in ServiceList.get_services(): if service.is_internal is True and service.type.name in (ServiceType.SERVICE_TYPES.ARAKOON, ServiceType.SERVICE_TYPES.NS_MGR, ServiceType.SERVICE_TYPES.ALBA_MGR): cluster = service.name.replace('arakoon-', '') if cluster in cluster_names: continue cluster_names.append(cluster) cluster_info.append((cluster, service.storagerouter, False)) workload = {} for cluster, storagerouter, filesystem in cluster_info: ScheduledTaskController._logger.debug(' Collecting info for cluster {0}'.format(cluster)) config = ArakoonClusterConfig(cluster, filesystem=filesystem) config.load_config(storagerouter.ip) for node in config.nodes: if node.ip not in workload: workload[node.ip] = {'node_id': node.name, 'clusters': []} workload[node.ip]['clusters'].append((cluster, filesystem)) for storagerouter in storagerouters: try: if storagerouter.ip not in workload: continue node_workload = workload[storagerouter.ip] client = SSHClient(storagerouter) for cluster, filesystem in node_workload['clusters']: try: ScheduledTaskController._logger.debug(' Collapsing cluster {0} on {1}'.format(cluster, storagerouter.ip)) if filesystem is True: config_path = ArakoonClusterConfig.CONFIG_FILE.format(cluster) else: config_path = Configuration.get_configuration_path(ArakoonClusterConfig.CONFIG_KEY.format(cluster)) client.run(['arakoon', '--collapse-local', node_workload['node_id'], '2', '-config', config_path]) ScheduledTaskController._logger.info(' Collapsing cluster {0} on {1} completed'.format(cluster, storagerouter.ip)) except: ScheduledTaskController._logger.exception(' Collapsing cluster {0} on {1} failed'.format(cluster, storagerouter.ip)) except UnableToConnectException: ScheduledTaskController._logger.error(' Could not collapse any cluster on {0} (not reachable)'.format(storagerouter.name)) ScheduledTaskController._logger.info('Arakoon collapse finished')
def load(vpool): """ Initializes the wrapper for a given vpool :param vpool: vPool for which the ClusterRegistryClient needs to be loaded """ if os.environ.get('RUNNING_UNITTESTS') == 'True': return ClusterRegistry(str(vpool.guid), None, None) key = vpool.identifier if key not in crclient_vpool_cache: arakoon_cluster_name = str(Configuration.get('/ovs/framework/arakoon_clusters|voldrv')) config = ArakoonClusterConfig(cluster_id=arakoon_cluster_name, filesystem=False) config.load_config() arakoon_node_configs = [] for node in config.nodes: arakoon_node_configs.append(ArakoonNodeConfig(str(node.name), str(node.ip), node.client_port)) client = ClusterRegistry(str(vpool.guid), arakoon_cluster_name, arakoon_node_configs) crclient_vpool_cache[key] = client return crclient_vpool_cache[key]
def _configure_arakoon_to_volumedriver(cluster_name): StorageDriverController._logger.info('Update existing vPools') config = ArakoonClusterConfig(cluster_id=cluster_name, filesystem=False) config.load_config() arakoon_nodes = [] for node in config.nodes: arakoon_nodes.append({'host': node.ip, 'port': node.client_port, 'node_id': node.name}) if Configuration.dir_exists('/ovs/vpools'): for vpool_guid in Configuration.list('/ovs/vpools'): for storagedriver_id in Configuration.list('/ovs/vpools/{0}/hosts'.format(vpool_guid)): storagedriver_config = StorageDriverConfiguration('storagedriver', vpool_guid, storagedriver_id) storagedriver_config.load() storagedriver_config.configure_volume_registry(vregistry_arakoon_cluster_id=cluster_name, vregistry_arakoon_cluster_nodes=arakoon_nodes) storagedriver_config.configure_distributed_lock_store(dls_type='Arakoon', dls_arakoon_cluster_id=cluster_name, dls_arakoon_cluster_nodes=arakoon_nodes) storagedriver_config.save(reload_config=True)
def load(vpool): """ Initializes the wrapper for a given vpool :param vpool: vPool for which the ObjectRegistryClient needs to be loaded """ if os.environ.get('RUNNING_UNITTESTS') == 'True': return ORClient(str(vpool.guid), None, None) key = vpool.identifier if key not in oclient_vpool_cache: arakoon_node_configs = [] arakoon_cluster_name = str( Configuration.get('/ovs/framework/arakoon_clusters|voldrv')) config = ArakoonClusterConfig(cluster_id=arakoon_cluster_name, filesystem=False) config.load_config() for node in config.nodes: arakoon_node_configs.append( ArakoonNodeConfig(str(node.name), str(node.ip), node.client_port)) client = ORClient(str(vpool.guid), str(arakoon_cluster_name), arakoon_node_configs) oclient_vpool_cache[key] = client return oclient_vpool_cache[key]
def post_update_core(client, components): """ Execute functionality after the openvstorage core packages have been updated For framework: * Restart support-agent on every client * Restart arakoon-ovsdb on every client (if present and required) For storagedriver: * ALBA proxies on every client * Restart arakoon-voldrv on every client (if present and required) :param client: Client on which to execute this post update functionality :type client: SSHClient :param components: Update components which have been executed :type components: list :return: None """ if 'framework' not in components and 'storagedriver' not in components: return update_information = UpdateController.get_update_information_core({}) services_to_restart = set() if 'storagedriver' in components: services_to_restart.update(update_information.get('storagedriver', {}).get('services_post_update', set())) if 'framework' in components: services_to_restart.update(update_information.get('framework', {}).get('services_post_update', set())) services_to_restart.add('support-agent') if services_to_restart: UpdateController._logger.debug('{0}: Executing hook {1}'.format(client.ip, inspect.currentframe().f_code.co_name)) for service_name in sorted(services_to_restart): if not service_name.startswith('ovs-arakoon-'): UpdateController.change_services_state(services=[service_name], ssh_clients=[client], action='restart') else: cluster_name = ArakoonClusterConfig.get_cluster_name(ExtensionToolbox.remove_prefix(service_name, 'ovs-arakoon-')) if cluster_name == 'config': arakoon_metadata = ArakoonInstaller.get_arakoon_metadata_by_cluster_name(cluster_name='cacc', filesystem=True, ip=System.get_my_storagerouter().ip) else: arakoon_metadata = ArakoonInstaller.get_arakoon_metadata_by_cluster_name(cluster_name=cluster_name) if arakoon_metadata['internal'] is True: UpdateController._logger.debug('{0}: Restarting arakoon node {1}'.format(client.ip, cluster_name)) ArakoonInstaller.restart_node(cluster_name=cluster_name, client=client) UpdateController._logger.debug('{0}: Executed hook {1}'.format(client.ip, inspect.currentframe().f_code.co_name))
def demote_node(cluster_ip, master_ip, ip_client_map, unique_id, unconfigure_memcached, unconfigure_rabbitmq, offline_nodes=None): """ Demotes a given node """ from ovs.dal.lists.storagerouterlist import StorageRouterList Toolbox.log(logger=NodeTypeController._logger, messages='Demoting node', title=True) if offline_nodes is None: offline_nodes = [] if unconfigure_memcached is True and len(offline_nodes) == 0: if NodeTypeController._validate_local_memcache_servers(ip_client_map) is False: raise RuntimeError('Not all memcache nodes can be reached which is required for demoting a node.') # Find other (arakoon) master nodes arakoon_cluster_name = str(Configuration.get('/ovs/framework/arakoon_clusters|ovsdb')) arakoon_metadata = ArakoonInstaller.get_arakoon_metadata_by_cluster_name(cluster_name=arakoon_cluster_name) config = ArakoonClusterConfig(cluster_id=arakoon_cluster_name, filesystem=False) config.load_config() master_node_ips = [node.ip for node in config.nodes] if cluster_ip in master_node_ips: master_node_ips.remove(cluster_ip) if len(master_node_ips) == 0: raise RuntimeError('There should be at least one other master node') storagerouter = StorageRouterList.get_by_machine_id(unique_id) storagerouter.node_type = 'EXTRA' storagerouter.save() offline_node_ips = [node.ip for node in offline_nodes] if arakoon_metadata['internal'] is True: Toolbox.log(logger=NodeTypeController._logger, messages='Leaving Arakoon {0} cluster'.format(arakoon_cluster_name)) ArakoonInstaller.shrink_cluster(deleted_node_ip=cluster_ip, remaining_node_ips=master_node_ips, cluster_name=arakoon_cluster_name, offline_nodes=offline_node_ips) try: external_config = Configuration.get('/ovs/framework/external_config') if external_config is None: config_store = Configuration.get_store() if config_store == 'arakoon': Toolbox.log(logger=NodeTypeController._logger, messages='Leaving Arakoon config cluster') ArakoonInstaller.shrink_cluster(deleted_node_ip=cluster_ip, remaining_node_ips=master_node_ips, cluster_name='config', offline_nodes=offline_node_ips, filesystem=True) else: from ovs.extensions.db.etcd.installer import EtcdInstaller Toolbox.log(logger=NodeTypeController._logger, messages='Leaving Etcd cluster') EtcdInstaller.shrink_cluster(master_ip, cluster_ip, 'config', offline_node_ips) except Exception as ex: Toolbox.log(logger=NodeTypeController._logger, messages=['\nFailed to leave configuration cluster', ex], loglevel='exception') Toolbox.log(logger=NodeTypeController._logger, messages='Update configurations') try: if unconfigure_memcached is True: endpoints = Configuration.get('/ovs/framework/memcache|endpoints') endpoint = '{0}:{1}'.format(cluster_ip, 11211) if endpoint in endpoints: endpoints.remove(endpoint) Configuration.set('/ovs/framework/memcache|endpoints', endpoints) if unconfigure_rabbitmq is True: endpoints = Configuration.get('/ovs/framework/messagequeue|endpoints') endpoint = '{0}:{1}'.format(cluster_ip, 5672) if endpoint in endpoints: endpoints.remove(endpoint) Configuration.set('/ovs/framework/messagequeue|endpoints', endpoints) except Exception as ex: Toolbox.log(logger=NodeTypeController._logger, messages=['\nFailed to update configurations', ex], loglevel='exception') if arakoon_metadata['internal'] is True: Toolbox.log(logger=NodeTypeController._logger, messages='Restarting master node services') remaining_nodes = ip_client_map.keys()[:] if cluster_ip in remaining_nodes: remaining_nodes.remove(cluster_ip) PersistentFactory.store = None VolatileFactory.store = None for service in storagerouter.services: if service.name == 'arakoon-ovsdb': service.delete() target_client = None if storagerouter in offline_nodes: if unconfigure_rabbitmq is True: Toolbox.log(logger=NodeTypeController._logger, messages='Removing/unconfiguring offline RabbitMQ node') client = ip_client_map[master_ip] try: client.run(['rabbitmqctl', 'forget_cluster_node', 'rabbit@{0}'.format(storagerouter.name)]) except Exception as ex: Toolbox.log(logger=NodeTypeController._logger, messages=['\nFailed to forget RabbitMQ cluster node', ex], loglevel='exception') else: target_client = ip_client_map[cluster_ip] if unconfigure_rabbitmq is True: Toolbox.log(logger=NodeTypeController._logger, messages='Removing/unconfiguring RabbitMQ') try: if ServiceManager.has_service('rabbitmq-server', client=target_client): Toolbox.change_service_state(target_client, 'rabbitmq-server', 'stop', NodeTypeController._logger) target_client.run(['rabbitmq-server', '-detached']) time.sleep(5) target_client.run(['rabbitmqctl', 'stop_app']) time.sleep(5) target_client.run(['rabbitmqctl', 'reset']) time.sleep(5) target_client.run(['rabbitmqctl', 'stop']) time.sleep(5) target_client.file_unlink("/var/lib/rabbitmq/.erlang.cookie") Toolbox.change_service_state(target_client, 'rabbitmq-server', 'stop', NodeTypeController._logger) # To be sure except Exception as ex: Toolbox.log(logger=NodeTypeController._logger, messages=['\nFailed to remove/unconfigure RabbitMQ', ex], loglevel='exception') Toolbox.log(logger=NodeTypeController._logger, messages='Stopping services') services = ['memcached', 'rabbitmq-server'] if unconfigure_rabbitmq is False: services.remove('rabbitmq-server') if unconfigure_memcached is False: services.remove('memcached') for service in services: if ServiceManager.has_service(service, client=target_client): Toolbox.log(logger=NodeTypeController._logger, messages='Stopping service {0}'.format(service)) try: Toolbox.change_service_state(target_client, service, 'stop', NodeTypeController._logger) except Exception as ex: Toolbox.log(logger=NodeTypeController._logger, messages=['\nFailed to stop service'.format(service), ex], loglevel='exception') Toolbox.log(logger=NodeTypeController._logger, messages='Removing services') services = ['scheduled-tasks', 'webapp-api', 'volumerouter-consumer'] for service in services: if ServiceManager.has_service(service, client=target_client): Toolbox.log(logger=NodeTypeController._logger, messages='Removing service {0}'.format(service)) try: Toolbox.change_service_state(target_client, service, 'stop', NodeTypeController._logger) ServiceManager.remove_service(service, client=target_client) except Exception as ex: Toolbox.log(logger=NodeTypeController._logger, messages=['\nFailed to remove service'.format(service), ex], loglevel='exception') if ServiceManager.has_service('workers', client=target_client): ServiceManager.add_service(name='workers', client=target_client, params={'WORKER_QUEUE': '{0}'.format(unique_id)}) try: NodeTypeController._configure_amqp_to_volumedriver() except Exception as ex: Toolbox.log(logger=NodeTypeController._logger, messages=['\nFailed to configure AMQP to Storage Driver', ex], loglevel='exception') Toolbox.log(logger=NodeTypeController._logger, messages='Restarting services') NodeTypeController.restart_framework_and_memcache_services(clients=ip_client_map, logger=NodeTypeController._logger, offline_node_ips=offline_node_ips) if Toolbox.run_hooks(component='nodetype', sub_component='demote', logger=NodeTypeController._logger, cluster_ip=cluster_ip, master_ip=master_ip, offline_node_ips=offline_node_ips): Toolbox.log(logger=NodeTypeController._logger, messages='Restarting services') NodeTypeController.restart_framework_and_memcache_services(clients=ip_client_map, logger=NodeTypeController._logger, offline_node_ips=offline_node_ips) if storagerouter not in offline_nodes: target_client = ip_client_map[cluster_ip] node_name, _ = target_client.get_hostname() if NodeTypeController.avahi_installed(client=target_client, logger=NodeTypeController._logger) is True: NodeTypeController.configure_avahi(client=target_client, node_name=node_name, node_type='extra', logger=NodeTypeController._logger) Configuration.set('/ovs/framework/hosts/{0}/type'.format(storagerouter.machine_id), 'EXTRA') if target_client is not None and target_client.file_exists('/tmp/ovs_rollback'): target_client.file_write('/tmp/ovs_rollback', 'rollback') Toolbox.log(logger=NodeTypeController._logger, messages='Demote complete', title=True)
def test_cluster_maintenance(self): """ Validates whether a cluster can be correctly created """ Configuration.set('/ovs/framework/hosts/1/ports', {'arakoon': [10000, 10100]}) Configuration.set('/ovs/framework/hosts/2/ports', {'arakoon': [20000, 20100]}) structure = Helper.build_service_structure( {'storagerouters': [1, 2]} ) storagerouters = structure['storagerouters'] System._machine_id = {storagerouters[1].ip: '1', storagerouters[2].ip: '2'} # Create new cluster mountpoint = storagerouters[1].disks[0].partitions[0].mountpoint if os.path.exists(mountpoint) and mountpoint != '/': shutil.rmtree(mountpoint) base_dir = mountpoint + '/test_create_cluster' info = ArakoonInstaller.create_cluster('test', ServiceType.ARAKOON_CLUSTER_TYPES.FWK, storagerouters[1].ip, base_dir) reality = Helper.extract_dir_structure(base_dir) expected = {'dirs': {'arakoon': {'dirs': {'test': {'dirs': {'tlogs': {'dirs': {}, 'files': []}, 'db': {'dirs': {}, 'files': []}}, 'files': []}}, 'files': []}}, 'files': []} self.assertDictEqual(reality, expected) expected = '{0}\n\n{1}\n\n'.format(ArakoonInstallerTester.EXPECTED_CLUSTER_CONFIG.format('1', 'test', ''), ArakoonInstallerTester.EXPECTED_NODE_CONFIG.format( '1', storagerouters[1].ip, 10000, base_dir, '1', 10001 )) self.assertEqual(Configuration.get(ArakoonInstaller.CONFIG_KEY.format('test'), raw=True), expected) # @TODO: assert service availability here. It should be stopped ArakoonInstaller.start_cluster('test', storagerouters[1].ip, filesystem=False) # @TODO: assert the service is running config = ArakoonClusterConfig('test', filesystem=False) config.load_config(storagerouters[1].ip) client = ArakoonInstaller.build_client(config) reality = client.get(ArakoonInstaller.INTERNAL_CONFIG_KEY) self.assertEqual(reality, expected) self.assertFalse(client.exists(ArakoonInstaller.METADATA_KEY)) ArakoonInstaller.claim_cluster('test', storagerouters[1].ip, filesystem=False, metadata=info['metadata']) reality = json.loads(client.get(ArakoonInstaller.METADATA_KEY)) expected = {'cluster_name': 'test', 'cluster_type': 'FWK', 'in_use': True, 'internal': True} self.assertDictEqual(reality, expected) # Extending cluster mountpoint = storagerouters[2].disks[0].partitions[0].mountpoint if os.path.exists(mountpoint) and mountpoint != '/': shutil.rmtree(mountpoint) base_dir2 = mountpoint + '/test_extend_cluster' ArakoonInstaller.extend_cluster(storagerouters[1].ip, storagerouters[2].ip, 'test', base_dir2) reality = Helper.extract_dir_structure(base_dir) expected = {'dirs': {'arakoon': {'dirs': {'test': {'dirs': {'tlogs': {'dirs': {}, 'files': []}, 'db': {'dirs': {}, 'files': []}}, 'files': []}}, 'files': []}}, 'files': []} self.assertDictEqual(reality, expected) expected = '{0}\n\n{1}\n\n{2}\n\n'.format(ArakoonInstallerTester.EXPECTED_CLUSTER_CONFIG.format('1,2', 'test', ''), ArakoonInstallerTester.EXPECTED_NODE_CONFIG.format( '1', storagerouters[1].ip, 10000, base_dir, '1', 10001 ), ArakoonInstallerTester.EXPECTED_NODE_CONFIG.format( '2', storagerouters[2].ip, 20000, base_dir2, '2', 20001 )) self.assertEqual(Configuration.get(ArakoonInstaller.CONFIG_KEY.format('test'), raw=True), expected) # @TODO: assert service availability here. It should be stopped catchup_command = 'arakoon --node 2 -config file://opt/OpenvStorage/config/framework.json?key=/ovs/arakoon/test/config -catchup-only' SSHClient._run_returns[catchup_command] = None SSHClient._run_recordings = [] ArakoonInstaller.restart_cluster_add('test', [storagerouters[1].ip], storagerouters[2].ip, filesystem=False) self.assertIn(catchup_command, SSHClient._run_recordings) # @TODO: assert the service is running config = ArakoonClusterConfig('test', filesystem=False) config.load_config(storagerouters[2].ip) client = ArakoonInstaller.build_client(config) reality = client.get(ArakoonInstaller.INTERNAL_CONFIG_KEY) self.assertEqual(reality, expected) reality = json.loads(client.get(ArakoonInstaller.METADATA_KEY)) expected = {'cluster_name': 'test', 'cluster_type': 'FWK', 'in_use': True, 'internal': True} self.assertDictEqual(reality, expected) # Shrinking cluster ArakoonInstaller.shrink_cluster(storagerouters[1].ip, storagerouters[2].ip, 'test') reality = Helper.extract_dir_structure(base_dir) expected = {'dirs': {'arakoon': {'dirs': {'test': {'dirs': {}, 'files': []}}, 'files': []}}, 'files': []} self.assertDictEqual(reality, expected) expected = '{0}\n\n{1}\n\n'.format(ArakoonInstallerTester.EXPECTED_CLUSTER_CONFIG.format('2', 'test', ''), ArakoonInstallerTester.EXPECTED_NODE_CONFIG.format( '2', storagerouters[2].ip, 20000, base_dir2, '2', 20001 )) self.assertEqual(Configuration.get(ArakoonInstaller.CONFIG_KEY.format('test'), raw=True), expected) # @TODO: assert service availability here. It should be stopped ArakoonInstaller.restart_cluster_remove('test', [storagerouters[2].ip], filesystem=False) # @TODO: assert the service is running config = ArakoonClusterConfig('test', filesystem=False) config.load_config(storagerouters[2].ip) client = ArakoonInstaller.build_client(config) reality = client.get(ArakoonInstaller.INTERNAL_CONFIG_KEY) self.assertEqual(reality, expected) reality = json.loads(client.get(ArakoonInstaller.METADATA_KEY)) expected = {'cluster_name': 'test', 'cluster_type': 'FWK', 'in_use': True, 'internal': True} self.assertDictEqual(reality, expected)
def services_running(self, target): """ Check all services are running :param target: Target to check :return: Boolean """ try: key = 'ovs-watcher-{0}'.format(str(uuid.uuid4())) value = str(time.time()) if target == 'config': self.log_message(target, 'Testing configuration store...', 0) from ovs.extensions.generic.configuration import Configuration try: Configuration.list('/') except Exception as ex: self.log_message( target, ' Error during configuration store test: {0}'.format( ex), 2) return False if Configuration.get_store() == 'arakoon': from ovs.extensions.db.arakoon.configuration import ArakoonConfiguration from ovs.extensions.db.arakoon.ArakoonInstaller import ArakoonInstaller, ArakoonClusterConfig from ovs.extensions.db.arakoon.pyrakoon.pyrakoon.compat import NoGuarantee with open( ArakoonConfiguration.CACC_LOCATION) as config_file: contents = config_file.read() config = ArakoonClusterConfig(cluster_id='cacc', filesystem=True) config.read_config(contents) client = ArakoonInstaller.build_client(config) contents = client.get(ArakoonInstaller.INTERNAL_CONFIG_KEY, consistency=NoGuarantee()) if Watcher.LOG_CONTENTS != contents: try: config.read_config( contents ) # Validate whether the contents are not corrupt except Exception as ex: self.log_message( target, ' Configuration stored in configuration store seems to be corrupt: {0}' .format(ex), 2) return False temp_filename = '{0}~'.format( ArakoonConfiguration.CACC_LOCATION) with open(temp_filename, 'w') as config_file: config_file.write(contents) config_file.flush() os.fsync(config_file) os.rename(temp_filename, ArakoonConfiguration.CACC_LOCATION) Watcher.LOG_CONTENTS = contents self.log_message(target, ' Configuration store OK', 0) return True if target == 'framework': # Volatile self.log_message(target, 'Testing volatile store...', 0) max_tries = 5 tries = 0 while tries < max_tries: try: try: logging.disable(logging.WARNING) from ovs.extensions.storage.volatilefactory import VolatileFactory VolatileFactory.store = None volatile = VolatileFactory.get_client() volatile.set(key, value) if volatile.get(key) == value: volatile.delete(key) break volatile.delete(key) finally: logging.disable(logging.NOTSET) except Exception as message: self.log_message( target, ' Error during volatile store test: {0}'.format( message), 2) key = 'ovs-watcher-{0}'.format(str( uuid.uuid4())) # Get another key time.sleep(1) tries += 1 if tries == max_tries: self.log_message(target, ' Volatile store not working correctly', 2) return False self.log_message( target, ' Volatile store OK after {0} tries'.format(tries), 0) # Persistent self.log_message(target, 'Testing persistent store...', 0) max_tries = 5 tries = 0 while tries < max_tries: try: try: logging.disable(logging.WARNING) persistent = PersistentFactory.get_client() persistent.set(key, value) if persistent.get(key) == value: persistent.delete(key) break persistent.delete(key) finally: logging.disable(logging.NOTSET) except Exception as message: self.log_message( target, ' Error during persistent store test: {0}'.format( message), 2) key = 'ovs-watcher-{0}'.format(str( uuid.uuid4())) # Get another key time.sleep(1) tries += 1 if tries == max_tries: self.log_message( target, ' Persistent store not working correctly', 2) return False self.log_message( target, ' Persistent store OK after {0} tries'.format(tries), 0) if target == 'volumedriver': # Arakoon, voldrv cluster self.log_message(target, 'Testing arakoon (voldrv)...', 0) max_tries = 5 tries = 0 while tries < max_tries: try: from ovs.extensions.generic.configuration import Configuration from ovs.extensions.storage.persistent.pyrakoonstore import PyrakoonStore cluster_name = str( Configuration.get( '/ovs/framework/arakoon_clusters|voldrv')) client = PyrakoonStore(cluster=cluster_name) client.set(key, value) if client.get(key) == value: client.delete(key) break client.delete(key) except Exception as message: self.log_message( target, ' Error during arakoon (voldrv) test: {0}'.format( message), 2) key = 'ovs-watcher-{0}'.format(str( uuid.uuid4())) # Get another key time.sleep(1) tries += 1 if tries == max_tries: self.log_message( target, ' Arakoon (voldrv) not working correctly', 2) return False self.log_message(target, ' Arakoon (voldrv) OK', 0) if target in ['framework', 'volumedriver']: # RabbitMQ self.log_message(target, 'Test rabbitMQ...', 0) import pika from ovs.extensions.generic.configuration import Configuration messagequeue = Configuration.get('/ovs/framework/messagequeue') rmq_servers = messagequeue['endpoints'] good_node = False for server in rmq_servers: try: connection_string = '{0}://{1}:{2}@{3}/%2F'.format( messagequeue['protocol'], messagequeue['user'], messagequeue['password'], server) connection = pika.BlockingConnection( pika.URLParameters(connection_string)) channel = connection.channel() channel.basic_publish( '', 'ovs-watcher', str(time.time()), pika.BasicProperties(content_type='text/plain', delivery_mode=1)) connection.close() good_node = True except Exception as message: self.log_message( target, ' Error during rabbitMQ test on node {0}: {1}'. format(server, message), 2) if good_node is False: self.log_message( target, ' No working rabbitMQ node could be found', 2) return False self.log_message(target, ' RabbitMQ test OK', 0) self.log_message(target, 'All tests OK', 0) return True except Exception as ex: self.log_message(target, 'Unexpected exception: {0}'.format(ex), 2) return False
def get_package_information_core(client, package_info): """ Called by GenericController.refresh_package_information() every hour Retrieve information about the currently installed versions of the core packages Retrieve information about the versions to which each package can potentially be updated If installed version is different from candidate version --> store this information in model Additionally check the services with a 'run' file Verify whether the running version is up-to-date with the candidate version If different --> store this information in the model Result: Every package with updates or which requires services to be restarted is stored in the model :param client: Client on which to collect the version information :type client: SSHClient :param package_info: Dictionary passed in by the thread calling this function :type package_info: dict :return: Package information :rtype: dict """ try: if client.username != 'root': raise RuntimeError('Only the "root" user can retrieve the package information') binaries = PackageManager.get_binary_versions(client=client, package_names=UpdateController.core_packages_with_binaries) installed = PackageManager.get_installed_versions(client=client, package_names=UpdateController.all_core_packages) candidate = PackageManager.get_candidate_versions(client=client, package_names=UpdateController.all_core_packages) if set(installed.keys()) != set(UpdateController.all_core_packages) or set(candidate.keys()) != set(UpdateController.all_core_packages): raise RuntimeError('Failed to retrieve the installed and candidate versions for packages: {0}'.format(', '.join(UpdateController.all_core_packages))) # Retrieve Arakoon information framework_arakoons = [] storagedriver_arakoons = [] for cluster, arakoon_list in {'cacc': framework_arakoons, 'ovsdb': framework_arakoons, 'voldrv': storagedriver_arakoons}.iteritems(): cluster_name = ArakoonClusterConfig.get_cluster_name(cluster) if cluster_name is None: continue if cluster == 'cacc': arakoon_metadata = ArakoonInstaller.get_arakoon_metadata_by_cluster_name(cluster_name=cluster_name, filesystem=True, ip=client.ip) else: arakoon_metadata = ArakoonInstaller.get_arakoon_metadata_by_cluster_name(cluster_name=cluster_name) if arakoon_metadata['internal'] is True: arakoon_list.append(ArakoonInstaller.get_service_name_for_cluster(cluster_name=arakoon_metadata['cluster_name'])) storagerouter = StorageRouterList.get_by_ip(client.ip) alba_proxies = [] for service in storagerouter.services: if service.type.name == ServiceType.SERVICE_TYPES.ALBA_PROXY: alba_proxies.append(service.name) storagedriver_services = [] for sd in storagerouter.storagedrivers: storagedriver_services.append('ovs-dtl_{0}'.format(sd.vpool.name)) storagedriver_services.append('ovs-volumedriver_{0}'.format(sd.vpool.name)) default_entry = {'candidate': None, 'installed': None, 'services_to_restart': []} # component: package_name: services_with_run_file for component, info in {'framework': {'arakoon': framework_arakoons, 'openvstorage': []}, 'storagedriver': {'alba': alba_proxies, 'arakoon': storagedriver_arakoons, 'volumedriver-no-dedup-base': [], 'volumedriver-no-dedup-server': storagedriver_services}}.iteritems(): component_info = {} for package, services in info.iteritems(): for service in services: service = ExtensionToolbox.remove_prefix(service, 'ovs-') version_file = '/opt/OpenvStorage/run/{0}.version'.format(service) if not client.file_exists(version_file): UpdateController._logger.warning('{0}: Failed to find a version file in /opt/OpenvStorage/run for service {1}'.format(client.ip, service)) continue package_name = package running_versions = client.file_read(version_file).strip() for version in running_versions.split(';'): version = version.strip() running_version = None if '=' in version: package_name = version.split('=')[0] running_version = version.split('=')[1] elif version: running_version = version if package_name not in UpdateController.all_core_packages: raise ValueError('Unknown package dependency found in {0}'.format(version_file)) if package_name not in binaries: raise RuntimeError('Binary version for package {0} was not retrieved'.format(package_name)) if running_version is not None and running_version != binaries[package_name]: if package_name not in component_info: component_info[package_name] = copy.deepcopy(default_entry) component_info[package_name]['installed'] = running_version component_info[package_name]['candidate'] = binaries[package_name] component_info[package_name]['services_to_restart'].append('ovs-{0}'.format(service)) if installed[package] != candidate[package] and package not in component_info: component_info[package] = copy.deepcopy(default_entry) component_info[package]['installed'] = installed[package] component_info[package]['candidate'] = candidate[package] if component_info: if component not in package_info[client.ip]: package_info[client.ip][component] = {} package_info[client.ip][component].update(component_info) except Exception as ex: if 'errors' not in package_info[client.ip]: package_info[client.ip]['errors'] = [] package_info[client.ip]['errors'].append(ex) return package_info
def get_update_information_core(information): """ Called when the 'Update' button in the GUI is pressed This call collects additional information about the packages which can be updated Eg: * Downtime for Arakoons * Downtime for StorageDrivers * Prerequisites that haven't been met * Services which will be stopped during update * Services which will be restarted after update """ # Verify arakoon info arakoon_ovs_info = {'down': False, 'name': None, 'internal': False} arakoon_cacc_info = {'down': False, 'name': None, 'internal': False} arakoon_voldrv_info = {'down': False, 'name': None, 'internal': False} for cluster in ['cacc', 'ovsdb', 'voldrv']: cluster_name = ArakoonClusterConfig.get_cluster_name(cluster) if cluster_name is None: continue if cluster == 'cacc': arakoon_metadata = ArakoonInstaller.get_arakoon_metadata_by_cluster_name(cluster_name=cluster_name, filesystem=True, ip=System.get_my_storagerouter().ip) else: arakoon_metadata = ArakoonInstaller.get_arakoon_metadata_by_cluster_name(cluster_name=cluster_name) if arakoon_metadata['internal'] is True: config = ArakoonClusterConfig(cluster_id=cluster_name, filesystem=(cluster == 'cacc')) config.load_config(System.get_my_storagerouter().ip if cluster == 'cacc' else None) if cluster == 'ovsdb': arakoon_ovs_info['down'] = len(config.nodes) < 3 arakoon_ovs_info['name'] = arakoon_metadata['cluster_name'] arakoon_ovs_info['internal'] = True elif cluster == 'voldrv': arakoon_voldrv_info['down'] = len(config.nodes) < 3 arakoon_voldrv_info['name'] = arakoon_metadata['cluster_name'] arakoon_voldrv_info['internal'] = True else: arakoon_cacc_info['name'] = arakoon_metadata['cluster_name'] arakoon_cacc_info['internal'] = True # Verify StorageRouter downtime prerequisites = [] all_storagerouters = StorageRouterList.get_storagerouters() for storagerouter in all_storagerouters: try: SSHClient(endpoint=storagerouter, username='******') except UnableToConnectException: prerequisites.append(['node_down', storagerouter.name]) for key in ['framework', 'storagedriver']: if key not in information: information[key] = {'packages': {}, 'downtime': [], 'prerequisites': prerequisites, 'services_stop_start': set(), 'services_post_update': set()} for storagerouter in all_storagerouters: if key not in storagerouter.package_information: continue # Retrieve ALBA proxy issues alba_services = [] alba_downtime = [] for service in storagerouter.services: if service.type.name != ServiceType.SERVICE_TYPES.ALBA_PROXY or service.alba_proxy is None: continue alba_services.append(service.name) alba_downtime.append(['proxy', service.alba_proxy.storagedriver.vpool.name]) # Retrieve StorageDriver issues storagedriver_downtime = [] storagedriver_services = [] for sd in storagerouter.storagedrivers: # Order of services is important, first we want to stop all volume-drivers, then DTLs storagedriver_services.append('ovs-volumedriver_{0}'.format(sd.vpool.name)) for sd in storagerouter.storagedrivers: storagedriver_services.append('ovs-dtl_{0}'.format(sd.vpool.name)) if len(sd.vdisks_guids) > 0: storagedriver_downtime.append(['voldrv', sd.vpool.name]) # Retrieve the actual update information for package_name, package_info in storagerouter.package_information[key].iteritems(): if package_name not in UpdateController.all_core_packages: continue # Only gather information for the core packages information[key]['services_post_update'].update(package_info.pop('services_to_restart')) if package_name not in information[key]['packages']: information[key]['packages'][package_name] = {} information[key]['packages'][package_name].update(package_info) if package_name == 'openvstorage': if ['gui', None] not in information[key]['downtime']: information[key]['downtime'].append(['gui', None]) if ['api', None] not in information[key]['downtime']: information[key]['downtime'].append(['api', None]) information[key]['services_stop_start'].update({'watcher-framework', 'memcached'}) elif package_name == 'alba': for down in alba_downtime: if down not in information[key]['downtime']: information[key]['downtime'].append(down) information[key]['services_post_update'].update(alba_services) elif package_name == 'volumedriver-no-dedup-base': for down in storagedriver_downtime: if down not in information[key]['downtime']: information[key]['downtime'].append(down) information[key]['services_post_update'].update(storagedriver_services) elif package_name == 'volumedriver-no-dedup-server': for down in storagedriver_downtime: if down not in information[key]['downtime']: information[key]['downtime'].append(down) information[key]['services_post_update'].update(storagedriver_services) elif package_name == 'arakoon': if key == 'framework': framework_arakoons = set() if arakoon_ovs_info['internal'] is True: framework_arakoons.add('ovs-arakoon-{0}'.format(arakoon_ovs_info['name'])) if arakoon_cacc_info['internal'] is True: framework_arakoons.add('ovs-arakoon-{0}'.format(arakoon_cacc_info['name'])) information[key]['services_post_update'].update(framework_arakoons) if arakoon_ovs_info['down'] is True and ['ovsdb', None] not in information[key]['downtime']: information[key]['downtime'].append(['ovsdb', None]) elif arakoon_voldrv_info['internal'] is True: information[key]['services_post_update'].update({'ovs-arakoon-{0}'.format(arakoon_voldrv_info['name'])}) if arakoon_voldrv_info['down'] is True and ['voldrv', None] not in information[key]['downtime']: information[key]['downtime'].append(['voldrv', None]) return information
def get_update_information_alba_plugin(information): """ Called when the 'Update' button in the GUI is pressed This call collects additional information about the packages which can be updated Eg: * Downtime for Arakoons * Downtime for StorageDrivers * Prerequisites that haven't been met * Services which will be stopped during update * Services which will be restarted after update """ # Verify arakoon info arakoon_ovs_info = {'down': False, 'name': None, 'internal': False} arakoon_cacc_info = {'down': False, 'name': None, 'internal': False} for cluster in ['cacc', 'ovsdb']: cluster_name = ArakoonClusterConfig.get_cluster_name(cluster) if cluster_name is None: continue if cluster == 'cacc': arakoon_metadata = ArakoonInstaller.get_arakoon_metadata_by_cluster_name(cluster_name=cluster_name, filesystem=True, ip=System.get_my_storagerouter().ip) else: arakoon_metadata = ArakoonInstaller.get_arakoon_metadata_by_cluster_name(cluster_name=cluster_name) if arakoon_metadata['internal'] is True: config = ArakoonClusterConfig(cluster_id=cluster_name, filesystem=(cluster == 'cacc')) config.load_config(System.get_my_storagerouter().ip if cluster == 'cacc' else None) if cluster == 'ovsdb': arakoon_ovs_info['down'] = len(config.nodes) < 3 arakoon_ovs_info['name'] = arakoon_metadata['cluster_name'] arakoon_ovs_info['internal'] = True else: arakoon_cacc_info['name'] = arakoon_metadata['cluster_name'] arakoon_cacc_info['internal'] = True # Verify StorageRouter downtime fwk_prerequisites = [] all_storagerouters = StorageRouterList.get_storagerouters() for storagerouter in all_storagerouters: try: SSHClient(endpoint=storagerouter, username='******') except UnableToConnectException: fwk_prerequisites.append(['node_down', storagerouter.name]) # Verify ALBA node responsiveness alba_prerequisites = [] for alba_node in AlbaNodeList.get_albanodes(): try: alba_node.client.get_metadata() except Exception: alba_prerequisites.append(['alba_node_unresponsive', alba_node.ip]) for key in ['framework', 'alba']: if key not in information: information[key] = {'packages': {}, 'downtime': [], 'prerequisites': fwk_prerequisites if key == 'framework' else alba_prerequisites, 'services_stop_start': set(), 'services_post_update': set()} for storagerouter in StorageRouterList.get_storagerouters(): if key not in storagerouter.package_information: continue # Retrieve Arakoon issues arakoon_downtime = [] arakoon_services = [] for service in storagerouter.services: if service.type.name not in [ServiceType.SERVICE_TYPES.ALBA_MGR, ServiceType.SERVICE_TYPES.NS_MGR]: continue if service.type.name == ServiceType.SERVICE_TYPES.ALBA_MGR: cluster_name = AlbaController.get_abm_cluster_name(alba_backend=service.abm_service.alba_backend) else: cluster_name = AlbaController.get_nsm_cluster_name(alba_backend=service.nsm_service.alba_backend, number=service.nsm_service.number) if Configuration.exists('/ovs/arakoon/{0}/config'.format(cluster_name), raw=True) is False: continue arakoon_metadata = ArakoonInstaller.get_arakoon_metadata_by_cluster_name(cluster_name=cluster_name) if arakoon_metadata['internal'] is True: arakoon_services.append('ovs-{0}'.format(service.name)) config = ArakoonClusterConfig(cluster_id=cluster_name, filesystem=False) config.load_config() if len(config.nodes) < 3: if service.type.name == ServiceType.SERVICE_TYPES.NS_MGR: arakoon_downtime.append(['backend', service.nsm_service.alba_backend.name]) else: arakoon_downtime.append(['backend', service.abm_service.alba_backend.name]) for package_name, package_info in storagerouter.package_information[key].iteritems(): if package_name not in AlbaUpdateController.alba_plugin_packages: continue # Only gather information for the core packages information[key]['services_post_update'].update(package_info.pop('services_to_restart')) if package_name not in information[key]['packages']: information[key]['packages'][package_name] = {} information[key]['packages'][package_name].update(package_info) if package_name == 'openvstorage-backend': if ['gui', None] not in information[key]['downtime']: information[key]['downtime'].append(['gui', None]) if ['api', None] not in information[key]['downtime']: information[key]['downtime'].append(['api', None]) information[key]['services_stop_start'].update({'watcher-framework', 'memcached'}) elif package_name == 'alba': for down in arakoon_downtime: if down not in information[key]['downtime']: information[key]['downtime'].append(down) information[key]['services_post_update'].update(arakoon_services) elif package_name == 'arakoon': if key == 'framework': framework_arakoons = set() if arakoon_ovs_info['internal'] is True: framework_arakoons.add('ovs-arakoon-{0}'.format(arakoon_ovs_info['name'])) if arakoon_cacc_info['internal'] is True: framework_arakoons.add('ovs-arakoon-{0}'.format(arakoon_cacc_info['name'])) information[key]['services_post_update'].update(framework_arakoons) if arakoon_ovs_info['down'] is True and ['ovsdb', None] not in information[key]['downtime']: information[key]['downtime'].append(['ovsdb', None]) else: for down in arakoon_downtime: if down not in information[key]['downtime']: information[key]['downtime'].append(down) information[key]['services_post_update'].update(arakoon_services) for alba_node in AlbaNodeList.get_albanodes(): for package_name, package_info in alba_node.package_information.get(key, {}).iteritems(): if package_name not in AlbaUpdateController.sdm_packages: continue # Only gather information for the SDM packages information[key]['services_post_update'].update(package_info.pop('services_to_restart')) if package_name not in information[key]['packages']: information[key]['packages'][package_name] = {} information[key]['packages'][package_name].update(package_info) return information
def services_running(self, target): """ Check all services are running :param target: Target to check :return: Boolean """ try: key = 'ovs-watcher-{0}'.format(str(uuid.uuid4())) value = str(time.time()) if target == 'config': self.log_message(target, 'Testing configuration store...', 0) from ovs.extensions.generic.configuration import Configuration try: Configuration.list('/') except Exception as ex: self.log_message(target, ' Error during configuration store test: {0}'.format(ex), 2) return False if Configuration.get_store() == 'arakoon': from ovs.extensions.db.arakoon.configuration import ArakoonConfiguration from ovs.extensions.db.arakoon.ArakoonInstaller import ArakoonInstaller, ArakoonClusterConfig from ovs.extensions.db.arakoon.pyrakoon.pyrakoon.compat import NoGuarantee with open(ArakoonConfiguration.CACC_LOCATION) as config_file: contents = config_file.read() config = ArakoonClusterConfig(cluster_id='cacc', filesystem=True) config.read_config(contents) client = ArakoonInstaller.build_client(config) contents = client.get(ArakoonInstaller.INTERNAL_CONFIG_KEY, consistency=NoGuarantee()) if Watcher.LOG_CONTENTS != contents: try: config.read_config(contents) # Validate whether the contents are not corrupt except Exception as ex: self.log_message(target, ' Configuration stored in configuration store seems to be corrupt: {0}'.format(ex), 2) return False temp_filename = '{0}~'.format(ArakoonConfiguration.CACC_LOCATION) with open(temp_filename, 'w') as config_file: config_file.write(contents) config_file.flush() os.fsync(config_file) os.rename(temp_filename, ArakoonConfiguration.CACC_LOCATION) Watcher.LOG_CONTENTS = contents self.log_message(target, ' Configuration store OK', 0) return True if target == 'framework': # Volatile self.log_message(target, 'Testing volatile store...', 0) max_tries = 5 tries = 0 while tries < max_tries: try: try: logging.disable(logging.WARNING) from ovs.extensions.storage.volatilefactory import VolatileFactory VolatileFactory.store = None volatile = VolatileFactory.get_client() volatile.set(key, value) if volatile.get(key) == value: volatile.delete(key) break volatile.delete(key) finally: logging.disable(logging.NOTSET) except Exception as message: self.log_message(target, ' Error during volatile store test: {0}'.format(message), 2) key = 'ovs-watcher-{0}'.format(str(uuid.uuid4())) # Get another key time.sleep(1) tries += 1 if tries == max_tries: self.log_message(target, ' Volatile store not working correctly', 2) return False self.log_message(target, ' Volatile store OK after {0} tries'.format(tries), 0) # Persistent self.log_message(target, 'Testing persistent store...', 0) max_tries = 5 tries = 0 while tries < max_tries: try: try: logging.disable(logging.WARNING) persistent = PersistentFactory.get_client() persistent.set(key, value) if persistent.get(key) == value: persistent.delete(key) break persistent.delete(key) finally: logging.disable(logging.NOTSET) except Exception as message: self.log_message(target, ' Error during persistent store test: {0}'.format(message), 2) key = 'ovs-watcher-{0}'.format(str(uuid.uuid4())) # Get another key time.sleep(1) tries += 1 if tries == max_tries: self.log_message(target, ' Persistent store not working correctly', 2) return False self.log_message(target, ' Persistent store OK after {0} tries'.format(tries), 0) if target == 'volumedriver': # Arakoon, voldrv cluster self.log_message(target, 'Testing arakoon (voldrv)...', 0) max_tries = 5 tries = 0 while tries < max_tries: try: from ovs.extensions.generic.configuration import Configuration from ovs.extensions.storage.persistent.pyrakoonstore import PyrakoonStore cluster_name = str(Configuration.get('/ovs/framework/arakoon_clusters|voldrv')) client = PyrakoonStore(cluster=cluster_name) client.set(key, value) if client.get(key) == value: client.delete(key) break client.delete(key) except Exception as message: self.log_message(target, ' Error during arakoon (voldrv) test: {0}'.format(message), 2) key = 'ovs-watcher-{0}'.format(str(uuid.uuid4())) # Get another key time.sleep(1) tries += 1 if tries == max_tries: self.log_message(target, ' Arakoon (voldrv) not working correctly', 2) return False self.log_message(target, ' Arakoon (voldrv) OK', 0) if target in ['framework', 'volumedriver']: # RabbitMQ self.log_message(target, 'Test rabbitMQ...', 0) import pika from ovs.extensions.generic.configuration import Configuration messagequeue = Configuration.get('/ovs/framework/messagequeue') rmq_servers = messagequeue['endpoints'] good_node = False for server in rmq_servers: try: connection_string = '{0}://{1}:{2}@{3}/%2F'.format(messagequeue['protocol'], messagequeue['user'], messagequeue['password'], server) connection = pika.BlockingConnection(pika.URLParameters(connection_string)) channel = connection.channel() channel.basic_publish('', 'ovs-watcher', str(time.time()), pika.BasicProperties(content_type='text/plain', delivery_mode=1)) connection.close() good_node = True except Exception as message: self.log_message(target, ' Error during rabbitMQ test on node {0}: {1}'.format(server, message), 2) if good_node is False: self.log_message(target, ' No working rabbitMQ node could be found', 2) return False self.log_message(target, ' RabbitMQ test OK', 0) self.log_message(target, 'All tests OK', 0) return True except Exception as ex: self.log_message(target, 'Unexpected exception: {0}'.format(ex), 2) return False
def test_cluster_maintenance(self): """ Validates whether a cluster can be correctly created """ Configuration.set('/ovs/framework/hosts/1/ports', {'arakoon': [10000, 10100]}) Configuration.set('/ovs/framework/hosts/2/ports', {'arakoon': [20000, 20100]}) structure = Helper.build_service_structure({'storagerouters': [1, 2]}) storagerouters = structure['storagerouters'] System._machine_id = { storagerouters[1].ip: '1', storagerouters[2].ip: '2' } # Create new cluster mountpoint = storagerouters[1].disks[0].partitions[0].mountpoint if os.path.exists(mountpoint) and mountpoint != '/': shutil.rmtree(mountpoint) base_dir = mountpoint + '/test_create_cluster' info = ArakoonInstaller.create_cluster( 'test', ServiceType.ARAKOON_CLUSTER_TYPES.FWK, storagerouters[1].ip, base_dir) reality = Helper.extract_dir_structure(base_dir) expected = { 'dirs': { 'arakoon': { 'dirs': { 'test': { 'dirs': { 'tlogs': { 'dirs': {}, 'files': [] }, 'db': { 'dirs': {}, 'files': [] } }, 'files': [] } }, 'files': [] } }, 'files': [] } self.assertDictEqual(reality, expected) expected = '{0}\n\n{1}\n\n'.format( ArakoonInstallerTester.EXPECTED_CLUSTER_CONFIG.format( '1', 'test', ''), ArakoonInstallerTester.EXPECTED_NODE_CONFIG.format( '1', storagerouters[1].ip, 10000, base_dir, '1', 10001)) self.assertEqual( Configuration.get(ArakoonInstaller.CONFIG_KEY.format('test'), raw=True), expected) # @TODO: assert service availability here. It should be stopped ArakoonInstaller.start_cluster('test', storagerouters[1].ip, filesystem=False) # @TODO: assert the service is running config = ArakoonClusterConfig('test', filesystem=False) config.load_config(storagerouters[1].ip) client = ArakoonInstaller.build_client(config) reality = client.get(ArakoonInstaller.INTERNAL_CONFIG_KEY) self.assertEqual(reality, expected) self.assertFalse(client.exists(ArakoonInstaller.METADATA_KEY)) ArakoonInstaller.claim_cluster('test', storagerouters[1].ip, filesystem=False, metadata=info['metadata']) reality = json.loads(client.get(ArakoonInstaller.METADATA_KEY)) expected = { 'cluster_name': 'test', 'cluster_type': 'FWK', 'in_use': True, 'internal': True } self.assertDictEqual(reality, expected) # Extending cluster mountpoint = storagerouters[2].disks[0].partitions[0].mountpoint if os.path.exists(mountpoint) and mountpoint != '/': shutil.rmtree(mountpoint) base_dir2 = mountpoint + '/test_extend_cluster' ArakoonInstaller.extend_cluster(storagerouters[1].ip, storagerouters[2].ip, 'test', base_dir2) reality = Helper.extract_dir_structure(base_dir) expected = { 'dirs': { 'arakoon': { 'dirs': { 'test': { 'dirs': { 'tlogs': { 'dirs': {}, 'files': [] }, 'db': { 'dirs': {}, 'files': [] } }, 'files': [] } }, 'files': [] } }, 'files': [] } self.assertDictEqual(reality, expected) expected = '{0}\n\n{1}\n\n{2}\n\n'.format( ArakoonInstallerTester.EXPECTED_CLUSTER_CONFIG.format( '1,2', 'test', ''), ArakoonInstallerTester.EXPECTED_NODE_CONFIG.format( '1', storagerouters[1].ip, 10000, base_dir, '1', 10001), ArakoonInstallerTester.EXPECTED_NODE_CONFIG.format( '2', storagerouters[2].ip, 20000, base_dir2, '2', 20001)) self.assertEqual( Configuration.get(ArakoonInstaller.CONFIG_KEY.format('test'), raw=True), expected) # @TODO: assert service availability here. It should be stopped catchup_command = 'arakoon --node 2 -config file://opt/OpenvStorage/config/framework.json?key=/ovs/arakoon/test/config -catchup-only' SSHClient._run_returns[catchup_command] = None SSHClient._run_recordings = [] ArakoonInstaller.restart_cluster_add('test', [storagerouters[1].ip], storagerouters[2].ip, filesystem=False) self.assertIn(catchup_command, SSHClient._run_recordings) # @TODO: assert the service is running config = ArakoonClusterConfig('test', filesystem=False) config.load_config(storagerouters[2].ip) client = ArakoonInstaller.build_client(config) reality = client.get(ArakoonInstaller.INTERNAL_CONFIG_KEY) self.assertEqual(reality, expected) reality = json.loads(client.get(ArakoonInstaller.METADATA_KEY)) expected = { 'cluster_name': 'test', 'cluster_type': 'FWK', 'in_use': True, 'internal': True } self.assertDictEqual(reality, expected) # Shrinking cluster ArakoonInstaller.shrink_cluster(storagerouters[1].ip, storagerouters[2].ip, 'test') reality = Helper.extract_dir_structure(base_dir) expected = { 'dirs': { 'arakoon': { 'dirs': { 'test': { 'dirs': {}, 'files': [] } }, 'files': [] } }, 'files': [] } self.assertDictEqual(reality, expected) expected = '{0}\n\n{1}\n\n'.format( ArakoonInstallerTester.EXPECTED_CLUSTER_CONFIG.format( '2', 'test', ''), ArakoonInstallerTester.EXPECTED_NODE_CONFIG.format( '2', storagerouters[2].ip, 20000, base_dir2, '2', 20001)) self.assertEqual( Configuration.get(ArakoonInstaller.CONFIG_KEY.format('test'), raw=True), expected) # @TODO: assert service availability here. It should be stopped ArakoonInstaller.restart_cluster_remove('test', [storagerouters[2].ip], filesystem=False) # @TODO: assert the service is running config = ArakoonClusterConfig('test', filesystem=False) config.load_config(storagerouters[2].ip) client = ArakoonInstaller.build_client(config) reality = client.get(ArakoonInstaller.INTERNAL_CONFIG_KEY) self.assertEqual(reality, expected) reality = json.loads(client.get(ArakoonInstaller.METADATA_KEY)) expected = { 'cluster_name': 'test', 'cluster_type': 'FWK', 'in_use': True, 'internal': True } self.assertDictEqual(reality, expected)
def promote_node(cluster_ip, master_ip, ip_client_map, unique_id, configure_memcached, configure_rabbitmq): """ Promotes a given node """ from ovs.dal.lists.storagerouterlist import StorageRouterList from ovs.dal.lists.servicetypelist import ServiceTypeList from ovs.dal.lists.servicelist import ServiceList from ovs.dal.hybrids.service import Service Toolbox.log(logger=NodeTypeController._logger, messages='Promoting node', title=True) if configure_memcached is True: if NodeTypeController._validate_local_memcache_servers(ip_client_map) is False: raise RuntimeError('Not all memcache nodes can be reached which is required for promoting a node.') target_client = ip_client_map[cluster_ip] machine_id = System.get_my_machine_id(target_client) node_name, _ = target_client.get_hostname() master_client = ip_client_map[master_ip] storagerouter = StorageRouterList.get_by_machine_id(unique_id) storagerouter.node_type = 'MASTER' storagerouter.save() external_config = Configuration.get('/ovs/framework/external_config') if external_config is None: config_store = Configuration.get_store() if config_store == 'arakoon': Toolbox.log(logger=NodeTypeController._logger, messages='Joining Arakoon configuration cluster') metadata = ArakoonInstaller.extend_cluster(master_ip=master_ip, new_ip=cluster_ip, cluster_name='config', base_dir=Configuration.get('/ovs/framework/paths|ovsdb'), ports=[26400, 26401], filesystem=True) ArakoonInstaller.restart_cluster_add(cluster_name='config', current_ips=metadata['ips'], new_ip=cluster_ip, filesystem=True) ServiceManager.register_service(node_name=machine_id, service_metadata=metadata['service_metadata']) else: from ovs.extensions.db.etcd.installer import EtcdInstaller Toolbox.log(logger=NodeTypeController._logger, messages='Joining Etcd cluster') EtcdInstaller.extend_cluster(master_ip, cluster_ip, 'config') # Find other (arakoon) master nodes arakoon_cluster_name = str(Configuration.get('/ovs/framework/arakoon_clusters|ovsdb')) arakoon_metadata = ArakoonInstaller.get_arakoon_metadata_by_cluster_name(cluster_name=arakoon_cluster_name) config = ArakoonClusterConfig(cluster_id=arakoon_cluster_name, filesystem=False) config.load_config() master_node_ips = [node.ip for node in config.nodes] if cluster_ip in master_node_ips: master_node_ips.remove(cluster_ip) if len(master_node_ips) == 0: raise RuntimeError('There should be at least one other master node') arakoon_ports = [] if arakoon_metadata['internal'] is True: Toolbox.log(logger=NodeTypeController._logger, messages='Joining Arakoon OVS DB cluster') result = ArakoonInstaller.extend_cluster(master_ip=master_ip, new_ip=cluster_ip, cluster_name=arakoon_cluster_name, base_dir=Configuration.get('/ovs/framework/paths|ovsdb')) ArakoonInstaller.restart_cluster_add(cluster_name=arakoon_cluster_name, current_ips=result['ips'], new_ip=cluster_ip, filesystem=False) arakoon_ports = [result['client_port'], result['messaging_port']] if configure_memcached is True: NodeTypeController.configure_memcached(client=target_client, logger=NodeTypeController._logger) NodeTypeController.add_services(client=target_client, node_type='master', logger=NodeTypeController._logger) Toolbox.log(logger=NodeTypeController._logger, messages='Update configurations') if configure_memcached is True: endpoints = Configuration.get('/ovs/framework/memcache|endpoints') endpoint = '{0}:11211'.format(cluster_ip) if endpoint not in endpoints: endpoints.append(endpoint) Configuration.set('/ovs/framework/memcache|endpoints', endpoints) if configure_rabbitmq is True: endpoints = Configuration.get('/ovs/framework/messagequeue|endpoints') endpoint = '{0}:5672'.format(cluster_ip) if endpoint not in endpoints: endpoints.append(endpoint) Configuration.set('/ovs/framework/messagequeue|endpoints', endpoints) if arakoon_metadata['internal'] is True: Toolbox.log(logger=NodeTypeController._logger, messages='Restarting master node services') ArakoonInstaller.restart_cluster_add(cluster_name=arakoon_cluster_name, current_ips=master_node_ips, new_ip=cluster_ip, filesystem=False) PersistentFactory.store = None VolatileFactory.store = None if 'arakoon-ovsdb' not in [s.name for s in ServiceList.get_services() if s.is_internal is False or s.storagerouter.ip == cluster_ip]: service = Service() service.name = 'arakoon-ovsdb' service.type = ServiceTypeList.get_by_name(ServiceType.SERVICE_TYPES.ARAKOON) service.ports = arakoon_ports service.storagerouter = storagerouter service.save() if configure_rabbitmq is True: NodeTypeController.configure_rabbitmq(client=target_client, logger=NodeTypeController._logger) # Copy rabbitmq cookie rabbitmq_cookie_file = '/var/lib/rabbitmq/.erlang.cookie' Toolbox.log(logger=NodeTypeController._logger, messages='Copying Rabbit MQ cookie') contents = master_client.file_read(rabbitmq_cookie_file) master_hostname, _ = master_client.get_hostname() target_client.dir_create(os.path.dirname(rabbitmq_cookie_file)) target_client.file_write(rabbitmq_cookie_file, contents) target_client.file_chmod(rabbitmq_cookie_file, mode=400) target_client.run(['rabbitmq-server', '-detached']) time.sleep(5) target_client.run(['rabbitmqctl', 'stop_app']) time.sleep(5) target_client.run(['rabbitmqctl', 'join_cluster', 'rabbit@{0}'.format(master_hostname)]) time.sleep(5) target_client.run(['rabbitmqctl', 'stop']) time.sleep(5) # Enable HA for the rabbitMQ queues Toolbox.change_service_state(target_client, 'rabbitmq-server', 'start', NodeTypeController._logger) NodeTypeController.check_rabbitmq_and_enable_ha_mode(client=target_client, logger=NodeTypeController._logger) NodeTypeController._configure_amqp_to_volumedriver() Toolbox.log(logger=NodeTypeController._logger, messages='Starting services') services = ['memcached', 'arakoon-ovsdb', 'rabbitmq-server', 'etcd-config'] if arakoon_metadata['internal'] is True: services.remove('arakoon-ovsdb') for service in services: if ServiceManager.has_service(service, client=target_client): Toolbox.change_service_state(target_client, service, 'start', NodeTypeController._logger) Toolbox.log(logger=NodeTypeController._logger, messages='Restarting services') NodeTypeController.restart_framework_and_memcache_services(clients=ip_client_map, logger=NodeTypeController._logger) if Toolbox.run_hooks(component='nodetype', sub_component='promote', logger=NodeTypeController._logger, cluster_ip=cluster_ip, master_ip=master_ip): Toolbox.log(logger=NodeTypeController._logger, messages='Restarting services') NodeTypeController.restart_framework_and_memcache_services(clients=ip_client_map, logger=NodeTypeController._logger) if NodeTypeController.avahi_installed(client=target_client, logger=NodeTypeController._logger) is True: NodeTypeController.configure_avahi(client=target_client, node_name=node_name, node_type='master', logger=NodeTypeController._logger) Configuration.set('/ovs/framework/hosts/{0}/type'.format(machine_id), 'MASTER') target_client.run(['chown', '-R', 'ovs:ovs', '/opt/OpenvStorage/config']) Configuration.set('/ovs/framework/hosts/{0}/promotecompleted'.format(machine_id), True) if target_client.file_exists('/tmp/ovs_rollback'): target_client.file_delete('/tmp/ovs_rollback') Toolbox.log(logger=NodeTypeController._logger, messages='Promote complete')
def promote_or_demote_node(node_action, cluster_ip=None, execute_rollback=False): """ Promotes or demotes the local node :param node_action: Demote or promote :type node_action: str :param cluster_ip: IP of node to promote or demote :type cluster_ip: str :param execute_rollback: In case of failure revert the changes made :type execute_rollback: bool :return: None """ if node_action not in ('promote', 'demote'): raise ValueError('Nodes can only be promoted or demoted') Toolbox.log(logger=NodeTypeController._logger, messages='Open vStorage Setup - {0}'.format(node_action.capitalize()), boxed=True) try: Toolbox.log(logger=NodeTypeController._logger, messages='Collecting information', title=True) machine_id = System.get_my_machine_id() if Configuration.get('/ovs/framework/hosts/{0}/setupcompleted'.format(machine_id)) is False: raise RuntimeError('No local OVS setup found.') if cluster_ip and not re.match(Toolbox.regex_ip, cluster_ip): raise RuntimeError('Incorrect IP provided ({0})'.format(cluster_ip)) if cluster_ip: client = SSHClient(endpoint=cluster_ip) machine_id = System.get_my_machine_id(client) node_type = Configuration.get('/ovs/framework/hosts/{0}/type'.format(machine_id)) if node_action == 'promote' and node_type == 'MASTER': raise RuntimeError('This node is already master.') elif node_action == 'demote' and node_type == 'EXTRA': raise RuntimeError('This node should be a master.') elif node_type not in ['MASTER', 'EXTRA']: raise RuntimeError('This node is not correctly configured.') master_ip = None offline_nodes = [] online = True target_client = None if node_action == 'demote' and cluster_ip: # Demote an offline node from ovs.dal.lists.storagerouterlist import StorageRouterList from ovs.lib.storagedriver import StorageDriverController ip = cluster_ip unique_id = None ip_client_map = {} for storage_router in StorageRouterList.get_storagerouters(): try: client = SSHClient(storage_router.ip, username='******') if storage_router.node_type == 'MASTER': master_ip = storage_router.ip ip_client_map[storage_router.ip] = client except UnableToConnectException: if storage_router.ip == cluster_ip: online = False unique_id = storage_router.machine_id StorageDriverController.mark_offline(storagerouter_guid=storage_router.guid) offline_nodes.append(storage_router) if online is True: raise RuntimeError("If the node is online, please use 'ovs setup demote' executed on the node you wish to demote") if master_ip is None: raise RuntimeError('Failed to retrieve another responsive MASTER node') else: target_password = Toolbox.ask_validate_password(ip='127.0.0.1', logger=NodeTypeController._logger) target_client = SSHClient('127.0.0.1', username='******', password=target_password) unique_id = System.get_my_machine_id(target_client) ip = Configuration.get('/ovs/framework/hosts/{0}/ip'.format(unique_id)) storagerouter_info = NodeTypeController.retrieve_storagerouter_info_via_host(ip=target_client.ip, password=target_password) node_ips = [sr_info['ip'] for sr_info in storagerouter_info.itervalues()] master_node_ips = [sr_info['ip'] for sr_info in storagerouter_info.itervalues() if sr_info['type'] == 'master' and sr_info['ip'] != ip] if len(master_node_ips) == 0: if node_action == 'promote': raise RuntimeError('No master node could be found') else: raise RuntimeError('It is not possible to remove the only master') master_ip = master_node_ips[0] ip_client_map = dict((node_ip, SSHClient(node_ip, username='******')) for node_ip in node_ips) if node_action == 'demote': for cluster_name in Configuration.list('/ovs/arakoon'): config = ArakoonClusterConfig(cluster_name, False) config.load_config() arakoon_client = ArakoonInstaller.build_client(config) metadata = json.loads(arakoon_client.get(ArakoonInstaller.METADATA_KEY)) if len(config.nodes) == 1 and config.nodes[0].ip == ip and metadata.get('internal') is True: raise RuntimeError('Demote is not supported when single node Arakoon cluster(s) are present on the node to be demoted.') configure_rabbitmq = Toolbox.is_service_internally_managed(service='rabbitmq') configure_memcached = Toolbox.is_service_internally_managed(service='memcached') if node_action == 'promote': try: NodeTypeController.promote_node(cluster_ip=ip, master_ip=master_ip, ip_client_map=ip_client_map, unique_id=unique_id, configure_memcached=configure_memcached, configure_rabbitmq=configure_rabbitmq) except Exception: if execute_rollback is True: NodeTypeController.demote_node(cluster_ip=ip, master_ip=master_ip, ip_client_map=ip_client_map, unique_id=unique_id, unconfigure_memcached=configure_memcached, unconfigure_rabbitmq=configure_rabbitmq, offline_nodes=offline_nodes) elif target_client is not None: target_client.file_write('/tmp/ovs_rollback', 'demote') raise else: try: NodeTypeController.demote_node(cluster_ip=ip, master_ip=master_ip, ip_client_map=ip_client_map, unique_id=unique_id, unconfigure_memcached=configure_memcached, unconfigure_rabbitmq=configure_rabbitmq, offline_nodes=offline_nodes) except Exception: if execute_rollback is True: NodeTypeController.promote_node(cluster_ip=ip, master_ip=master_ip, ip_client_map=ip_client_map, unique_id=unique_id, configure_memcached=configure_memcached, configure_rabbitmq=configure_rabbitmq) elif target_client is not None: target_client.file_write('/tmp/ovs_rollback', 'promote') raise Toolbox.log(logger=NodeTypeController._logger, messages='\n') Toolbox.log(logger=NodeTypeController._logger, messages='{0} complete.'.format(node_action.capitalize()), boxed=True) except Exception as exception: Toolbox.log(logger=NodeTypeController._logger, messages='\n') Toolbox.log(logger=NodeTypeController._logger, messages=['An unexpected error occurred:', str(exception)], boxed=True, loglevel='exception') sys.exit(1) except KeyboardInterrupt: Toolbox.log(logger=NodeTypeController._logger, messages='\n') Toolbox.log(logger=NodeTypeController._logger, messages='This setup was aborted. Open vStorage may be in an inconsistent state, make sure to validate the installation.', boxed=True, loglevel='error') sys.exit(1)
def collapse_arakoon(): """ Collapse Arakoon's Tlogs :return: None """ ScheduledTaskController._logger.info('Starting arakoon collapse') storagerouters = StorageRouterList.get_storagerouters() cluster_info = [('cacc', storagerouters[0], True)] cluster_names = [] for service in ServiceList.get_services(): if service.is_internal is True and service.type.name in ( ServiceType.SERVICE_TYPES.ARAKOON, ServiceType.SERVICE_TYPES.NS_MGR, ServiceType.SERVICE_TYPES.ALBA_MGR): cluster = service.name.replace('arakoon-', '') if cluster in cluster_names: continue cluster_names.append(cluster) cluster_info.append((cluster, service.storagerouter, False)) workload = {} for cluster, storagerouter, filesystem in cluster_info: ScheduledTaskController._logger.debug( ' Collecting info for cluster {0}'.format(cluster)) config = ArakoonClusterConfig(cluster, filesystem=filesystem) config.load_config(storagerouter.ip) for node in config.nodes: if node.ip not in workload: workload[node.ip] = {'node_id': node.name, 'clusters': []} workload[node.ip]['clusters'].append((cluster, filesystem)) for storagerouter in storagerouters: try: if storagerouter.ip not in workload: continue node_workload = workload[storagerouter.ip] client = SSHClient(storagerouter) for cluster, filesystem in node_workload['clusters']: try: ScheduledTaskController._logger.debug( ' Collapsing cluster {0} on {1}'.format( cluster, storagerouter.ip)) if filesystem is True: config_path = ArakoonClusterConfig.CONFIG_FILE.format( cluster) else: config_path = Configuration.get_configuration_path( ArakoonClusterConfig.CONFIG_KEY.format( cluster)) client.run([ 'arakoon', '--collapse-local', node_workload['node_id'], '2', '-config', config_path ]) ScheduledTaskController._logger.info( ' Collapsing cluster {0} on {1} completed'.format( cluster, storagerouter.ip)) except: ScheduledTaskController._logger.exception( ' Collapsing cluster {0} on {1} failed'.format( cluster, storagerouter.ip)) except UnableToConnectException: ScheduledTaskController._logger.error( ' Could not collapse any cluster on {0} (not reachable)'. format(storagerouter.name)) ScheduledTaskController._logger.info('Arakoon collapse finished')