def register(node_id):
    """
    Adds a Node with a given node_id to the model
    :param node_id: ID of the ALBA node
    :type node_id: str
    :return: None
    """
    node = AlbaNodeList.get_albanode_by_node_id(node_id)
    if node is None:
        # Node not modeled yet: build it from the configuration stored in Etcd by the ASD manager
        main_config = EtcdConfiguration.get('/ovs/alba/asdnodes/{0}/config/main'.format(node_id))
        node = AlbaNode()
        node.ip = main_config['ip']
        node.port = main_config['port']
        node.username = main_config['username']
        node.password = main_config['password']
        # Link to a StorageRouter if one is modeled with the same IP (may be None)
        node.storagerouter = StorageRouterList.get_by_ip(main_config['ip'])
    # Validate that the credentials work and that we are talking to the expected node
    data = node.client.get_metadata()
    if data['_success'] is False and data['_error'] == 'Invalid credentials':
        raise RuntimeError('Invalid credentials')
    if data['node_id'] != node_id:
        AlbaNodeController._logger.error('Unexpected node_id: {0} vs {1}'.format(data['node_id'], node_id))
        raise RuntimeError('Unexpected node identifier')
    node.node_id = node_id
    node.type = 'ASD'
    node.save()
    # increase maintenance agents count for all nodes by 1
    for backend in AlbaBackendList.get_albabackends():
        nr_of_agents_key = AlbaNodeController.NR_OF_AGENTS_ETCD_TEMPLATE.format(backend.guid)
        if EtcdConfiguration.exists(nr_of_agents_key):
            EtcdConfiguration.set(nr_of_agents_key, int(EtcdConfiguration.get(nr_of_agents_key) + 1))
        else:
            EtcdConfiguration.set(nr_of_agents_key, 1)
    # Re-deploy/balance the maintenance agents with the updated counts
    AlbaNodeController.checkup_maintenance_agents()
def restart_slot(node_guid, slot_id):
    """
    Restarts a slot
    :param node_guid: Guid of the ALBA Node to restart a slot on
    :type node_guid: str
    :param slot_id: ID of the slot (eg. pci-0000:03:00.0-sas-0x5000c29f4cf04566-lun-0)
    :type slot_id: str
    :raises RuntimeError: When the slot is unknown to the node or the restart fails
    :return: None
    :rtype: NoneType
    """
    node = AlbaNode(node_guid)
    AlbaNodeController._logger.debug('Restarting slot {0} on node {1}'.format(slot_id, node.ip))
    try:
        if slot_id not in node.client.get_stack():
            # Fix: use 'error' instead of 'exception' - no exception is active at this point,
            # so Logger.exception would log a bogus 'None' traceback
            AlbaNodeController._logger.error('Slot {0} not available for restart on ALBA Node {1}'.format(slot_id, node.ip))
            raise RuntimeError('Could not find slot')
    except (requests.ConnectionError, requests.Timeout):
        AlbaNodeController._logger.warning('Could not connect to node {0} to validate slot'.format(node.guid))
        raise
    result = node.client.restart_slot(slot_id=slot_id)
    if result['_success'] is False:
        raise RuntimeError('Error restarting slot: {0}'.format(result['_error']))
    # The stack of every backend may reference this slot, so refresh their dynamic caches
    for backend in AlbaBackendList.get_albabackends():
        backend.invalidate_dynamics()
def restart_disk(node_guid, disk):
    """
    Restarts a disk
    :param node_guid: Guid of the node to restart a disk of
    :type node_guid: str
    :param disk: Disk name to be restarted
    :type disk: str
    :raises RuntimeError: When the disk is unknown to the node or the restart fails
    :return: None
    """
    node = AlbaNode(node_guid)
    AlbaNodeController._logger.debug('Restarting disk {0} at node {1}'.format(disk, node.ip))
    try:
        disks = node.client.get_disks()
        if disk not in disks:
            # Fix: use 'error' instead of 'exception' - no exception is active at this point,
            # so Logger.exception would log a bogus 'None' traceback
            AlbaNodeController._logger.error('Disk {0} not available for restart on node {1}'.format(disk, node.ip))
            raise RuntimeError('Could not find disk')
    except (requests.ConnectionError, requests.Timeout):
        AlbaNodeController._logger.warning('Could not connect to node {0} to validate disk'.format(node.guid))
        raise
    result = node.client.restart_disk(disk)
    if result['_success'] is False:
        raise RuntimeError('Error restarting disk: {0}'.format(result['_error']))
    # Refresh backend dynamic caches since their stacks may reference this disk
    for backend in AlbaBackendList.get_albabackends():
        backend.invalidate_dynamics()
def _all_disks(self):
    """
    Returns a live list of all disks on this node.
    When the node cannot be reached, a synthetic 'nodedown' entry is built for every
    non-decommissioned OSD that any backend reports for this node.
    """
    try:
        return self.client.get_disks(reraise=True)
    except (requests.ConnectionError, requests.Timeout):
        from ovs.dal.lists.albabackendlist import AlbaBackendList
        fallback_disks = []
        seen_asd_ids = []
        for backend in AlbaBackendList.get_albabackends():  # All backends of this node
            arakoon_config = 'etcd://127.0.0.1:2379/ovs/arakoon/{0}-abm/config'.format(backend.name)
            for osd in AlbaCLI.run('list-all-osds', config=arakoon_config, as_json=True):
                asd_id = osd.get('long_id')
                belongs_here = osd.get('node_id') == self.node_id
                if not belongs_here or asd_id in seen_asd_ids or osd.get('decommissioned') is not False:
                    continue
                seen_asd_ids.append(asd_id)
                fallback_disks.append({'asd_id': asd_id,
                                       'node_id': osd.get('node_id'),
                                       'port': osd.get('port'),
                                       'ips': osd.get('ips'),
                                       'available': False,
                                       'state': {'state': 'error', 'detail': 'nodedown'},
                                       'log_level': 'info',
                                       'device': asd_id,
                                       'home': asd_id,
                                       'mountpoint': asd_id,
                                       'name': asd_id,
                                       'usage': {'available': 0, 'size': 0, 'used': 0}})
        return fallback_disks
def restart_disk(node_guid, device_alias):
    """
    Restarts a disk
    :param node_guid: Guid of the node to restart a disk of
    :type node_guid: str
    :param device_alias: Alias of the device to restart (eg: /dev/disk/by-path/pci-0000:03:00.0-sas-0x5000c29f4cf04566-lun-0)
    :type device_alias: str
    :raises RuntimeError: When the disk is unknown to the node or the restart fails
    :return: None
    """
    node = AlbaNode(node_guid)
    device_id = device_alias.split('/')[-1]
    AlbaNodeController._logger.debug('Restarting disk {0} at node {1}'.format(device_alias, node.ip))
    try:
        if device_id not in node.client.get_disks():
            # Fix: use 'error' instead of 'exception' - no exception is active at this point,
            # so Logger.exception would log a bogus 'None' traceback
            AlbaNodeController._logger.error('Disk {0} not available for restart on node {1}'.format(device_alias, node.ip))
            raise RuntimeError('Could not find disk')
    except (requests.ConnectionError, requests.Timeout):
        AlbaNodeController._logger.warning('Could not connect to node {0} to validate disk'.format(node.guid))
        raise
    result = node.client.restart_disk(disk_id=device_id)
    if result['_success'] is False:
        raise RuntimeError('Error restarting disk: {0}'.format(result['_error']))
    # Refresh backend dynamic caches since their stacks may reference this disk
    for backend in AlbaBackendList.get_albabackends():
        backend.invalidate_dynamics()
def remove_node(node_guid):
    """
    Removes an ALBA node
    :param node_guid: Guid of the ALBA node to remove
    :type node_guid: str
    :return: None
    :rtype: NoneType
    """
    node = AlbaNode(node_guid)
    if node.type == AlbaNode.NODE_TYPES.ASD:
        # First clean up all OSDs and slots of ASD-type nodes
        for slot_id, slot_info in node.stack.iteritems():
            for osd_id, osd_info in slot_info['osds'].iteritems():
                # Only remove OSDs that are actually modeled
                if AlbaOSDList.get_by_osd_id(osd_id=osd_id) is not None:
                    AlbaNodeController.remove_osd(node_guid=node.guid, osd_id=osd_id, expected_safety=None)
            if slot_info['available'] is False:
                AlbaNodeController.remove_slot(node_guid=node.guid, slot_id=slot_id)
    # Map backend names to guids so maintenance services can be removed per backend
    name_guid_map = dict((alba_backend.name, alba_backend.guid) for alba_backend in AlbaBackendList.get_albabackends())
    try:
        # This loop will delete the services AND their configuration from the configuration management
        node.invalidate_dynamics('maintenance_services')
        for alba_backend_name, service_info in node.maintenance_services.iteritems():
            for service_name, status in service_info:
                node.client.remove_maintenance_service(name=service_name, alba_backend_guid=name_guid_map.get(alba_backend_name))
    except (requests.ConnectionError, requests.Timeout):
        # Node unreachable: removal proceeds, services are left behind on the node
        AlbaNodeController._logger.exception('Could not connect to node {0} to retrieve the maintenance services'.format(node.guid))
    except InvalidCredentialsError:
        AlbaNodeController._logger.warning('Failed to retrieve the maintenance services for ALBA node {0}'.format(node.node_id))
    node.delete()
    # Refresh the (cached) live status of every backend and its generic backend counterpart
    for alba_backend in AlbaBackendList.get_albabackends():
        alba_backend.invalidate_dynamics(['live_status'])
        alba_backend.backend.invalidate_dynamics(['live_status'])
    # Rebalance maintenance agents asynchronously (celery task)
    AlbaController.checkup_maintenance_agents.delay()
def remove_disk(node_guid, device_alias):
    """
    Removes a disk
    :param node_guid: Guid of the node to remove a disk from
    :type node_guid: str
    :param device_alias: Alias of the device to remove (eg: /dev/disk/by-path/pci-0000:03:00.0-sas-0x5000c29f4cf04566-lun-0)
    :type device_alias: str
    :return: None
    """
    asds = {}
    node = AlbaNode(node_guid)
    node_id = node.node_id
    device_id = device_alias.split('/')[-1]
    offline_node = False

    # Verify client connectivity
    try:
        _ = node.client.get_disks()
    except (requests.ConnectionError, requests.Timeout, InvalidCredentialsError):
        AlbaNodeController._logger.warning('Could not connect to node {0} to validate disks'.format(node.guid))
        offline_node = True

    # Retrieve ASD information for the ALBA Disk
    for backend in AlbaBackendList.get_albabackends():
        local_stack = backend.local_stack
        if node_id in local_stack and device_id in local_stack[node_id]:
            asds.update(local_stack[node_id][device_id]['asds'])
    # Refuse removal while ASDs are still in use (unless the node is down and they are marked nodedown)
    for asd_info in asds.values():
        if (offline_node is False and asd_info.get('status') != 'available') or (offline_node is True and asd_info.get('status_detail') == 'nodedown'):
            AlbaNodeController._logger.error('Disk {0} has still non-available ASDs on node {1}'.format(device_alias, node.ip))
            raise RuntimeError('Disk {0} on ALBA node {1} has still some non-available ASDs'.format(device_alias, node_id))

    # Retrieve the Disk from the framework model matching the ALBA Disk
    disk_to_clear = None
    for disk in DiskList.get_disks():
        if device_alias in disk.aliases:
            disk_to_clear = disk
            break

    # Remove the ALBA Disk making use of the ASD Manager Client
    # NOTE(review): if no framework Disk matches the alias, disk_to_clear is None and the
    # attribute access below raises AttributeError - presumably a matching Disk always exists; verify
    if offline_node is False:
        result = node.client.remove_disk(disk_id=device_id, partition_aliases=disk_to_clear.partitions[0].aliases if len(disk_to_clear.partitions) > 0 else [])
        if result['_success'] is False:
            raise RuntimeError('Error removing disk {0}: {1}'.format(device_alias, result['_error']))

    # Clean the model
    for model_disk in node.disks:
        if device_alias in model_disk.aliases:
            for osd in model_disk.osds:
                osd.delete()
            model_disk.delete()
    if disk_to_clear is not None:
        # Strip roles/mountpoint from the framework partitions so the disk can be reused
        for partition in disk_to_clear.partitions:
            partition.roles = []
            partition.mountpoint = None
            partition.save()
    node.invalidate_dynamics()
    if node.storagerouter is not None:
        DiskController.sync_with_reality(storagerouter_guid=node.storagerouter_guid)
def get_albabackends():
    """
    Fetches all the alba backends on the cluster
    :return: alba backends
    :rtype: list
    """
    all_backends = AlbaBackendList.get_albabackends()
    return all_backends
def get_stats_nsms(cls):
    """
    Retrieve the amount of NSMs deployed and their statistics
    :return: tuple (errors occurred, list of measurement dicts for the stats collector)
    """
    if cls._config is None:
        cls.validate_and_retrieve_config()
    stats = []
    errors = False
    environment = cls._config['environment']
    for alba_backend in AlbaBackendList.get_albabackends():
        # One 'nsm' measurement per NSM cluster, reporting its load
        for nsm in alba_backend.nsm_clusters:
            stats.append({'tags': {'nsm_number': nsm.number,
                                   'environment': environment,
                                   'backend_name': alba_backend.name,
                                   'abm_service_name': alba_backend.abm_cluster.name},
                          'fields': {'load': float(AlbaArakoonController.get_load(nsm))},
                          'measurement': 'nsm'})
        config_path = Configuration.get_configuration_path(alba_backend.abm_cluster.config_location)
        try:
            # Query ALBA itself for per-NSM-host statistics
            nsm_host_ids = [nsm_host['id'] for nsm_host in AlbaCLI.run(command='list-nsm-hosts', config=config_path)]
            nsm_hosts_statistics = AlbaCLI.run(command='nsm-hosts-statistics', config=config_path, named_params={'nsm-hosts': ','.join(nsm_host_ids)})
            for nsm_host_id, statistics in nsm_hosts_statistics.iteritems():
                stats.append({'tags': {'nsm_name': nsm_host_id,
                                       'environment': environment,
                                       'backend_name': alba_backend.name},
                              'fields': cls._convert_to_float_values(statistics['statistics']),
                              'measurement': 'nsm_statistic'})
        except Exception:
            # Best effort per backend: flag the failure but keep collecting the others
            errors = True
            cls._logger.exception('Retrieving NSM statistics for ALBA Backend {0} failed'.format(alba_backend.name))
    return errors, stats
def get_by_name(name):
    """
    Retrieve an ALBA backend object based on its name
    :param name: Name of the ALBA backend
    :type name: str
    :return: ALBA backend or None
    :rtype: AlbaBackend
    """
    matches = (backend for backend in AlbaBackendList.get_albabackends() if backend.name == name)
    return next(matches, None)
def get_stats_osds(cls):
    """
    Retrieve the OSD statistics for all ALBA Backends
    :return: tuple (False, list of measurement dicts); raises instead of returning True on errors
    """
    def _get_stats_osds_for_alba_backend(alba_backend, statistics, errored_calls):
        # Worker: appends one 'asd' measurement per OSD of the given backend.
        # Appending to the shared lists is safe here thanks to the GIL.
        try:
            for osd_id, result in alba_backend.osd_statistics.iteritems():
                # Remove the 'version' key as it is a non-numeric value
                result.pop('version', None)
                statistics.append({'tags': {'guid': alba_backend.guid,
                                            'long_id': osd_id,
                                            'environment': environment,
                                            'backend_name': alba_backend.name},
                                   'fields': cls._convert_to_float_values(result),
                                   'measurement': 'asd'})
        except Exception:
            errored_calls.append(alba_backend.name)
            cls._logger.exception('Retrieving OSD statistics failed for ALBA Backend {0}'.format(alba_backend.name))

    if cls._config is None:
        cls.validate_and_retrieve_config()

    stats = []
    errors = []
    threads = []
    environment = cls._config['environment']
    # Fan out one thread per backend; osd_statistics involves remote calls
    for ab in AlbaBackendList.get_albabackends():
        thread = Thread(name=ab.name, target=_get_stats_osds_for_alba_backend, args=(ab, stats, errors))
        thread.start()
        threads.append(thread)
    for thr in threads:
        # NOTE(review): after the 20s timeout a worker may still be running and
        # appending to 'stats' while it is returned - presumably acceptable here
        thr.join(timeout=20)
    if len(errors) > 0:
        raise Exception('Retrieving OSD statistics failed for ALBA Backends:\n * {0}'.format('\n * '.join(errors)))
    return False, stats
def list(self, request):
    """
    Lists all available ALBA Backends:
    :param request: The raw request
    :type request: Request
    """
    permitted = []
    for candidate in AlbaBackendList.get_albabackends():
        has_access = Toolbox.access_granted(request.client,
                                            user_rights=candidate.backend.user_rights,
                                            client_rights=candidate.backend.client_rights)
        if has_access:
            permitted.append(candidate)
    return permitted
def get_stats_alba_backends(cls):
    """
    Retrieve statistics about all ALBA Backends and their maintenance work
    :return: tuple (errors occurred, list of measurement dicts for the stats collector)
    """
    if cls._config is None:
        cls.validate_and_retrieve_config()
    stats = []
    errors = False
    environment = cls._config['environment']
    for alba_backend in AlbaBackendList.get_albabackends():
        try:
            local_summary = alba_backend.local_summary
            sizes = local_summary['sizes']
            devices = local_summary['devices']
            # One 'backend' measurement per backend: device health counts, capacity
            # figures and the amount of outstanding maintenance work
            stats.append({'tags': {'environment': environment,
                                   'backend_name': alba_backend.name},
                          'fields': {'red': int(devices['red']),
                                     'free': float(sizes['size'] - sizes['used']),
                                     'used': float(sizes['used']),
                                     'green': int(devices['green']),
                                     'orange': int(devices['orange']),
                                     'maintenance_work': int(AlbaCLI.run(command='list-work',
                                                                         config=Configuration.get_configuration_path(alba_backend.abm_cluster.config_location))['count'])},
                          'measurement': 'backend'})
        except Exception:
            # Best effort per backend: flag the failure but keep collecting the others
            errors = True
            cls._logger.exception('Retrieving statistics for ALBA Backend {0} failed'.format(alba_backend.name))
    return errors, stats
def list(self, request):
    """
    Lists all available ALBA Backends:
    :param request: The raw request
    :type request: Request
    :return: All ALBA Backends for which the current user has permissions
    :rtype: list
    """
    return [candidate for candidate in AlbaBackendList.get_albabackends()
            if ApiToolbox.access_granted(request.client,
                                         user_rights=candidate.backend.user_rights,
                                         client_rights=candidate.backend.client_rights)]
def remove_disk(node_guid, disk):
    """
    Removes a disk
    :param node_guid: Guid of the node to remove a disk from
    :type node_guid: str
    :param disk: Disk name to remove
    :type disk: str
    :return: None
    """
    node = AlbaNode(node_guid)
    offline_node = False
    # Verify the disk exists on the node; an unreachable node marks it offline instead
    try:
        if disk not in node.client.get_disks():
            raise RuntimeError('Disk {0} not available on node {1}'.format(disk, node.guid))
    except (requests.ConnectionError, requests.Timeout):
        AlbaNodeController._logger.warning('Could not connect to node {0} to validate disks'.format(node.guid))
        offline_node = True
    # Collect all ASDs of this disk across every backend's storage stack
    node_id = node.node_id
    asds = {}
    for backend in AlbaBackendList.get_albabackends():
        storage_stack = backend.storage_stack
        if node_id in storage_stack and disk in storage_stack[node_id]:
            asds.update(storage_stack[node_id][disk]['asds'])
    # Refuse removal while ASDs are still in use (unless the node is down and they are marked nodedown)
    for asd_info in asds.values():
        if (offline_node is False and asd_info['status'] != 'available') or (offline_node is True and asd_info['status_detail'] == 'nodedown'):
            AlbaNodeController._logger.error('Disk {0} has still non-available ASDs on node {1}'.format(disk, node.ip))
            raise RuntimeError('Disk {0} has still some non-available ASDs'.format(disk))
    # Physically remove the disk through the ASD manager (only when reachable)
    if offline_node is False:
        result = node.client.remove_disk(disk)
        if result['_success'] is False:
            raise RuntimeError('Error removing disk {0}: {1}'.format(disk, result['_error']))
    # Clean up the model: delete the disk and its ASDs
    for model_disk in node.disks:
        if model_disk.name == disk:
            for asd in model_disk.asds:
                asd.delete()
            model_disk.delete()
    node.invalidate_dynamics()
    if node.storagerouter is not None:
        DiskController.sync_with_reality(node.storagerouter_guid)
def verify_namespaces():
    """
    Verify namespaces for all backends
    """
    logger.info('verify namespace task scheduling started')
    # Resolve the job factor: read it from Etcd, or persist the default on first use
    job_factor_key = '/ovs/alba/backends/job_factor'
    if EtcdConfiguration.exists(job_factor_key):
        job_factor = EtcdConfiguration.get(job_factor_key)
    else:
        job_factor = 10
        EtcdConfiguration.set(job_factor_key, job_factor)
    for albabackend in AlbaBackendList.get_albabackends():
        config = 'etcd://127.0.0.1:2379/ovs/arakoon/{0}-abm/config'.format(albabackend.backend.name)
        for namespace in AlbaCLI.run('list-namespaces', config=config, as_json=True):
            logger.info('verifying namespace: {0} scheduled ...'.format(namespace['name']))
            AlbaCLI.run('verify-namespace {0} --factor={1}'.format(namespace['name'], job_factor))
    logger.info('verify namespace task scheduling finished')
def get_albabackend_by_name(albabackend_name):
    """
    Get a Albabackend by name
    :param albabackend_name: albabackend name
    :type albabackend_name: str
    :return: alba backend object
    :rtype: ovs.dal.hybrids.albabackend
    """
    for candidate in AlbaBackendList.get_albabackends():
        if candidate.name == albabackend_name:
            return candidate
    error_msg = "No Alba backend found with name: {0}".format(albabackend_name)
    BackendHelper.LOGGER.error(error_msg)
    raise NameError(error_msg)
def verify_namespaces():
    """
    Verify namespaces for all backends
    """
    AlbaScheduledTaskController._logger.info('verify namespace task scheduling started')
    # Resolve the verification factor: read it from Etcd, or persist the default on first use
    verification_factor_key = '/ovs/alba/backends/verification_factor'
    if EtcdConfiguration.exists(verification_factor_key):
        verification_factor = EtcdConfiguration.get(verification_factor_key)
    else:
        verification_factor = 10
        EtcdConfiguration.set(verification_factor_key, verification_factor)
    for albabackend in AlbaBackendList.get_albabackends():
        if albabackend.abm_services:
            backend_name = albabackend.abm_services[0].service.name
        else:
            backend_name = albabackend.name + '-abm'
        config = 'etcd://127.0.0.1:2379/ovs/arakoon/{0}/config'.format(backend_name)
        for namespace in AlbaCLI.run('list-namespaces', config=config, as_json=True):
            AlbaScheduledTaskController._logger.info('verifying namespace: {0} scheduled ...'.format(namespace['name']))
            AlbaCLI.run('verify-namespace {0} --factor={1}'.format(namespace['name'], verification_factor))
    AlbaScheduledTaskController._logger.info('verify namespace task scheduling finished')
def _stack(self):
    """
    Returns an overview of this node's storage stack
    Combines the live stack reported by the asd-manager with the modeled OSDs and
    with claim/usage information retrieved from ALBA itself.
    """
    from ovs.dal.hybrids.albabackend import AlbaBackend
    from ovs.dal.lists.albabackendlist import AlbaBackendList

    def _move(info):
        # Rename legacy 'state'/'state_detail' keys to 'status'/'status_detail' in place
        for move in [('state', 'status'), ('state_detail', 'status_detail')]:
            if move[0] in info:
                info[move[1]] = info[move[0]]
                del info[move[0]]

    stack = {}
    node_down = False
    # Fetch stack from asd-manager
    try:
        remote_stack = self.client.get_stack()
        for slot_id, slot_data in remote_stack.iteritems():
            stack[slot_id] = {'status': 'ok'}
            stack[slot_id].update(slot_data)
            # Migrate state > status
            _move(stack[slot_id])
            for osd_data in slot_data.get('osds', {}).itervalues():
                _move(osd_data)
    except (requests.ConnectionError, requests.Timeout, InvalidCredentialsError):
        self._logger.warning('Error during stack retrieval. Assuming that the node is down')
        node_down = True

    model_osds = {}
    found_osds = {}
    # Apply own model to fetched stack
    for osd in self.osds:
        model_osds[osd.osd_id] = osd  # Initially set the info
        if osd.slot_id not in stack:
            # Slot known in the model but not reported by the node
            stack[osd.slot_id] = {'status': self.OSD_STATUSES.UNKNOWN if node_down is True else self.OSD_STATUSES.MISSING,
                                  'status_detail': self.OSD_STATUS_DETAILS.NODEDOWN if node_down is True else '',
                                  'osds': {}}
        osd_data = stack[osd.slot_id]['osds'].get(osd.osd_id, {})
        stack[osd.slot_id]['osds'][osd.osd_id] = osd_data  # Initially set the info in the stack
        osd_data.update(osd.stack_info)
        if node_down is True:
            osd_data['status'] = self.OSD_STATUSES.UNKNOWN
            osd_data['status_detail'] = self.OSD_STATUS_DETAILS.NODEDOWN
        elif osd.alba_backend_guid is not None:  # Osds has been claimed
            # Load information from alba (once per backend, cached in found_osds)
            if osd.alba_backend_guid not in found_osds:
                found_osds[osd.alba_backend_guid] = {}
                if osd.alba_backend.abm_cluster is not None:
                    config = Configuration.get_configuration_path(osd.alba_backend.abm_cluster.config_location)
                    try:
                        for found_osd in AlbaCLI.run(command='list-all-osds', config=config):
                            found_osds[osd.alba_backend_guid][found_osd['long_id']] = found_osd
                    except (AlbaError, RuntimeError):
                        self._logger.exception('Listing all osds has failed')
                        osd_data['status'] = self.OSD_STATUSES.UNKNOWN
                        osd_data['status_detail'] = self.OSD_STATUS_DETAILS.ALBAERROR
                        continue
            if osd.osd_id not in found_osds[osd.alba_backend_guid]:
                # Not claimed by any backend thus not in use
                continue
            found_osd = found_osds[osd.alba_backend_guid][osd.osd_id]
            if found_osd['decommissioned'] is True:
                osd_data['status'] = self.OSD_STATUSES.UNAVAILABLE
                osd_data['status_detail'] = self.OSD_STATUS_DETAILS.DECOMMISSIONED
                continue
            # Determine the interval within which read/write errors degrade the status
            backend_interval_key = '/ovs/alba/backends/{0}/gui_error_interval'.format(osd.alba_backend_guid)
            if Configuration.exists(backend_interval_key):
                interval = Configuration.get(backend_interval_key)
            else:
                interval = Configuration.get('/ovs/alba/backends/global_gui_error_interval')
            read = found_osd['read'] or [0]
            write = found_osd['write'] or [0]
            errors = found_osd['errors']
            osd_data['status'] = self.OSD_STATUSES.WARNING
            osd_data['status_detail'] = self.OSD_STATUS_DETAILS.ERROR
            # OK when there are no errors, or when successful I/O happened after the last error (+ interval)
            if len(errors) == 0 or (len(read + write) > 0 and max(min(read), min(write)) > max(error[0] for error in errors) + interval):
                osd_data['status'] = self.OSD_STATUSES.OK
                osd_data['status_detail'] = ''

    statistics = {}
    for slot_info in stack.itervalues():
        for osd_id, osd in slot_info['osds'].iteritems():
            if osd.get('status_detail') == self.OSD_STATUS_DETAILS.ACTIVATING:
                osd['claimed_by'] = 'unknown'  # We won't be able to connect to it just yet
                continue
            if osd_id not in model_osds:
                # The osd is known by the remote node but not in the model
                # In that case, let's connect to the OSD to see whether we get some info from it
                try:
                    ips = osd['hosts'] if 'hosts' in osd and len(osd['hosts']) > 0 else osd.get('ips', [])
                    port = osd['port']
                    claimed_by = 'unknown'
                    for ip in ips:
                        try:
                            # Output will be None if it is not claimed
                            claimed_by = AlbaCLI.run('get-osd-claimed-by', named_params={'host': ip, 'port': port})
                            break
                        except (AlbaError, RuntimeError):
                            self._logger.warning('get-osd-claimed-by failed for IP:port {0}:{1}'.format(ip, port))
                    alba_backend = AlbaBackendList.get_by_alba_id(claimed_by)
                    osd['claimed_by'] = alba_backend.guid if alba_backend is not None else claimed_by
                except KeyError:
                    osd['claimed_by'] = 'unknown'
                except:
                    self._logger.exception('Could not load OSD info: {0}'.format(osd_id))
                    osd['claimed_by'] = 'unknown'
                if osd.get('status') not in ['error', 'warning']:
                    osd['status'] = self.OSD_STATUSES.ERROR
                    osd['status_detail'] = self.OSD_STATUS_DETAILS.UNREACHABLE
            claimed_by = osd.get('claimed_by', 'unknown')
            if claimed_by == 'unknown':
                continue
            try:
                alba_backend = AlbaBackend(claimed_by)
            except ObjectNotFoundException:
                continue
            # Add usage information
            if alba_backend not in statistics:
                statistics[alba_backend] = alba_backend.osd_statistics
            osd_statistics = statistics[alba_backend]
            if osd_id not in osd_statistics:
                continue
            stats = osd_statistics[osd_id]
            osd['usage'] = {'size': int(stats['capacity']),
                            'used': int(stats['disk_usage']),
                            'available': int(stats['capacity'] - stats['disk_usage'])}
    return stack
def list(self):
    """
    Lists all available ALBA Backends
    """
    backends = AlbaBackendList.get_albabackends()
    return backends
def monitor_arakoon_clusters(cls):
    """
    Get an overview of where the Arakoon clusters for each ALBA Backend have been deployed
    The overview is printed on stdout
    :return: None
    """
    try:
        # Refresh the overview every second until the user presses ^C
        while True:
            output = ['',
                      'Open vStorage - NSM/ABM debug information',
                      '=========================================',
                      'timestamp: {0}'.format(datetime.datetime.now()),
                      '']
            alba_backends = sorted(AlbaBackendList.get_albabackends(), key=lambda k: k.name)
            for storagerouter in sorted(StorageRouterList.get_storagerouters(), key=lambda k: k.name):
                # Skip StorageRouters that run no NSM/ABM services at all
                if len([service for service in storagerouter.services
                        if service.type.name in [ServiceType.SERVICE_TYPES.NS_MGR, ServiceType.SERVICE_TYPES.ALBA_MGR] and service.storagerouter == storagerouter]) == 0:
                    continue
                output.append('+ {0} ({1})'.format(storagerouter.name, storagerouter.ip))
                for alba_backend in alba_backends:
                    is_internal = alba_backend.abm_cluster.abm_services[0].service.is_internal
                    if is_internal is False:
                        output.append(' + ABM (externally managed)')
                    else:
                        # ABM/NSM services of this backend hosted on this StorageRouter
                        abm_service = [abm_service for abm_service in alba_backend.abm_cluster.abm_services
                                       if abm_service.service.storagerouter == storagerouter]
                        nsm_clusters = [nsm_cluster for nsm_cluster in alba_backend.nsm_clusters
                                        for nsm_service in nsm_cluster.nsm_services
                                        if nsm_service.service.storagerouter == storagerouter]
                        if len(abm_service) > 0 or len(nsm_clusters) > 0:
                            output.append(' + {0}'.format(alba_backend.name))
                            if len(abm_service) > 0:
                                output.append(' + ABM - port {0}'.format(abm_service[0].service.ports))
                            for nsm_cluster in sorted(nsm_clusters, key=lambda k: k.number):
                                load = None
                                try:
                                    load = AlbaArakoonController.get_load(nsm_cluster)
                                except:
                                    pass  # Don't print load when Arakoon unreachable
                                load = 'infinite' if load == float('inf') else '{0}%'.format(round(load, 2)) if load is not None else 'unknown'
                                capacity = 'infinite' if float(nsm_cluster.capacity) < 0 else float(nsm_cluster.capacity)
                                for nsm_service in nsm_cluster.nsm_services:
                                    if nsm_service.service.storagerouter != storagerouter:
                                        continue
                                    if is_internal is True:
                                        output.append(' + NSM {0} - port {1} - capacity: {2}, load: {3}'.format(nsm_cluster.number, nsm_service.service.ports, capacity, load))
                                    else:
                                        output.append(' + NSM {0} (externally managed) - capacity: {1}, load: {2}'.format(nsm_cluster.number, capacity, load))
            output += ['', 'Press ^C to exit', '']
            # ANSI escape: clear screen and move cursor to top-left before reprinting
            print '\x1b[2J\x1b[H' + '\n'.join(output)
            time.sleep(1)
    except KeyboardInterrupt:
        pass
def check_nsm_load(cls, result_handler, max_load=None, use_total_capacity=False, total_capacity_warning=None, total_capacity_error=None):
    """
    Checks all NSM services registered within the Framework and will report their load
    :param result_handler: logging object
    :type result_handler: ovs.extensions.healthcheck.result.HCResults
    :param max_load: Maximum load percentage before marking it as overloaded. Defaults to ovs/framework/plugins/alba/config|nsm.maxload
    :type max_load: float
    :param use_total_capacity: Base NSM load of the total possible capacity (capacity of NSMs before they are marked as overloaded)
    instead of checking the least filled NSM. Use threshold arguments for tuning
    :type use_total_capacity: bool
    :param total_capacity_warning: Number of remaining namespaces threshold before throwing a warning. Defaults 20% of the total namespaces
    :type total_capacity_warning: int
    :param total_capacity_error: Number of remaining namespaces threshold before throwing an error. Defaults to 5% of the total namespaces
    :type total_capacity_error: int
    :return: None
    :rtype: NoneType
    """
    max_nsm_load_config = Configuration.get('ovs/framework/plugins/alba/config|nsm.maxload')
    max_load = max_load or max_nsm_load_config
    for alba_backend in AlbaBackendList.get_albabackends():
        # Sanity checks: a backend without ABM/NSM registration cannot be checked
        if alba_backend.abm_cluster is None:
            result_handler.failure('No ABM cluster found for ALBA Backend {0}'.format(alba_backend.name))
            continue
        if len(alba_backend.abm_cluster.abm_services) == 0:
            result_handler.failure('ALBA Backend {0} does not have any registered ABM services'.format(alba_backend.name))
            continue
        if len(alba_backend.nsm_clusters) == 0:
            result_handler.failure('ALBA Backend {0} does not have any registered NSM services'.format(alba_backend.name))
            continue
        internal = alba_backend.abm_cluster.abm_services[0].service.is_internal
        if use_total_capacity:
            # Capacity-based check: compare used namespaces against the total capacity
            # all NSMs can hold before being considered overloaded
            maximum_capacity_before_overload = AlbaHealthCheck._get_nsm_max_capacity_before_overload(alba_backend, max_nsm_load_config)
            total_capacity_warning = total_capacity_warning or math.ceil(maximum_capacity_before_overload * 1.0 / 5)
            total_capacity_error = total_capacity_error or math.ceil(maximum_capacity_before_overload * 1.0 / 20)
            config = Configuration.get_configuration_path(key=alba_backend.abm_cluster.config_location)
            hosts_data = AlbaCLI.run(command='list-nsm-hosts', config=config)
            current_capacity = sum([host['namespaces_count'] for host in hosts_data if not host['lost']])
            remaining_capacity = maximum_capacity_before_overload - current_capacity
            if remaining_capacity > total_capacity_warning and remaining_capacity > total_capacity_error:  # Only error could be specified
                result_handler.success('NSMs for backend {0} have enough capacity remaining ({1}/{2} used)'.format(alba_backend.name, current_capacity, maximum_capacity_before_overload),
                                       code=ErrorCodes.nsm_load_ok)
            elif total_capacity_warning >= remaining_capacity > total_capacity_error:
                result_handler.warning('NSMs for backend {0} have reached the warning threshold '
                                       '({1} namespaces had to be remaining, {2}/{3} used)'.format(alba_backend.name, total_capacity_warning, current_capacity, maximum_capacity_before_overload),
                                       code=ErrorCodes.nsm_load_ok)
            else:
                result_handler.failure('NSMs for backend {0} have reached the error threshold '
                                       '({1} namespaces had to be remaining, ({2}/{3} used)'.format(alba_backend.name, total_capacity_error, current_capacity, maximum_capacity_before_overload),
                                       code=ErrorCodes.nsm_load_ok)
        else:
            # Load-based check: the backend is overloaded only when even the least
            # loaded NSM exceeds the configured maximum load
            nsm_loads = {}
            sorted_nsm_clusters = sorted(alba_backend.nsm_clusters, key=lambda k: k.number)
            for nsm_cluster in sorted_nsm_clusters:
                nsm_loads[nsm_cluster.number] = AlbaController.get_load(nsm_cluster)
            overloaded = min(nsm_loads.values()) >= max_load
            if overloaded is False:
                result_handler.success('NSMs for backend {0} are not overloaded'.format(alba_backend.name),
                                       code=ErrorCodes.nsm_load_ok)
            else:
                # Internal NSMs are auto-extended by the NSM checkup; external ones need operator action
                if internal is True:
                    result_handler.warning('NSMs for backend {0} are overloaded. The NSM checkup will take care of this'.format(alba_backend.name),
                                           code=ErrorCodes.nsm_load_warn)
                else:
                    result_handler.failure('NSMs for backend {0} are overloaded. Please add your own NSM clusters to the backend'.format(alba_backend.name),
                                           code=ErrorCodes.nsm_load_failure)
def _all_disks(self):
    """
    Returns a live list of all disks known to this AlbaBackend

    Merges two sources per ALBA node: the OSDs reported by 'alba list-all-osds'
    and the node's own 'all_disks' view, then annotates every disk with a
    'status', 'status_detail' and 'alba_backend_guid' key before returning them.
    """
    from ovs.dal.lists.albanodelist import AlbaNodeList
    from ovs.dal.lists.albabackendlist import AlbaBackendList

    # Map alba_id -> AlbaBackend so OSDs claimed by another backend can be resolved
    alba_backend_map = {}
    for a_backend in AlbaBackendList.get_albabackends():
        alba_backend_map[a_backend.alba_id] = a_backend
    # Map node_id -> list of {'osd': ..., 'disk': ...} containers
    node_disk_map = {}
    alba_nodes = AlbaNodeList.get_albanodes()
    for node in alba_nodes:
        node_disk_map[node.node_id] = []

    # Load OSDs
    config = 'etcd://127.0.0.1:2379/ovs/arakoon/{0}-abm/config'.format(self.backend.name)
    for found_osd in AlbaCLI.run('list-all-osds', config=config, as_json=True):
        node_id = found_osd['node_id']
        if node_id in node_disk_map:
            node_disk_map[node_id].append({'osd': found_osd})

    # Load all_disk information
    def _load_disks(_node, _list):
        # Match each node disk to an already-loaded OSD via long_id == asd_id,
        # otherwise append it as a disk-only container.
        for _disk in _node.all_disks:
            found = False
            for container in _list:
                if 'osd' in container and container['osd']['long_id'] == _disk.get('asd_id'):
                    container['disk'] = _disk
                    found = True
                    break
            if found is False:
                _list.append({'disk': _disk})

    # One thread per node: 'all_disks' is a remote call, so fetch them concurrently
    threads = []
    for node in alba_nodes:
        thread = Thread(target=_load_disks, args=(node, node_disk_map[node.node_id]))
        thread.start()
        threads.append(thread)
    for thread in threads:
        thread.join()

    # Make mapping between node IDs and the relevant OSDs and disks
    def _process_disk(_info, _disks, _node):
        # Derive the GUI status of a single disk container and append it to _disks.
        disk = _info.get('disk')
        if disk is None:
            return
        disk_status = 'uninitialized'
        disk_status_detail = ''
        disk_alba_backend_guid = ''
        if disk['available'] is False:
            osd = _info.get('osd')
            disk_alba_state = disk['state']['state']
            if disk_alba_state == 'ok':
                if osd is None:
                    disk_status = 'initialized'
                elif osd['id'] is None:
                    # OSD exists but is not claimed by this backend
                    alba_id = osd['alba_id']
                    if alba_id is None:
                        disk_status = 'available'
                    else:
                        disk_status = 'unavailable'
                        alba_backend = alba_backend_map.get(alba_id)
                        if alba_backend is not None:
                            disk_alba_backend_guid = alba_backend.guid
                else:
                    # OSD is claimed by this backend; default to error until
                    # statistics prove the ASD is reachable
                    disk_status = 'error'
                    disk_status_detail = 'communicationerror'
                    disk_alba_backend_guid = self.guid
                    for asd in _node.asds:
                        if asd.asd_id == disk['asd_id'] and asd.statistics != {}:
                            disk_status = 'warning'
                            disk_status_detail = 'recenterrors'

                            read = osd['read'] or [0]
                            write = osd['write'] or [0]
                            errors = osd['errors']
                            global_interval_key = '/ovs/alba/backends/global_gui_error_interval'
                            backend_interval_key = '/ovs/alba/backends/{0}/gui_error_interval'.format(self.guid)
                            interval = EtcdConfiguration.get(global_interval_key)
                            if EtcdConfiguration.exists(backend_interval_key):
                                interval = EtcdConfiguration.get(backend_interval_key)
                            # Claimed and healthy when there are no errors, or the last
                            # successful read/write is older than the last error + interval
                            if len(errors) == 0 or (len(read + write) > 0 and max(min(read), min(write)) > max(error[0] for error in errors) + interval):
                                disk_status = 'claimed'
                                disk_status_detail = ''
            elif disk_alba_state == 'decommissioned':
                disk_status = 'unavailable'
                disk_status_detail = 'decommissioned'
            else:
                disk_status = 'error'
                disk_status_detail = disk['state']['detail']
                # NOTE(review): 'osd' may be None here, which would raise on .get - verify upstream guarantees
                alba_backend = alba_backend_map.get(osd.get('alba_id'))
                if alba_backend is not None:
                    disk_alba_backend_guid = alba_backend.guid
        disk['status'] = disk_status
        disk['status_detail'] = disk_status_detail
        disk['alba_backend_guid'] = disk_alba_backend_guid
        _disks.append(disk)

    def _worker(_queue, _disks):
        # Drain the queue until empty; Queue.get(False) raises Empty when drained
        while True:
            try:
                item = _queue.get(False)
                _process_disk(item['info'], _disks, item['node'])
            except Empty:
                return

    # Process all containers with a small worker pool (5 threads)
    queue = Queue()
    for node in alba_nodes:
        for info in node_disk_map[node.node_id]:
            queue.put({'info': info, 'node': node})
    disks = []
    threads = []
    for i in range(5):
        thread = Thread(target=_worker, args=(queue, disks))
        thread.start()
        threads.append(thread)
    for thread in threads:
        thread.join()
    return disks
def migrate():
    """
    Executes async migrations. It doesn't matter too much when they are executed, as long as they get eventually executed.
    This code will typically contain:
    * "dangerous" migration code (it needs certain running services)
    * Migration code depending on a cluster-wide state
    * ...

    Each migration step is guarded by a '/ovs/framework/migration|...' flag so it
    only runs once; the flag is only set when the step completed without errors.
    """
    AlbaMigrationController._logger.info('Preparing out of band migrations...')

    from ovs.dal.hybrids.diskpartition import DiskPartition
    from ovs.dal.lists.albabackendlist import AlbaBackendList
    from ovs.dal.lists.albanodelist import AlbaNodeList
    from ovs.dal.lists.albaosdlist import AlbaOSDList
    from ovs.dal.lists.storagerouterlist import StorageRouterList
    from ovs.extensions.generic.configuration import Configuration
    from ovs.extensions.generic.sshclient import SSHClient, UnableToConnectException
    from ovs.extensions.migration.migration.albamigrator import ExtensionMigrator
    from ovs.extensions.packages.albapackagefactory import PackageFactory
    from ovs.extensions.services.albaservicefactory import ServiceFactory
    from ovs.extensions.plugins.albacli import AlbaCLI, AlbaError
    from ovs.lib.alba import AlbaController
    from ovs.lib.disk import DiskController

    AlbaMigrationController._logger.info('Start out of band migrations...')

    #############################################
    # Introduction of IP:port combination on OSDs
    osd_info_map = {}
    alba_backends = AlbaBackendList.get_albabackends()
    for alba_backend in alba_backends:
        AlbaMigrationController._logger.info('Verifying ALBA Backend {0}'.format(alba_backend.name))
        if alba_backend.abm_cluster is None:
            AlbaMigrationController._logger.warning('ALBA Backend {0} does not have an ABM cluster registered'.format(alba_backend.name))
            continue

        AlbaMigrationController._logger.debug('Retrieving configuration path for ALBA Backend {0}'.format(alba_backend.name))
        try:
            config = Configuration.get_configuration_path(alba_backend.abm_cluster.config_location)
        except:  # NOTE(review): bare except - consider narrowing to Exception
            AlbaMigrationController._logger.exception('Failed to retrieve the configuration path for ALBA Backend {0}'.format(alba_backend.name))
            continue

        AlbaMigrationController._logger.info('Retrieving OSD information for ALBA Backend {0}'.format(alba_backend.name))
        try:
            osd_info = AlbaCLI.run(command='list-all-osds', config=config)
        except (AlbaError, RuntimeError):
            AlbaMigrationController._logger.exception('Failed to retrieve OSD information for ALBA Backend {0}'.format(alba_backend.name))
            continue

        # NOTE(review): loop variable shadows the list being iterated ('osd_info')
        for osd_info in osd_info:
            if osd_info.get('long_id'):
                osd_info_map[osd_info['long_id']] = {'ips': osd_info.get('ips', []),
                                                     'port': osd_info.get('port')}

    # Fill in ips/port on modelled OSDs that do not have them yet
    for osd in AlbaOSDList.get_albaosds():
        if osd.osd_id not in osd_info_map:
            AlbaMigrationController._logger.warning('OSD with ID {0} is modelled but could not be found through ALBA'.format(osd.osd_id))
            continue

        ips = osd_info_map[osd.osd_id]['ips']
        port = osd_info_map[osd.osd_id]['port']
        changes = False
        if osd.ips is None:
            changes = True
            osd.ips = ips
        if osd.port is None:
            changes = True
            osd.port = port
        if changes is True:
            AlbaMigrationController._logger.info('Updating OSD with ID {0} with IPS {1} and port {2}'.format(osd.osd_id, ips, port))
            osd.save()

    ###################################################
    # Read preference for GLOBAL ALBA Backends (1.10.3)
    # (https://github.com/openvstorage/framework-alba-plugin/issues/452)
    if Configuration.get(key='/ovs/framework/migration|read_preference', default=False) is False:
        try:
            name_backend_map = dict((alba_backend.name, alba_backend) for alba_backend in alba_backends)
            for alba_node in AlbaNodeList.get_albanodes():
                AlbaMigrationController._logger.info('Processing maintenance services running on ALBA Node {0} with ID {1}'.format(alba_node.ip, alba_node.node_id))
                alba_node.invalidate_dynamics('maintenance_services')
                for alba_backend_name, services in alba_node.maintenance_services.iteritems():
                    if alba_backend_name not in name_backend_map:
                        AlbaMigrationController._logger.error('ALBA Node {0} has services for an ALBA Backend {1} which is not modelled'.format(alba_node.ip, alba_backend_name))
                        continue

                    alba_backend = name_backend_map[alba_backend_name]
                    AlbaMigrationController._logger.info('Processing {0} ALBA Backend {1} with GUID {2}'.format(alba_backend.scaling, alba_backend.name, alba_backend.guid))
                    if alba_backend.scaling == alba_backend.SCALINGS.LOCAL:
                        read_preferences = [alba_node.node_id]
                    else:
                        read_preferences = AlbaController.get_read_preferences_for_global_backend(alba_backend=alba_backend,
                                                                                                 alba_node_id=alba_node.node_id,
                                                                                                 read_preferences=[])

                    # Copy the old per-backend maintenance config to the new
                    # per-service key, adding the read_preference entry
                    for service_name, _ in services:
                        AlbaMigrationController._logger.info('Processing service {0}'.format(service_name))
                        old_config_key = '/ovs/alba/backends/{0}/maintenance/config'.format(alba_backend.guid)
                        new_config_key = '/ovs/alba/backends/{0}/maintenance/{1}/config'.format(alba_backend.guid, service_name)
                        if Configuration.exists(key=old_config_key):
                            new_config = Configuration.get(key=old_config_key)
                            new_config['read_preference'] = read_preferences
                            Configuration.set(key=new_config_key, value=new_config)
            # Old keys are only removed once every node has been processed
            for alba_backend in alba_backends:
                Configuration.delete(key='/ovs/alba/backends/{0}/maintenance/config'.format(alba_backend.guid))
            AlbaController.checkup_maintenance_agents.delay()

            Configuration.set(key='/ovs/framework/migration|read_preference', value=True)
        except Exception:
            AlbaMigrationController._logger.exception('Updating read preferences for ALBA Backends failed')

    #######################################################
    # Storing actual package name in version files (1.11.0)
    # (https://github.com/openvstorage/framework/issues/1876)
    changed_clients = set()
    storagerouters = StorageRouterList.get_storagerouters()
    if Configuration.get(key='/ovs/framework/migration|actual_package_name_in_version_file_alba', default=False) is False:
        try:
            service_manager = ServiceFactory.get_manager()
            alba_pkg_name, alba_version_cmd = PackageFactory.get_package_and_version_cmd_for(component=PackageFactory.COMP_ALBA)
            for storagerouter in storagerouters:
                try:
                    root_client = SSHClient(endpoint=storagerouter.ip, username='******')  # Use '.ip' instead of StorageRouter object because this code is executed during post-update at which point the heartbeat has not been updated for some time
                except UnableToConnectException:
                    AlbaMigrationController._logger.exception('Updating actual package name for version files failed on StorageRouter {0}'.format(storagerouter.ip))
                    continue

                for file_name in root_client.file_list(directory=ServiceFactory.RUN_FILE_DIR):
                    if not file_name.endswith('.version'):
                        continue
                    file_path = '{0}/{1}'.format(ServiceFactory.RUN_FILE_DIR, file_name)
                    contents = root_client.file_read(filename=file_path)
                    if alba_pkg_name == PackageFactory.PKG_ALBA_EE and '{0}='.format(PackageFactory.PKG_ALBA) in contents:
                        # Rewrite the version file in the RUN_FILE_DIR
                        contents = contents.replace(PackageFactory.PKG_ALBA, PackageFactory.PKG_ALBA_EE)
                        root_client.file_write(filename=file_path, contents=contents)

                        # Regenerate the service and update the EXTRA_VERSION_CMD in the configuration management
                        service_name = file_name.split('.')[0]
                        service_config_key = ServiceFactory.SERVICE_CONFIG_KEY.format(storagerouter.machine_id, service_name)
                        if Configuration.exists(key=service_config_key):
                            service_config = Configuration.get(key=service_config_key)
                            if 'EXTRA_VERSION_CMD' in service_config:
                                service_config['EXTRA_VERSION_CMD'] = '{0}=`{1}`'.format(alba_pkg_name, alba_version_cmd)
                                Configuration.set(key=service_config_key, value=service_config)
                                service_manager.regenerate_service(name='ovs-arakoon',
                                                                   client=root_client,
                                                                   target_name='ovs-{0}'.format(service_name))  # Leave out .version
                                changed_clients.add(root_client)
            Configuration.set(key='/ovs/framework/migration|actual_package_name_in_version_file_alba', value=True)
        except Exception:
            AlbaMigrationController._logger.exception('Updating actual package name for version files failed')

    # Reload systemd on every node whose service files were regenerated
    for root_client in changed_clients:
        try:
            root_client.run(['systemctl', 'daemon-reload'])
        except Exception:
            AlbaMigrationController._logger.exception('Executing command "systemctl daemon-reload" failed')

    ####################################
    # Fix for migration version (1.11.0)
    # Previous code could potentially store a higher version number in the config management than the actual version number
    if Configuration.get(key='/ovs/framework/migration|alba_migration_version_fix', default=False) is False:
        try:
            for storagerouter in storagerouters:
                config_key = '/ovs/framework/hosts/{0}/versions'.format(storagerouter.machine_id)
                if Configuration.exists(key=config_key):
                    versions = Configuration.get(key=config_key)
                    if versions.get(PackageFactory.COMP_MIGRATION_ALBA, 0) > ExtensionMigrator.THIS_VERSION:
                        versions[PackageFactory.COMP_MIGRATION_ALBA] = ExtensionMigrator.THIS_VERSION
                        Configuration.set(key=config_key, value=versions)
            Configuration.set(key='/ovs/framework/migration|alba_migration_version_fix', value=True)
        except Exception:
            AlbaMigrationController._logger.exception('Updating migration version failed')

    ####################################
    # Enable auto-cleanup
    migration_auto_cleanup_key = '/ovs/framework/migration|alba_auto_cleanup'
    if Configuration.get(key=migration_auto_cleanup_key, default=False) is False:
        try:
            for storagerouter in StorageRouterList.get_storagerouters():
                storagerouter.invalidate_dynamics('features')  # New feature was added
            errors = []
            for alba_backend in AlbaBackendList.get_albabackends():
                try:
                    AlbaController.set_auto_cleanup(alba_backend.guid)
                except Exception as ex:
                    AlbaMigrationController._logger.exception('Failed to set the auto-cleanup for ALBA Backend {0}'.format(alba_backend.name))
                    errors.append(ex)
            # Flag is only set when every backend succeeded
            if len(errors) == 0:
                Configuration.set(key=migration_auto_cleanup_key, value=True)
        except Exception:
            AlbaMigrationController._logger.exception('Updating auto cleanup failed')

    ####################################
    # Change cache eviction
    migration_random_eviction_key = '/ovs/framework/migration|alba_cache_eviction_random'
    if Configuration.get(key=migration_random_eviction_key, default=False) is False:
        try:
            errors = []
            for alba_backend in AlbaBackendList.get_albabackends():
                try:
                    AlbaController.set_cache_eviction(alba_backend.guid)
                except Exception as ex:
                    # NOTE(review): message says 'auto-cleanup' but this sets cache eviction
                    AlbaMigrationController._logger.exception('Failed to set the auto-cleanup for ALBA Backend {0}'.format(alba_backend.name))
                    errors.append(ex)
            if len(errors) == 0:
                Configuration.set(key=migration_random_eviction_key, value=True)
        except Exception:
            AlbaMigrationController._logger.exception('Updating auto cleanup failed')

    ###################################################
    # Sync all disks and apply the backend role. Backend role was removed with the AD (since 1.10)
    albanode_backend_role_sync_key = '/ovs/framework/migration|albanode_backend_role_sync'
    if not Configuration.get(key=albanode_backend_role_sync_key, default=False):
        try:
            errors = []
            for alba_node in AlbaNodeList.get_albanodes():
                try:
                    if not alba_node.storagerouter:
                        continue
                    stack = alba_node.client.get_stack()  # type: dict
                    for slot_id, slot_information in stack.iteritems():
                        osds = slot_information.get('osds', {})  # type: dict
                        slot_aliases = slot_information.get('aliases', [])  # type: list
                        if not osds:  # No osds means no partition was made
                            continue
                        # Sync to add all potential partitions that will need a backend role
                        DiskController.sync_with_reality(storagerouter_guid=alba_node.storagerouter_guid)
                        for disk in alba_node.storagerouter.disks:
                            if set(disk.aliases).intersection(set(slot_aliases)):
                                partition = disk.partitions[0]
                                if DiskPartition.ROLES.BACKEND not in partition.roles:
                                    partition.roles.append(DiskPartition.ROLES.BACKEND)
                                    partition.save()
                except Exception as ex:
                    AlbaMigrationController._logger.exception('Syncing for storagerouter/albanode {0} failed'.format(alba_node.storagerouter.ip))
                    errors.append(ex)
            if not errors:
                Configuration.set(key=albanode_backend_role_sync_key, value=True)
        except Exception:
            AlbaMigrationController._logger.exception('Syncing up the disks for backend roles failed')

    AlbaMigrationController._logger.info('Finished out of band migrations')
def nsm_checkup(alba_backend_guid=None, min_internal_nsms=1, external_nsm_cluster_names=None):
    # type: (Optional[str], Optional[int], Optional[List[str]]) -> None
    """
    Validates the current NSM setup/configuration and takes actions where required.
    Assumptions:
    * A 2 node NSM is considered safer than a 1 node NSM.
    * When adding an NSM, the nodes with the least amount of NSM participation are preferred

    :param alba_backend_guid: Run for a specific ALBA Backend
    :type alba_backend_guid: str
    :param min_internal_nsms: Minimum amount of NSM hosts that need to be provided
    :type min_internal_nsms: int
    :param external_nsm_cluster_names: Information about the additional clusters to claim (only for externally managed Arakoon clusters)
    :type external_nsm_cluster_names: list
    :return: None
    :rtype: NoneType
    """
    ###############
    # Validations #
    ###############
    if external_nsm_cluster_names is None:
        external_nsm_cluster_names = []
    AlbaArakoonController._logger.info('NSM checkup started')
    if min_internal_nsms < 1:
        raise ValueError('Minimum amount of NSM clusters must be 1 or more')

    if not isinstance(external_nsm_cluster_names, list):
        raise ValueError("'external_nsm_cluster_names' must be of type 'list'")

    if len(external_nsm_cluster_names) > 0:
        # External clusters can only be claimed for one specific backend and
        # not in combination with a minimum amount of internal NSMs
        if alba_backend_guid is None:
            raise ValueError('Additional NSMs can only be configured for a specific ALBA Backend')
        if min_internal_nsms > 1:
            raise ValueError("'min_internal_nsms' and 'external_nsm_cluster_names' are mutually exclusive")

        external_nsm_cluster_names = list(set(external_nsm_cluster_names))  # Remove duplicate cluster names
        for cluster_name in external_nsm_cluster_names:
            try:
                ArakoonInstaller.get_arakoon_metadata_by_cluster_name(cluster_name=cluster_name)
            except NotFoundException:
                raise ValueError('Arakoon cluster with name {0} does not exist'.format(cluster_name))

    if alba_backend_guid is None:
        alba_backends = [alba_backend for alba_backend in AlbaBackendList.get_albabackends() if alba_backend.backend.status == 'RUNNING']
    else:
        alba_backends = [AlbaBackend(alba_backend_guid)]

    # Collect the StorageRouters whose reachability must be verified up front
    masters = StorageRouterList.get_masters()
    storagerouters = set()
    for alba_backend in alba_backends:
        if alba_backend.abm_cluster is None:
            raise ValueError('No ABM cluster found for ALBA Backend {0}'.format(alba_backend.name))
        if len(alba_backend.abm_cluster.abm_services) == 0:
            raise ValueError('ALBA Backend {0} does not have any registered ABM services'.format(alba_backend.name))
        if len(alba_backend.nsm_clusters) + len(external_nsm_cluster_names) > MAX_NSM_AMOUNT:
            raise ValueError('The maximum of {0} NSM Arakoon clusters will be exceeded. Amount of clusters that can be deployed for this ALBA Backend: {1}'.format(MAX_NSM_AMOUNT, MAX_NSM_AMOUNT - len(alba_backend.nsm_clusters)))
        # Validate enough externally managed Arakoon clusters are available
        if alba_backend.abm_cluster.abm_services[0].service.is_internal is False:
            unused_cluster_names = set([cluster_info['cluster_name'] for cluster_info in ArakoonInstaller.get_unused_arakoon_clusters(cluster_type=ServiceType.ARAKOON_CLUSTER_TYPES.NSM)])
            if set(external_nsm_cluster_names).difference(unused_cluster_names):
                raise ValueError('Some of the provided cluster_names have already been claimed before')
            storagerouters.update(set(masters))  # For externally managed we need an available master node
        else:
            for abm_service in alba_backend.abm_cluster.abm_services:  # For internally managed we need all StorageRouters online
                storagerouters.add(abm_service.service.storagerouter)
            for nsm_cluster in alba_backend.nsm_clusters:  # For internally managed we need all StorageRouters online
                for nsm_service in nsm_cluster.nsm_services:
                    storagerouters.add(nsm_service.service.storagerouter)

    ssh_clients = {}
    for storagerouter in storagerouters:
        try:
            ssh_clients[storagerouter] = SSHClient(endpoint=storagerouter)
        except UnableToConnectException:
            raise RuntimeError('StorageRouter {0} with IP {1} is not reachable'.format(storagerouter.name, storagerouter.ip))

    version_str = AlbaArakoonInstaller.get_alba_version_string()
    nsm_installer = NSMInstaller(version_str=version_str, ssh_clients=ssh_clients)

    ##################
    # Check Clusters #
    ##################
    safety = Configuration.get('/ovs/framework/plugins/alba/config|nsm.safety')
    maxload = Configuration.get('/ovs/framework/plugins/alba/config|nsm.maxload')

    AlbaArakoonController._logger.debug('NSM safety is configured at: {0}'.format(safety))
    AlbaArakoonController._logger.debug('NSM max load is configured at: {0}'.format(maxload))

    master_client = None
    failed_backends = []
    for alba_backend in alba_backends:
        try:
            # Gather information
            AlbaArakoonController._logger.info('ALBA Backend {0} - Ensuring NSM safety'.format(alba_backend.name))

            internal = AlbaArakoonInstaller.is_internally_managed(alba_backend)
            nsm_loads = AlbaArakoonController.get_nsm_loads(alba_backend)
            nsm_storagerouters = AlbaArakoonController.get_nsms_per_storagerouter(alba_backend)
            sorted_nsm_clusters = sorted(alba_backend.nsm_clusters, key=lambda k: k.number)

            if not internal and len(external_nsm_cluster_names) > 0:
                # Claiming external clusters requires an online master node
                for sr, cl in ssh_clients.iteritems():
                    if sr.node_type == 'MASTER':
                        master_client = cl
                        break
                if master_client is None:
                    # Internal is False and we specified the NSM clusters to claim, but no MASTER nodes online
                    raise ValueError('Could not find an online master node')

            AlbaArakoonController._logger.debug('ALBA Backend {0} - Arakoon clusters are {1} managed'.format(alba_backend.name, 'internally' if internal is True else 'externally'))
            for nsm_number, nsm_load in nsm_loads.iteritems():
                AlbaArakoonController._logger.debug('ALBA Backend {0} - NSM Cluster {1} - Load {2}'.format(alba_backend.name, nsm_number, nsm_load))
            for sr, count in nsm_storagerouters.iteritems():
                AlbaArakoonController._logger.debug('ALBA Backend {0} - StorageRouter {1} - NSM Services {2}'.format(alba_backend.name, sr.name, count))

            if internal:
                # Extend existing NSM clusters if safety not met
                for nsm_cluster in sorted_nsm_clusters:
                    AlbaArakoonController._logger.debug('ALBA Backend {0} - Processing NSM {1} - Expected safety {2} - Current safety {3}'.format(alba_backend.name, nsm_cluster.number, safety, len(nsm_cluster.nsm_services)))
                    AlbaArakoonController.ensure_nsm_cluster_safety(nsm_cluster, nsm_storagerouters, nsm_installer=nsm_installer)
            # Deploy/claim additional NSM clusters when the load demands it
            AlbaArakoonController.ensure_nsm_clusters_load(alba_backend,
                                                           nsms_per_storagerouter=nsm_storagerouters,
                                                           ssh_clients=ssh_clients,
                                                           version_str=version_str,
                                                           min_internal_nsms=min_internal_nsms,
                                                           external_nsm_cluster_names=external_nsm_cluster_names)
        except Exception:
            # One failing backend must not abort the checkup for the others
            AlbaArakoonController._logger.exception('NSM Checkup failed for Backend {0}'.format(alba_backend.name))
            failed_backends.append(alba_backend.name)
def get_disk_safety():
    """
    Send disk safety for each vpool and the amount of namespaces with the lowest disk safety

    Builds one 'disk_safety' measurement point per ALBA Backend that has an active,
    in-use policy and pushes the collected points through _send_stats.
    :return: the collected points, or None when no statistics were found
    """
    points = []
    abms = []

    # Collect the names of all ALBA manager (ABM) services
    for service in ServiceList.get_services():
        if service.type.name == ServiceType.SERVICE_TYPES.ALBA_MGR:
            abms.append(service.name)
    abms = list(set(abms))

    abl = AlbaBackendList.get_albabackends()
    for ab in abl:
        service_name = Service(ab.abm_services[0].service_guid).name
        if service_name not in abms:
            continue

        config = "etcd://127.0.0.1:2379/ovs/arakoon/{}/config".format(service_name)
        try:
            disk_safety = AlbaCLI.run('get-disk-safety', config=config, to_json=True)
        except Exception as ex:
            StatsmonkeyScheduledTaskController._logger.error('{0}: {1}'.format(service_name, ex.message))
            continue

        # Determine the active, in-use policy across all presets.
        # Bugfix: this lookup and the aggregation below previously both lived inside
        # the per-preset loop, so 'used_preset' carried over between presets: a second
        # preset re-used the already json.loads'ed policy (a list, which has no
        # .replace) and could append duplicate safety points for a single backend.
        used_preset = None
        for preset in ab.presets:
            try:
                policies = preset['policy_metadata']
                for policy in policies:
                    if policies[policy]['is_active'] and policies[policy]['in_use']:
                        used_preset = policy
            except Exception as ex:
                StatsmonkeyScheduledTaskController._logger.error(ex.message)
        if used_preset is None:
            continue

        try:
            # Policy is serialized as '(k, m, c, x)': rewrite to JSON list syntax
            used_preset = json.loads(used_preset.replace('(', '[').replace(')', ']'))
            max_disk_safety = used_preset[1]  # m: amount of parity fragments

            safety = {
                'measurement': 'disk_safety',
                'tags': {
                    'backend_name': ab.name,
                    'max_disk_safety': max_disk_safety,
                    'min_disk_safety': max_disk_safety
                },
                'fields': {
                    'amount_max_disk_safety': 0,
                    'amount_between_disk_safety': 0,
                    'amount_min_disk_safety': 0
                }
            }
            # Histogram: safety value -> amount of namespaces reporting it
            stats = {}
            for disk in disk_safety:
                if disk['safety'] is not None:
                    if disk['safety'] not in stats:
                        stats[disk['safety']] = 0
                    stats[disk['safety']] += 1
            if len(stats) == 0:
                # Bugfix: no namespace reported a safety value; min() on an empty
                # sequence would raise ValueError and log a misleading error
                continue
            min_disk_safety = min(stats.keys())
            safety['tags']['min_disk_safety'] = min_disk_safety

            for stat in stats:
                if stat == max_disk_safety:
                    safety['fields']['amount_max_disk_safety'] = stats[stat]
                elif stat == min_disk_safety:
                    safety['fields']['amount_min_disk_safety'] = stats[stat]
                else:
                    safety['fields']['amount_between_disk_safety'] += stats[stat]
            points.append(safety)
        except Exception as ex:
            StatsmonkeyScheduledTaskController._logger.error(ex.message)

    if len(points) == 0:
        StatsmonkeyScheduledTaskController._logger.info("No statistics found")
        return

    StatsmonkeyScheduledTaskController._send_stats(points)
    return points
def get_stats_vdisks(cls):
    """
    Retrieve statistics about all vDisks on the system.
    Check the safety, storage amount on the Backend, fail-over status and others

    :return: tuple (errors, stats) - errors is True when at least one retrieval
             failed; stats is a list of measurement point dicts
    """
    if cls._config is None:
        cls.validate_and_retrieve_config()

    stats = []
    errors = False
    environment = cls._config['environment']
    # Map alba_backend.guid -> {'disk_safety': {...}, 'namespace_usage': {...}}
    alba_backend_info = {}
    for alba_backend in AlbaBackendList.get_albabackends():
        config_path = Configuration.get_configuration_path(alba_backend.abm_cluster.config_location)
        disk_safety = {}
        namespace_usage = {}

        # Retrieve namespace, preset and disk safety information
        try:
            preset_info = AlbaCLI.run(command='list-presets', config=config_path)  # Not using alba_backend.presets, because it takes a whole lot longer to retrieve
            all_namespace_info = AlbaCLI.run(command='show-namespaces', config=config_path, extra_params=['--max=-1'])[1]
            all_disk_safety_info = AlbaCLI.run(command='get-disk-safety', config=config_path)
        except Exception:
            errors = True
            cls._logger.exception('Retrieving information for ALBA Backend {0} failed'.format(alba_backend.name))
            continue

        alba_backend_info[alba_backend.guid] = {'disk_safety': disk_safety,
                                                'namespace_usage': namespace_usage}

        # Parse namespace information
        for namespace_info in all_namespace_info:
            namespace_usage[namespace_info['name']] = float(namespace_info['statistics']['storage'])

        # Parse preset information
        policies = []
        preset_name = None
        for preset in preset_info:
            if preset['in_use'] is not True:
                continue
            preset_name = preset['name']
            policies.extend(preset['policies'])
        if preset_name is None:
            continue

        # Parse disk safety information
        total_objects = 0
        max_lost_disks = 0
        max_disk_safety = 0
        bucket_overview = {}
        disk_lost_overview = {}
        disk_safety_overview = {}
        for disk_safety_info in all_disk_safety_info:
            safety = disk_safety_info['safety']
            volume_id = disk_safety_info['namespace']
            disk_safety[volume_id] = float(safety) if safety is not None else safety

            for bucket_safety in disk_safety_info['bucket_safety']:
                bucket = bucket_safety['bucket']
                objects = bucket_safety['count']
                remaining_safety = bucket_safety['remaining_safety']

                if bucket[1] > max_lost_disks:
                    max_lost_disks = bucket[1]
                if remaining_safety > max_disk_safety:
                    max_disk_safety = remaining_safety

                # Match the bucket against a policy (k, m, c, x):
                # exact data/parity fragments, minimal c, maximal x
                for policy in policies:
                    k = policy[0] == bucket[0]
                    m = policy[1] == bucket[1]
                    c = policy[2] <= bucket[2]
                    x = policy[3] >= bucket[3]
                    if k and m and c and x:
                        # NOTE(review): at most one policy is expected to match a bucket;
                        # multiple matches would accumulate the same objects twice - verify
                        if preset_name not in bucket_overview:
                            bucket_overview[preset_name] = {'policy': str(policy), 'presets': {}}

                        bucket[2] -= bucket_safety['applicable_dead_osds']
                        if str(bucket) not in bucket_overview[preset_name]['presets']:
                            bucket_overview[preset_name]['presets'][str(bucket)] = {'objects': 0, 'disk_safety': 0}

                        disk_lost = bucket[0] + bucket[1] - bucket[2]  # Data fragments + parity fragments - amount of fragments to write + dead osds
                        if disk_lost not in disk_lost_overview:
                            disk_lost_overview[disk_lost] = 0
                        if remaining_safety not in disk_safety_overview:
                            disk_safety_overview[remaining_safety] = 0

                        total_objects += objects
                        disk_lost_overview[disk_lost] += objects
                        disk_safety_overview[remaining_safety] += objects
                        bucket_overview[preset_name]['presets'][str(bucket)]['objects'] += objects
                        bucket_overview[preset_name]['presets'][str(bucket)]['disk_safety'] = remaining_safety

        # Create statistics regarding disk safety
        for disk_lost_number in xrange(max_lost_disks + 1):
            stats.append({'tags': {'disk_lost': disk_lost_number,
                                   'environment': environment,
                                   'backend_name': alba_backend.name},
                          'fields': {'objects': disk_lost_overview.get(disk_lost_number, 0),
                                     'total_objects': total_objects},
                          'measurement': 'disk_lost'})
        for disk_safety_number in xrange(max_disk_safety + 1):
            stats.append({'tags': {'disk_safety': disk_safety_number,
                                   'environment': environment,
                                   'backend_name': alba_backend.name},
                          'fields': {'objects': disk_safety_overview.get(disk_safety_number, 0),
                                     'total_objects': total_objects},
                          'measurement': 'disk_safety'})
        for preset_name, result in bucket_overview.iteritems():
            for bucket_count, bucket_result in result['presets'].iteritems():
                stats.append({'tags': {'bucket': bucket_count,
                                       'policy': result['policy'],
                                       'preset_name': preset_name,
                                       'environment': environment,
                                       'disk_safety': bucket_result['disk_safety'],
                                       'backend_name': alba_backend.name},
                              'fields': {'objects': bucket_result['objects'],
                                         'total_objects': total_objects},
                              'measurement': 'bucket'})

    # Integrate namespace and disk safety information in vPool stats
    for vpool in VPoolList.get_vpools():
        alba_backend_guid = vpool.metadata['backend']['backend_info']['alba_backend_guid']
        for vdisk in vpool.vdisks:
            try:
                metrics = cls._convert_to_float_values(cls._pop_realtime_info(vdisk.statistics))
                metrics['failover_mode'] = vdisk.dtl_status
                metrics['frontend_size'] = float(vdisk.size)
                metrics['failover_mode_status'] = cls._FAILOVER_MAP.get(vdisk.dtl_status, 3)
                if alba_backend_guid in alba_backend_info:
                    metrics['disk_safety'] = alba_backend_info[alba_backend_guid]['disk_safety'].get(vdisk.volume_id)
                    metrics['backend_stored'] = alba_backend_info[alba_backend_guid]['namespace_usage'].get(vdisk.volume_id)

                stats.append({'tags': {'disk_name': vdisk.name,
                                       'volume_id': vdisk.volume_id,
                                       'vpool_name': vdisk.vpool.name,
                                       'environment': environment,
                                       'storagerouter_name': StorageRouter(vdisk.storagerouter_guid).name},
                              'fields': metrics,
                              'measurement': 'vdisk'})
            except Exception:
                errors = True
                cls._logger.exception('Retrieving statistics for vDisk {0} with guid {1} failed'.format(vdisk.name, vdisk.guid))
    return errors, stats
def _bootstrap_dal_models(self):
    """
    Load/hook DAL models as SNMP OIDs.

    Walks all model objects (StorageRouters, vMachine templates, vMachines,
    vDisks, pMachines, vPools, StorageDrivers and - when the ALBA plugin is
    importable - ALBA Backends), registers an OID per exposed attribute for
    every object not registered yet, and unregisters OIDs of objects that no
    longer exist in the model (triggering an SNMP reload in that case).
    The whole bootstrap can be disabled via the persistent key
    '<STORAGE_PREFIX>_config_dal_enabled'.
    """
    # Keys of the 'statistics' dict, exposed under sub-OID "2.<index>" in this
    # exact order for vMachines (class 0) and vPools (class 3).
    # The order is part of the exposed OID layout - do not reorder.
    statistics_keys = ['operations', 'cluster_cache_misses_ps', 'data_read',
                       'sco_cache_misses', 'sco_cache_hits_ps', 'sco_cache_hits',
                       'write_operations', 'cluster_cache_misses',
                       'read_operations_ps', 'sco_cache_misses_ps',
                       'backend_write_operations', 'backend_data_read',
                       'cache_hits', 'backend_write_operations_ps',
                       'metadata_store_hits_ps', 'metadata_store_misses',
                       'backend_data_written', 'data_read_ps', 'read_operations',
                       'cluster_cache_hits', 'data_written_ps',
                       'cluster_cache_hits_ps', 'cache_hits_ps', 'timestamp',
                       'metadata_store_misses_ps', 'backend_data_written_ps',
                       'backend_read_operations', 'data_written',
                       'metadata_store_hits', 'backend_data_read_ps',
                       'operations_ps', 'backend_read_operations_ps',
                       'data_transferred_ps', 'write_operations_ps',
                       'data_transferred']
    # vDisks (class 1) historically expose 'data_written_ps' at 2.1 and
    # 'cluster_cache_misses_ps' at 2.20 (swapped vs the other models) - keep it.
    vdisk_statistics_keys = list(statistics_keys)
    vdisk_statistics_keys[1], vdisk_statistics_keys[20] = vdisk_statistics_keys[20], vdisk_statistics_keys[1]

    def _register_statistics(model_oid, obj, keys):
        # Registers obj.statistics[<key>] under sub-OID "2.<index>" per key.
        for index, stat_key in enumerate(keys):
            self._register_dal_model(model_oid, obj, 'statistics',
                                     "2.{0}".format(index),
                                     key=stat_key, atype=int)

    _guids = set()
    enabled_key = "{0}_config_dal_enabled".format(STORAGE_PREFIX)
    self.instance_oid = 0
    try:
        enabled = self.persistent.get(enabled_key)
    except KeyNotFoundException:
        enabled = True  # Enabled by default, can be disabled by setting the key
    if enabled:
        from ovs.dal.lists.vdisklist import VDiskList
        from ovs.dal.lists.storagerouterlist import StorageRouterList
        from ovs.dal.lists.pmachinelist import PMachineList
        from ovs.dal.lists.vmachinelist import VMachineList
        from ovs.dal.lists.vpoollist import VPoolList
        from ovs.dal.lists.storagedriverlist import StorageDriverList

        # StorageRouters (model class 10)
        for storagerouter in StorageRouterList.get_storagerouters():
            _guids.add(storagerouter.guid)
            if not self._check_added(storagerouter):
                self._register_dal_model(10, storagerouter, 'guid', "0")
                self._register_dal_model(10, storagerouter, 'name', "1")
                self._register_dal_model(10, storagerouter, 'pmachine', "3", key='host_status')
                self._register_dal_model(10, storagerouter, 'description', "4")
                self._register_dal_model(10, storagerouter, 'devicename', "5")
                self._register_dal_model(10, storagerouter, 'dtl_mode', "6")
                self._register_dal_model(10, storagerouter, 'ip', "8")
                self._register_dal_model(10, storagerouter, 'machineid', "9")
                self._register_dal_model(10, storagerouter, 'status', "10")
                # Fixed: the previous nested comprehensions compared each vDisk
                # against the *leaked* (i.e. last) 'storagedriver' of the inner
                # list comprehension (Python 2 scope leak), so only vDisks on
                # the last StorageDriver were counted. Now each vDisk is
                # matched against its own StorageDriver.
                self._register_dal_model(10, storagerouter, '#vdisks', "11",
                                         func=lambda storagerouter: len([vdisk
                                                                         for storagedriver in storagerouter.storagedrivers
                                                                         for vdisk in storagedriver.vpool.vdisks
                                                                         if vdisk.storagedriver_id == storagedriver.storagedriver_id]),
                                         atype=int)
                self._register_dal_model(10, storagerouter, '#vmachines', "12",
                                         func=lambda storagerouter: len(set([vdisk.vmachine.guid
                                                                             for storagedriver in storagerouter.storagedrivers
                                                                             for vdisk in storagedriver.vpool.vdisks
                                                                             if vdisk.storagedriver_id == storagedriver.storagedriver_id])),
                                         atype=int)
                self._register_dal_model(10, storagerouter, '#stored_data', "13",
                                         func=lambda storagerouter: sum([vdisk.vmachine.stored_data
                                                                         for storagedriver in storagerouter.storagedrivers
                                                                         for vdisk in storagedriver.vpool.vdisks
                                                                         if vdisk.storagedriver_id == storagedriver.storagedriver_id]),
                                         atype=int)
                self.instance_oid += 1

        # vMachine templates (model class 11)
        for vm in VMachineList.get_vmachines():
            _guids.add(vm.guid)
            if not self._check_added(vm):
                if vm.is_vtemplate:
                    self._register_dal_model(11, vm, 'guid', "0")
                    self._register_dal_model(11, vm, 'name', "1")

                    def _children(vmt):
                        # Counts non-template vDisks cloned from this template's vDisks
                        children = 0
                        disks = [vd.guid for vd in vmt.vdisks]
                        for vdisk in [vdisk.parent_vdisk_guid
                                      for item in [vm.vdisks for vm in VMachineList.get_vmachines() if not vm.is_vtemplate]
                                      for vdisk in item]:
                            for disk in disks:
                                if vdisk == disk:
                                    children += 1
                        return children
                    self._register_dal_model(11, vm, '#children', 2, func=_children, atype=int)
                    self.instance_oid += 1

        # Non-template vMachines (model class 0)
        for vm in VMachineList.get_vmachines():
            _guids.add(vm.guid)
            if not self._check_added(vm):
                if not vm.is_vtemplate:
                    self._register_dal_model(0, vm, 'guid', "0")
                    self._register_dal_model(0, vm, 'name', "1")
                    _register_statistics(0, vm, statistics_keys)
                    self._register_dal_model(0, vm, 'stored_data', "3", atype=int)
                    self._register_dal_model(0, vm, 'description', "4")
                    self._register_dal_model(0, vm, 'devicename', "5")
                    self._register_dal_model(0, vm, 'dtl_mode', "6")
                    self._register_dal_model(0, vm, 'hypervisorid', "7")
                    self._register_dal_model(0, vm, 'ip', "8")
                    # NOTE(review): 'status' and 'stored_data' are both bound to
                    # sub-OID "10"; kept as-is to preserve the exposed layout,
                    # but 'status' was probably meant for "9" - TODO confirm.
                    self._register_dal_model(0, vm, 'status', "10")
                    self._register_dal_model(0, vm, 'stored_data', "10", atype=int)
                    self._register_dal_model(0, vm, 'snapshots', "11", atype=int)
                    self._register_dal_model(0, vm, 'vdisks', "12", atype=int)
                    # NOTE: all([]) is True, so a vMachine without vDisks reports 'DEGRADED'
                    self._register_dal_model(0, vm, 'DTL', '13',
                                             func=lambda vm: 'DEGRADED' if all(item == 'DEGRADED' for item in [vd.info['failover_mode'] for vd in vm.vdisks]) else 'OK')
                    self.instance_oid += 1

        # vDisks (model class 1)
        for vd in VDiskList.get_vdisks():
            _guids.add(vd.guid)
            if not self._check_added(vd):
                self._register_dal_model(1, vd, 'guid', "0")
                self._register_dal_model(1, vd, 'name', "1")
                _register_statistics(1, vd, vdisk_statistics_keys)
                self._register_dal_model(1, vd, 'info', "3", key='stored', atype=int)
                self._register_dal_model(1, vd, 'info', "4", key='failover_mode', atype=int)
                self._register_dal_model(1, vd, 'snapshots', "5", atype=int)
                self.instance_oid += 1

        # pMachines (model class 2)
        for pm in PMachineList.get_pmachines():
            _guids.add(pm.guid)
            if not self._check_added(pm):
                self._register_dal_model(2, pm, 'guid', "0")
                self._register_dal_model(2, pm, 'name', "1")
                self._register_dal_model(2, pm, 'host_status', "2")
                self.instance_oid += 1

        # vPools (model class 3)
        for vp in VPoolList.get_vpools():
            _guids.add(vp.guid)
            if not self._check_added(vp):
                self._register_dal_model(3, vp, 'guid', "0")
                self._register_dal_model(3, vp, 'name', "1")
                _register_statistics(3, vp, statistics_keys)
                self._register_dal_model(3, vp, 'status', "3")
                self._register_dal_model(3, vp, 'description', "4")
                self._register_dal_model(3, vp, 'vdisks', "5", atype=int)
                self._register_dal_model(3, vp, '#vmachines', "6",
                                         func=lambda vp: len(set([vd.vmachine.guid for vd in vp.vdisks])),
                                         atype=int)
                self.instance_oid += 1

        # StorageDrivers (model class 4)
        for storagedriver in StorageDriverList.get_storagedrivers():
            _guids.add(storagedriver.guid)
            if not self._check_added(storagedriver):
                self._register_dal_model(4, storagedriver, 'guid', "0")
                self._register_dal_model(4, storagedriver, 'name', "1")
                self._register_dal_model(4, storagedriver, 'stored_data', "2", atype=int)
                self.instance_oid += 1

        try:
            # ALBA Backends are an optional plugin (model class 5)
            from ovs.dal.lists.albabackendlist import AlbaBackendList
            for backend in AlbaBackendList.get_albabackends():
                _guids.add(backend.guid)
                if not self._check_added(backend):
                    self._register_dal_model(5, backend, 'guid', 0)
                    self._register_dal_model(5, backend, 'name', 1)
                    for disk_id in range(len(backend.all_disks)):
                        self._register_dal_model(5, backend, 'all_disks', '2.{0}.0'.format(disk_id), key="name", index=disk_id)
                        self._register_dal_model(5, backend, 'all_disks', '2.{0}.1'.format(disk_id), key="usage.size", atype=long, index=disk_id)
                        self._register_dal_model(5, backend, 'all_disks', '2.{0}.2'.format(disk_id), key="usage.used", atype=long, index=disk_id)
                        self._register_dal_model(5, backend, 'all_disks', '2.{0}.3'.format(disk_id), key="usage.available", atype=long, index=disk_id)
                        self._register_dal_model(5, backend, 'all_disks', '2.{0}.4'.format(disk_id), key="state.state", index=disk_id)
                        self._register_dal_model(5, backend, 'all_disks', '2.{0}.5'.format(disk_id), key="node_id", index=disk_id)
                    self.instance_oid += 1
        except ImportError:
            print('OVS Backend not present')

    # Unregister OIDs of objects that disappeared from the model
    # (renamed from 'reload', which shadowed the Python 2 builtin)
    needs_reload = False
    for object_guid in list(self.model_oids):
        if object_guid not in _guids:
            self.model_oids.remove(object_guid)
            needs_reload = True
    if needs_reload:
        self._reload_snmp()
def _local_stack(self):
    """
    Returns a live list of all disks known to this AlbaBackend

    Merges three sources into one nested dict keyed on node_id -> disk_id:
      1. the DAL model (nodes, disks, OSDs),
      2. live per-node stack info fetched in parallel threads,
      3. ALBA's own 'list-all-osds' view (claim/ownership/error status).
    """
    from ovs.dal.lists.albanodelist import AlbaNodeList
    from ovs.dal.lists.albabackendlist import AlbaBackendList

    if len(self.abm_services) == 0:
        return {}  # No ABM services yet, so backend not fully installed yet

    # Map alba_id -> AlbaBackend to resolve which backend owns an OSD later on
    alba_backend_map = {}
    for alba_backend in AlbaBackendList.get_albabackends():
        alba_backend_map[alba_backend.alba_id] = alba_backend

    # Load information based on the model; everything starts out as
    # 'error'/'unknown' and is upgraded by the live info below
    asd_map = {}
    storage_map = {}
    alba_nodes = AlbaNodeList.get_albanodes()
    for node in alba_nodes:
        node_id = node.node_id
        storage_map[node_id] = {}
        for disk in node.disks:
            # Disk key is the last path component of the first alias
            disk_id = disk.aliases[0].split('/')[-1]
            storage_map[node_id][disk_id] = {'asds': {},
                                             'name': disk_id,
                                             'guid': disk.guid,
                                             'status': 'error',
                                             'aliases': disk.aliases,
                                             'status_detail': 'unknown'}
            for osd in disk.osds:
                osd_id = osd.osd_id
                data = {'asd_id': osd_id,
                        'guid': osd.guid,
                        'status': 'error',
                        'status_detail': 'unknown',
                        'alba_backend_guid': osd.alba_backend_guid}
                # Same dict object is referenced from both maps, so updates
                # through either are visible in both
                asd_map[osd_id] = data
                storage_map[node_id][disk_id]['asds'][osd_id] = data

    # Load information from node (mutates _node_data in place; each thread
    # works on a distinct node's sub-dict, so no locking is needed)
    def _load_live_info(_node, _node_data):
        _data = _node.storage_stack
        if _data['status'] != 'ok':
            # Node-level failure: propagate the status to every disk and ASD
            for disk_entry in _node_data.values():
                disk_entry['status_detail'] = _data['status']
                for entry in disk_entry.get('asds', {}).values():
                    entry['status_detail'] = _data['status']
        else:
            for _disk_id, disk_asd_info in _data['stack'].iteritems():
                if _disk_id not in _node_data:
                    _node_data[_disk_id] = {'asds': {}}
                entry = _node_data[_disk_id]
                # Merge disk info without clobbering the 'asds' sub-dict
                disk_info = copy.deepcopy(disk_asd_info)
                del disk_info['asds']
                entry.update(disk_info)
                asds_info = disk_asd_info['asds']
                for _asd_id, asd_info in asds_info.iteritems():
                    if _asd_id not in _node_data[_disk_id]['asds']:
                        _node_data[_disk_id]['asds'][_asd_id] = asd_info
                    else:
                        _node_data[_disk_id]['asds'][_asd_id].update(asd_info)

    # Fetch live info from all nodes in parallel
    threads = []
    for node in alba_nodes:
        thread = Thread(target=_load_live_info, args=(node, storage_map[node.node_id]))
        thread.start()
        threads.append(thread)
    for thread in threads:
        thread.join()

    # Mix in usage information
    for asd_id, stats in self.asd_statistics.iteritems():
        if asd_id in asd_map:
            asd_map[asd_id]['usage'] = {'size': int(stats['capacity']),
                                        'used': int(stats['disk_usage']),
                                        'available': int(stats['capacity'] - stats['disk_usage'])}

    # Load information from alba
    # Per-backend error interval override, falling back to the global one
    backend_interval_key = '/ovs/alba/backends/{0}/gui_error_interval'.format(self.guid)
    if Configuration.exists(backend_interval_key):
        interval = Configuration.get(backend_interval_key)
    else:
        interval = Configuration.get('/ovs/alba/backends/global_gui_error_interval')
    config = Configuration.get_configuration_path('/ovs/arakoon/{0}-abm/config'.format(self.name))
    asds = {}
    for found_osd in AlbaCLI.run(command='list-all-osds', config=config):
        asds[found_osd['long_id']] = found_osd
    for node_data in storage_map.values():
        for _disk in node_data.values():
            for asd_id, asd_data in _disk['asds'].iteritems():
                if asd_id not in asds:
                    continue
                found_osd = asds[asd_id]
                # Entries without live 'state' were never reached by the node
                # threads - leave them in their model-derived error state
                if 'state' not in asd_data:
                    continue
                if found_osd.get('decommissioned') is True:
                    asd_data['status'] = 'unavailable'
                    asd_data['status_detail'] = 'decommissioned'
                    continue
                state = asd_data['state']
                if state == 'ok':
                    if found_osd['id'] is None:
                        # Not claimed by any backend yet
                        alba_id = found_osd['alba_id']
                        if alba_id is None:
                            asd_data['status'] = 'available'
                        else:
                            # Claimed by another ALBA instance
                            asd_data['status'] = 'unavailable'
                            alba_backend = alba_backend_map.get(alba_id)
                            if alba_backend is not None:
                                asd_data['alba_backend_guid'] = alba_backend.guid
                    else:
                        # Claimed by this backend: 'claimed' unless ALBA reports
                        # errors more recent than the last reads/writes + interval
                        asd_data['alba_backend_guid'] = self.guid
                        asd_data['status'] = 'warning'
                        asd_data['status_detail'] = 'recenterrors'
                        read = found_osd['read'] or [0]
                        write = found_osd['write'] or [0]
                        errors = found_osd['errors']
                        if len(errors) == 0 or (len(read + write) > 0 and max(min(read), min(write)) > max(error[0] for error in errors) + interval):
                            asd_data['status'] = 'claimed'
                            asd_data['status_detail'] = ''
                else:
                    asd_data['status'] = 'error'
                    asd_data['status_detail'] = asd_data.get('state_detail', '')
                    alba_backend = alba_backend_map.get(found_osd.get('alba_id'))
                    if alba_backend is not None:
                        asd_data['alba_backend_guid'] = alba_backend.guid
    return storage_map
def _alba_arakoon_checkup(cls, alba_backend_guid=None, abm_cluster=None, nsm_clusters=None):
    # type: (Optional[str], Optional[str], Optional[List[str]]) -> None
    """
    Creates a new Arakoon cluster if required and extends cluster if possible on all available master nodes
    :param alba_backend_guid: Guid of the ALBA Backend
    :type alba_backend_guid: str
    :param abm_cluster: ABM cluster for this ALBA Backend
    The code will claim the Arakoon cluster for this backend when provided
    :type abm_cluster: str|None
    :param nsm_clusters: NSM clusters for this ALBA Backend
    The code will claim the Arakoon clusters for this backend when provided
    :type nsm_clusters: list[str]
    :raises RuntimeError: when a cluster must be created but no StorageRouter has a DB partition available
    :return: None
    :rtype: NoneType
    """
    slaves = StorageRouterList.get_slaves()
    masters = StorageRouterList.get_masters()
    clients = {}
    for storagerouter in masters + slaves:
        try:
            clients[storagerouter] = SSHClient(storagerouter)
        except UnableToConnectException:
            # Unreachable nodes are skipped; deployment continues on the rest
            cls._logger.warning('Storage Router with IP {0} is not reachable'.format(storagerouter.ip))
    available_storagerouters = cls.get_available_arakoon_storagerouters(clients)

    # Call here, because this potentially raises error, which should happen before actually making changes
    abm_installer = ABMInstaller(ssh_clients=clients)
    nsm_installer = NSMInstaller(version_str=abm_installer.version_str, ssh_clients=clients)

    # Cluster creation
    if alba_backend_guid is not None:
        alba_backend = AlbaBackend(alba_backend_guid)
        # @todo revisit. This might enforce the ABM name for externals (might be unintended)
        abm_cluster_name = '{0}-abm'.format(alba_backend.name)

        # ABM Arakoon cluster creation
        if alba_backend.abm_cluster is None:
            # Fixed: fail with a clear error instead of a bare IndexError when
            # no StorageRouter has a DB partition available
            if not available_storagerouters:
                raise RuntimeError('Cannot deploy an ABM cluster: no StorageRouters with an available DB partition found')
            # Fallback to installing the cluster on an available storagerouter
            storagerouter, partition = available_storagerouters.items()[0]
            abm_installer.deploy_abm_cluster(alba_backend, abm_cluster_name,
                                             requested_abm_cluster_name=abm_cluster,
                                             storagerouter=storagerouter)

        # NSM Arakoon cluster creation
        if len(alba_backend.nsm_clusters) == 0 and nsm_clusters is not None:
            if not available_storagerouters:
                raise RuntimeError('Cannot deploy an NSM cluster: no StorageRouters with an available DB partition found')
            storagerouter, partition = available_storagerouters.items()[0]
            nsm_installer.deploy_nsm_cluster(alba_backend,
                                             storagerouter=storagerouter,
                                             nsm_clusters=nsm_clusters)

    # ABM Cluster extension
    for alba_backend in AlbaBackendList.get_albabackends():
        if alba_backend.abm_cluster is None:
            AlbaArakoonController._logger.warning('ALBA Backend {0} does not have an ABM cluster registered'.format(alba_backend.name))
            continue
        cls.ensure_abm_cluster_safety(alba_backend.abm_cluster,
                                      available_storagerouters,
                                      abm_installer=abm_installer)
def checkup_maintenance_agents():
    """
    Check if requested nr of maintenance agents / backend is actually present
    Add / remove as necessary
    :return: None
    """
    service_template_key = 'alba-maintenance_{0}-{1}'
    maintenance_agents_map = {}
    asd_nodes = AlbaNodeList.get_albanodes()
    nr_of_storage_nodes = len(asd_nodes)

    def _get_node_load(backend_name):
        # Returns, for this backend, the node running the most agents, the node
        # running the fewest, and the total number of agents across all nodes.
        highest_load = 0
        lowest_load = sys.maxint
        agent_load = {'high_load_node': asd_nodes[0] if asd_nodes else None,
                      'low_load_node': asd_nodes[0] if asd_nodes else None,
                      'total_load': 0}
        for asd_node in asd_nodes:
            actual_nr_of_agents = 0
            maint_services = asd_node.client.list_maintenance_services()
            for service_name in maint_services:
                # Empty hash part matches any agent of this backend
                if service_template_key.format(backend_name, '') in service_name:
                    actual_nr_of_agents += 1
            if actual_nr_of_agents > highest_load:
                agent_load['high_load_node'] = asd_node
                highest_load = actual_nr_of_agents
            if actual_nr_of_agents < lowest_load:
                agent_load['low_load_node'] = asd_node
                lowest_load = actual_nr_of_agents
            agent_load['total_load'] += actual_nr_of_agents
        return agent_load

    # Gather required vs actual agent counts per backend
    alba_backends = AlbaBackendList.get_albabackends()
    for alba_backend in alba_backends:
        nr_of_agents_key = AlbaNodeController.NR_OF_AGENTS_ETCD_TEMPLATE.format(alba_backend.guid)
        name = alba_backend.backend.name
        if not EtcdConfiguration.exists(nr_of_agents_key):
            # Default: one agent per storage node
            EtcdConfiguration.set(nr_of_agents_key, nr_of_storage_nodes)
        required_nr = EtcdConfiguration.get(nr_of_agents_key)
        maintenance_agents_map[name] = {'required': required_nr,
                                        'actual': _get_node_load(name)['total_load'],
                                        'backend': alba_backend.backend}

    for name, values in maintenance_agents_map.iteritems():
        AlbaNodeController._logger.info('Checking backend: {0}'.format(name))
        to_process = values['required'] - values['actual']

        if to_process == 0:
            AlbaNodeController._logger.info('No action required for: {0}'.format(name))
        elif to_process > 0:
            # Spread new agents over the least loaded nodes
            AlbaNodeController._logger.info('Adding {0} maintenance agent(s) for {1}'.format(to_process, name))
            for _ in xrange(to_process):
                unique_hash = ''.join(random.choice(string.ascii_letters + string.digits) for _ in range(16))
                node = _get_node_load(name)['low_load_node']
                AlbaNodeController._logger.info('Service to add: ' + service_template_key.format(name, unique_hash))
                if node and node.client:
                    node.client.add_maintenance_service(service_template_key.format(name, unique_hash),
                                                        values['backend'].alba_backend.guid,
                                                        AlbaController.get_abm_service_name(values['backend']))
                    AlbaNodeController._logger.info('Service added')
        else:
            # Remove agents from the most loaded nodes
            to_process = abs(to_process)
            AlbaNodeController._logger.info('Removing {0} maintenance agent(s) for {1}'.format(to_process, name))
            for _ in xrange(to_process):
                node = _get_node_load(name)['high_load_node']
                # Fixed: validate the node BEFORE calling its client - the old
                # code called node.client.list_maintenance_services() first and
                # only then checked 'node and node.client', so a None node
                # crashed with AttributeError.
                if node is None or not node.client:
                    continue
                services = node.client.list_maintenance_services()
                if not services:
                    continue
                for service in services:
                    if 'alba-maintenance_' + name in service:
                        node.client.remove_maintenance_service(service)
                        break
def _storage_stack(self):
    """
    Returns a live list of all disks known to this AlbaBackend

    Merges three sources into one nested dict keyed on node_id -> disk name:
      1. the DAL model (nodes, disks, ASDs),
      2. live disk/ASD info fetched from each node in parallel threads,
      3. ALBA's 'list-all-osds' view (claim/ownership/error status).
    """
    from ovs.dal.lists.albanodelist import AlbaNodeList
    from ovs.dal.lists.albabackendlist import AlbaBackendList

    if len(self.abm_services) == 0:
        return {}  # No ABM services yet, so backend not fully installed yet

    storage_map = {}
    asd_map = {}
    # Map alba_id -> AlbaBackend to resolve which backend owns an OSD later on
    alba_backend_map = {}
    for alba_backend in AlbaBackendList.get_albabackends():
        alba_backend_map[alba_backend.alba_id] = alba_backend

    # Load information based on the model; everything starts out as
    # 'error'/'unknown' and is upgraded by the live info below
    alba_nodes = AlbaNodeList.get_albanodes()
    for node in alba_nodes:
        node_id = node.node_id
        storage_map[node_id] = {}
        for disk in node.disks:
            disk_id = disk.name
            storage_map[node_id][disk_id] = {'name': disk_id,
                                             'guid': disk.guid,
                                             'status': 'error',
                                             'status_detail': 'unknown',
                                             'asds': {}}
            for asd in disk.asds:
                asd_id = asd.asd_id
                data = {'asd_id': asd_id,
                        'guid': asd.guid,
                        'status': 'error',
                        'status_detail': 'unknown',
                        'alba_backend_guid': asd.alba_backend_guid}
                # Same dict object referenced from both maps
                asd_map[asd_id] = data
                storage_map[node_id][disk_id]['asds'][asd_id] = data

    # Load information from node (mutates _node_data in place; each thread
    # works on a distinct node's sub-dict, so no locking is needed)
    def _load_live_info(_node, _node_data):
        # Live disk information
        try:
            disk_data = _node.client.get_disks()
        except (requests.ConnectionError, requests.Timeout):
            # Node unreachable: mark all model disks and fall through with no live data
            for entry in _node_data.values():
                entry['status_detail'] = 'nodedown'
            disk_data = {}
        for _disk_id, disk_info in disk_data.iteritems():
            if _disk_id in _node_data:
                entry = _node_data[_disk_id]
            else:
                # Disk known to the node but not to the model yet
                entry = {'name': _disk_id,
                         'status': 'unknown',
                         'status_detail': '',
                         'asds': {}}
                _node_data[_disk_id] = entry
            entry.update(disk_info)
            if disk_info['state'] == 'ok':
                entry['status'] = 'uninitialized' if disk_info['available'] is True else 'initialized'
                entry['status_detail'] = ''
            else:
                entry['status'] = disk_info['state']
                entry['status_detail'] = disk_info.get('state_detail', '')
        # Live ASD information
        try:
            _asd_data = _node.client.get_asds()
        except (requests.ConnectionError, requests.Timeout):
            for disk_entry in _node_data.values():
                for entry in disk_entry['asds'].values():
                    entry['status_detail'] = 'nodedown'
            _asd_data = {}
        for _disk_id, asds in _asd_data.iteritems():
            if _disk_id not in _node_data:
                continue
            for _asd_id, asd_info in asds.iteritems():
                entry = {'asd_id': _asd_id,
                         'status': 'error' if asd_info['state'] == 'error' else 'initialized',
                         'status_detail': asd_info.get('state_detail', ''),
                         'state': asd_info['state'],
                         'state_detail': asd_info.get('state_detail', '')}
                if _asd_id not in _node_data[_disk_id]['asds']:
                    _node_data[_disk_id]['asds'][_asd_id] = entry
                    asd_map[_asd_id] = entry
                else:
                    _node_data[_disk_id]['asds'][_asd_id].update(entry)

    # Fetch live info from all nodes in parallel
    threads = []
    for node in alba_nodes:
        thread = Thread(target=_load_live_info, args=(node, storage_map[node.node_id]))
        thread.start()
        threads.append(thread)
    for thread in threads:
        thread.join()

    # Mix in usage information
    for asd_id, stats in self.asd_statistics.iteritems():
        if asd_id in asd_map:
            asd_map[asd_id]['usage'] = {'size': int(stats['capacity']),
                                        'used': int(stats['disk_usage']),
                                        'available': int(stats['capacity'] - stats['disk_usage'])}

    # Load information from alba
    # Per-backend error interval override, falling back to the global one
    backend_interval_key = '/ovs/alba/backends/{0}/gui_error_interval'.format(self.guid)
    if EtcdConfiguration.exists(backend_interval_key):
        interval = EtcdConfiguration.get(backend_interval_key)
    else:
        interval = EtcdConfiguration.get('/ovs/alba/backends/global_gui_error_interval')
    config = 'etcd://127.0.0.1:2379/ovs/arakoon/{0}/config'.format(self.abm_services[0].service.name)
    for found_osd in AlbaCLI.run('list-all-osds', config=config, as_json=True):
        node_id = found_osd['node_id']
        asd_id = found_osd['long_id']
        for _disk in storage_map.get(node_id, {}).values():
            asd_data = _disk['asds'].get(asd_id, {})
            # Entries without live 'state' were never reached by the node
            # threads - leave them in their model-derived error state
            if 'state' not in asd_data:
                continue
            if found_osd.get('decommissioned') is True:
                asd_data['status'] = 'unavailable'
                asd_data['status_detail'] = 'decommissioned'
                continue
            state = asd_data['state']
            if state == 'ok':
                if found_osd['id'] is None:
                    # Not claimed by any backend yet
                    alba_id = found_osd['alba_id']
                    if alba_id is None:
                        asd_data['status'] = 'available'
                    else:
                        # Claimed by another ALBA instance
                        asd_data['status'] = 'unavailable'
                        alba_backend = alba_backend_map.get(alba_id)
                        if alba_backend is not None:
                            asd_data['alba_backend_guid'] = alba_backend.guid
                else:
                    # Claimed by this backend: 'claimed' unless ALBA reports
                    # errors more recent than the last reads/writes + interval
                    asd_data['alba_backend_guid'] = self.guid
                    asd_data['status'] = 'warning'
                    asd_data['status_detail'] = 'recenterrors'
                    read = found_osd['read'] or [0]
                    write = found_osd['write'] or [0]
                    errors = found_osd['errors']
                    if len(errors) == 0 or (len(read + write) > 0 and max(min(read), min(write)) > max(error[0] for error in errors) + interval):
                        asd_data['status'] = 'claimed'
                        asd_data['status_detail'] = ''
            else:
                asd_data['status'] = 'error'
                asd_data['status_detail'] = asd_data.get('state_detail', '')
                alba_backend = alba_backend_map.get(found_osd.get('alba_id'))
                if alba_backend is not None:
                    asd_data['alba_backend_guid'] = alba_backend.guid
    return storage_map
def migrate(previous_version):
    """
    Migrates from a given version to the current version. It uses 'previous_version' to be smart
    wherever possible, but the code should be able to migrate any version towards the expected version.
    When this is not possible, the code can set a minimum version and raise when it is not met.
    :param previous_version: The previous version from which to start the migration
    :type previous_version: float
    :return: The version reached after migration (DALMigrator.THIS_VERSION)
    :rtype: float
    """
    working_version = previous_version
    if working_version == 0:
        from ovs.dal.hybrids.servicetype import ServiceType
        # Initial version:
        # * Add any basic configuration or model entries
        # Add backends
        for backend_type_info in [('ALBA', 'alba')]:
            code = backend_type_info[1]
            backend_type = BackendTypeList.get_backend_type_by_code(code)
            if backend_type is None:
                backend_type = BackendType()
            # Name/code are (re)applied even to an existing entry so a rename in the
            # backend_type_info table propagates to the model
            backend_type.name = backend_type_info[0]
            backend_type.code = code
            backend_type.save()
        # Add service types
        for service_type_info in [ServiceType.SERVICE_TYPES.NS_MGR,
                                  ServiceType.SERVICE_TYPES.ALBA_MGR,
                                  ServiceType.SERVICE_TYPES.ALBA_S3_TRANSACTION]:
            service_type = ServiceType()
            service_type.name = service_type_info
            service_type.save()
    # From here on, all actual migration should happen to get to the expected state for THIS RELEASE
    elif working_version < DALMigrator.THIS_VERSION:
        import hashlib
        from ovs.dal.exceptions import ObjectNotFoundException
        from ovs.dal.helpers import HybridRunner, Descriptor
        from ovs.dal.hybrids.albaabmcluster import ABMCluster
        from ovs.dal.hybrids.albaosd import AlbaOSD
        from ovs.dal.hybrids.albansmcluster import NSMCluster
        from ovs.dal.hybrids.j_abmservice import ABMService
        from ovs.dal.hybrids.j_nsmservice import NSMService
        from ovs.dal.hybrids.service import Service
        from ovs.dal.hybrids.servicetype import ServiceType
        from ovs.dal.lists.albabackendlist import AlbaBackendList
        from ovs.dal.lists.albanodelist import AlbaNodeList
        from ovs.dal.lists.servicetypelist import ServiceTypeList
        from ovs.dal.lists.storagerouterlist import StorageRouterList
        from ovs.extensions.db.arakooninstaller import ArakoonClusterConfig, ArakoonInstaller
        from ovs.extensions.generic.configuration import Configuration, NotFoundException
        from ovs_extensions.generic.toolbox import ExtensionsToolbox
        from ovs.extensions.plugins.albacli import AlbaCLI
        from ovs.extensions.storage.persistentfactory import PersistentFactory

        # Migrate unique constraints & indexes:
        # for every hybrid class, build the 'ovs_unique_*' and 'ovs_index_*' bookkeeping keys
        # for properties marked unique/indexed that do not have them yet.
        client = PersistentFactory.get_client()
        hybrid_structure = HybridRunner.get_hybrids()
        for class_descriptor in hybrid_structure.values():
            cls = Descriptor().load(class_descriptor).get_object()
            classname = cls.__name__.lower()
            # '{{0}}' survives the first .format() as a placeholder for the property name
            unique_key = 'ovs_unique_{0}_{{0}}_'.format(classname)
            index_prefix = 'ovs_index_{0}|{{0}}|'.format(classname)
            index_key = 'ovs_index_{0}|{{0}}|{{1}}'.format(classname)
            uniques = []
            indexes = []
            # noinspection PyProtectedMember
            for prop in cls._properties:
                # Only migrate properties whose constraint/index keys are absent (len == 0)
                if prop.unique is True and len([k for k in client.prefix(unique_key.format(prop.name))]) == 0:
                    uniques.append(prop.name)
                if prop.indexed is True and len([k for k in client.prefix(index_prefix.format(prop.name))]) == 0:
                    indexes.append(prop.name)
            if len(uniques) > 0 or len(indexes) > 0:
                prefix = 'ovs_data_{0}_'.format(classname)
                for key, data in client.prefix_entries(prefix):
                    for property_name in uniques:
                        # Unique constraint: sha1 of the value -> object key
                        ukey = '{0}{1}'.format(unique_key.format(property_name),
                                               hashlib.sha1(str(data[property_name])).hexdigest())
                        client.set(ukey, key)
                    for property_name in indexes:
                        if property_name not in data:
                            continue  # This is the case when there's a new indexed property added.
                        # Index entry: list of object keys sharing the same value hash,
                        # updated under an assert-guarded transaction to stay consistent
                        ikey = index_key.format(property_name,
                                                hashlib.sha1(str(data[property_name])).hexdigest())
                        index = list(client.get_multi([ikey], must_exist=False))[0]
                        transaction = client.begin_transaction()
                        if index is None:
                            client.assert_value(ikey, None, transaction=transaction)
                            client.set(ikey, [key], transaction=transaction)
                        elif key not in index:
                            # index[:] copies the list so the assert compares the pre-update state
                            client.assert_value(ikey, index[:], transaction=transaction)
                            client.set(ikey, index + [key], transaction=transaction)
                        client.apply_transaction(transaction)

        #############################################
        # Introduction of ABMCluster and NSMCluster #
        #############################################
        # Verify presence of unchanged ALBA Backends
        alba_backends = AlbaBackendList.get_albabackends()
        changes_required = False
        for alba_backend in alba_backends:
            if alba_backend.abm_cluster is None or len(alba_backend.nsm_clusters) == 0:
                changes_required = True
                break

        if changes_required:
            # Retrieve ABM and NSM clusters
            abm_cluster_info = []
            nsm_cluster_info = []
            for cluster_name in Configuration.list('/ovs/arakoon'):
                try:
                    metadata = ArakoonInstaller.get_arakoon_metadata_by_cluster_name(cluster_name=cluster_name)
                    if metadata['cluster_type'] == ServiceType.ARAKOON_CLUSTER_TYPES.ABM:
                        abm_cluster_info.append(metadata)
                    elif metadata['cluster_type'] == ServiceType.ARAKOON_CLUSTER_TYPES.NSM:
                        nsm_cluster_info.append(metadata)
                except NotFoundException:
                    continue

            # Retrieve NSM Arakoon cluster information
            cluster_arakoon_map = {}
            for cluster_info in abm_cluster_info + nsm_cluster_info:
                cluster_name = cluster_info['cluster_name']
                arakoon_config = ArakoonClusterConfig(cluster_id=cluster_name)
                cluster_arakoon_map[cluster_name] = arakoon_config.export_dict()

            storagerouter_map = dict((storagerouter.machine_id, storagerouter)
                                     for storagerouter in StorageRouterList.get_storagerouters())
            alba_backend_id_map = dict((alba_backend.alba_id, alba_backend)
                                       for alba_backend in alba_backends)
            for cluster_info in abm_cluster_info:
                internal = cluster_info['internal']
                cluster_name = cluster_info['cluster_name']
                config_location = Configuration.get_configuration_path(key=ArakoonClusterConfig.CONFIG_KEY.format(cluster_name))
                try:
                    alba_id = AlbaCLI.run(command='get-alba-id', config=config_location, named_params={'attempts': 3})['id']
                    nsm_hosts = AlbaCLI.run(command='list-nsm-hosts', config=config_location, named_params={'attempts': 3})
                except RuntimeError:
                    continue
                alba_backend = alba_backend_id_map.get(alba_id)
                if alba_backend is None:  # ALBA Backend with ID not found in model
                    continue
                if alba_backend.abm_cluster is not None and len(alba_backend.nsm_clusters) > 0:  # Clusters already exist
                    continue

                # Create ABM Cluster
                if alba_backend.abm_cluster is None:
                    abm_cluster = ABMCluster()
                    abm_cluster.name = cluster_name
                    abm_cluster.alba_backend = alba_backend
                    abm_cluster.config_location = ArakoonClusterConfig.CONFIG_KEY.format(cluster_name)
                    abm_cluster.save()
                else:
                    abm_cluster = alba_backend.abm_cluster

                # Create ABM Services
                abm_arakoon_config = cluster_arakoon_map[cluster_name]
                abm_arakoon_config.pop('global')
                # NOTE(review): dict.keys() returns a list in Python 2, so arakoon_nodes[index]
                # below is valid — confirm if this file is ever run under Python 3
                arakoon_nodes = abm_arakoon_config.keys()
                if internal is False:
                    # Externally managed cluster: model a single service without ports/storagerouter
                    services_to_create = 1
                else:
                    if set(arakoon_nodes).difference(set(storagerouter_map.keys())):
                        # At least one Arakoon node is not a known StorageRouter: skip this backend
                        continue
                    services_to_create = len(arakoon_nodes)
                for index in range(services_to_create):
                    service = Service()
                    service.name = 'arakoon-{0}-abm'.format(alba_backend.name)
                    service.type = ServiceTypeList.get_by_name(ServiceType.SERVICE_TYPES.ALBA_MGR)
                    if internal is True:
                        arakoon_node_config = abm_arakoon_config[arakoon_nodes[index]]
                        service.ports = [arakoon_node_config['client_port'],
                                         arakoon_node_config['messaging_port']]
                        service.storagerouter = storagerouter_map[arakoon_nodes[index]]
                    else:
                        service.ports = []
                        service.storagerouter = None
                    service.save()

                    abm_service = ABMService()
                    abm_service.service = service
                    abm_service.abm_cluster = abm_cluster
                    abm_service.save()

                # Create NSM Clusters
                for cluster_index, nsm_host in enumerate(sorted(nsm_hosts,
                                                                key=lambda host: ExtensionsToolbox.advanced_sort(host['cluster_id'], '_'))):
                    nsm_cluster_name = nsm_host['cluster_id']
                    nsm_arakoon_config = cluster_arakoon_map.get(nsm_cluster_name)
                    if nsm_arakoon_config is None:
                        continue

                    # Internal clusters carry their number in the name suffix; external ones
                    # are numbered by their (sorted) position
                    number = cluster_index if internal is False else int(nsm_cluster_name.split('_')[-1])
                    nsm_cluster = NSMCluster()
                    nsm_cluster.name = nsm_cluster_name
                    nsm_cluster.number = number
                    nsm_cluster.alba_backend = alba_backend
                    nsm_cluster.config_location = ArakoonClusterConfig.CONFIG_KEY.format(nsm_cluster_name)
                    nsm_cluster.save()

                    # Create NSM Services (same internal/external handling as the ABM services above)
                    nsm_arakoon_config.pop('global')
                    arakoon_nodes = nsm_arakoon_config.keys()
                    if internal is False:
                        services_to_create = 1
                    else:
                        if set(arakoon_nodes).difference(set(storagerouter_map.keys())):
                            continue
                        services_to_create = len(arakoon_nodes)
                    for service_index in range(services_to_create):
                        service = Service()
                        service.name = 'arakoon-{0}-nsm_{1}'.format(alba_backend.name, number)
                        service.type = ServiceTypeList.get_by_name(ServiceType.SERVICE_TYPES.NS_MGR)
                        if internal is True:
                            arakoon_node_config = nsm_arakoon_config[arakoon_nodes[service_index]]
                            service.ports = [arakoon_node_config['client_port'],
                                             arakoon_node_config['messaging_port']]
                            service.storagerouter = storagerouter_map[arakoon_nodes[service_index]]
                        else:
                            service.ports = []
                            service.storagerouter = None
                        service.save()

                        nsm_service = NSMService()
                        nsm_service.service = service
                        nsm_service.nsm_cluster = nsm_cluster
                        nsm_service.save()

        # Clean up all junction services no longer linked to an ALBA Backend
        all_nsm_services = [service.nsm_service for service in ServiceTypeList.get_by_name(ServiceType.SERVICE_TYPES.NS_MGR).services
                            if service.nsm_service.nsm_cluster is None]
        all_abm_services = [service.abm_service for service in ServiceTypeList.get_by_name(ServiceType.SERVICE_TYPES.ALBA_MGR).services
                            if service.abm_service.abm_cluster is None]
        for abm_service in all_abm_services:
            abm_service.delete()
            abm_service.service.delete()
        for nsm_service in all_nsm_services:
            nsm_service.delete()
            nsm_service.service.delete()

        ################################
        # Introduction of Active Drive #
        ################################
        # Update slot_id and Alba Node relation for all OSDs
        client = PersistentFactory.get_client()
        disk_osd_map = {}
        for key, data in client.prefix_entries('ovs_data_albaosd_'):
            alba_disk_guid = data.get('alba_disk', {}).get('guid')
            if alba_disk_guid is not None:
                if alba_disk_guid not in disk_osd_map:
                    disk_osd_map[alba_disk_guid] = []
                # Map AlbaDisk guid -> list of OSD guids (the key minus its data prefix)
                disk_osd_map[alba_disk_guid].append(key.replace('ovs_data_albaosd_', ''))
            try:
                value = client.get(key)
                value.pop('alba_disk', None)
                client.set(key=key, value=value)
            except Exception:
                pass  # We don't care if we would have any leftover AlbaDisk information in _data, but its cleaner not to

        alba_guid_node_map = dict((an.guid, an) for an in AlbaNodeList.get_albanodes())
        for key, data in client.prefix_entries('ovs_data_albadisk_'):
            alba_disk_guid = key.replace('ovs_data_albadisk_', '')
            alba_node_guid = data.get('alba_node', {}).get('guid')
            if alba_disk_guid in disk_osd_map and alba_node_guid in alba_guid_node_map and len(data.get('aliases', [])) > 0:
                # The last path component of the first alias becomes the OSD's slot id
                slot_id = data['aliases'][0].split('/')[-1]
                for osd_guid in disk_osd_map[alba_disk_guid]:
                    try:
                        osd = AlbaOSD(osd_guid)
                    except ObjectNotFoundException:
                        continue
                    osd.slot_id = slot_id
                    osd.alba_node = alba_guid_node_map[alba_node_guid]
                    osd.save()
            # AlbaDisk data itself is removed in all cases
            client.delete(key=key, must_exist=False)

        # Remove unique constraints for AlbaNode IP
        for key in client.prefix('ovs_unique_albanode_ip_'):
            client.delete(key=key, must_exist=False)

        # Remove relation for all Alba Disks
        for key in client.prefix('ovs_reverseindex_albadisk_'):
            client.delete(key=key, must_exist=False)

        # Remove the relation between AlbaNode and AlbaDisk
        for key in client.prefix('ovs_reverseindex_albanode_'):
            if '|disks|' in key:
                client.delete(key=key, must_exist=False)

    return DALMigrator.THIS_VERSION
def get_backend_stats():
    """
    Collect statistics for each ALBA Backend and send them to InfluxDB.
    :return: The list of measurement points that was sent, or None when nothing could be collected
    :rtype: list or NoneType
    """
    points = []
    abm_names = []
    node_ids = []  # renamed from 'abs', which shadowed the builtin abs()
    for service in ServiceList.get_services():
        if service.type.name == ServiceType.SERVICE_TYPES.ALBA_MGR:
            abm_names.append(service.name)
    for node in AlbaNodeList.get_albanodes():
        node_ids.append(node.node_id)
    abm_names = list(set(abm_names))
    if len(abm_names) == 0:
        # Previously this raised an unguarded IndexError on abm_names[0]; bail out gracefully instead
        StatsmonkeyScheduledTaskController._logger.error('No ALBA Manager services found')
        return None
    config = "etcd://127.0.0.1:2379/ovs/arakoon/{}/config".format(abm_names[0])
    try:
        # NOTE(review): other call sites in this file pass 'as_json=True' to AlbaCLI.run;
        # 'to_json=True' is kept here as-is — confirm which keyword this AlbaCLI version expects
        decommissioning_osds = AlbaCLI.run('list-decommissioning-osds', config=config, to_json=True)
    except Exception as ex:
        StatsmonkeyScheduledTaskController._logger.error('{0}'.format(ex.message))
        return None
    # Set membership instead of a per-node list scan (assumes node ids are unique, as modelled)
    known_node_ids = set(node_ids)
    filtered_osds = [osd for osd in decommissioning_osds if osd['node_id'] in known_node_ids]
    for backend in AlbaBackendList.get_albabackends():
        try:
            stat = {'measurement': 'backend_stats',
                    'tags': {'backend_name': backend.name},
                    'fields': {'gets': backend.statistics['multi_get']['n'],
                               'puts': backend.statistics['apply']['n']}}
            # Per-status ASD counters; 'decommissioning' is the cluster-wide filtered count
            status_counts = {'decommissioning': len(filtered_osds),
                             'decommissioned': 0,
                             'claimed': 0,
                             'warning': 0,
                             'failure': 0,
                             'error': 0}
            for disks in backend.local_stack.values():
                for disk in disks.values():
                    for asd in disk['asds'].values():
                        if asd['alba_backend_guid'] != backend.guid:
                            continue
                        status = asd['status']
                        # A decommissioned detail overrides the generic status
                        if asd['status_detail'] == 'decommissioned':
                            status = asd['status_detail']
                        status_counts[status] = status_counts.get(status, 0) + 1
            stat['fields'].update(status_counts)
            points.append(stat)
        except Exception as ex:
            # Best effort per backend: log and continue with the remaining backends
            StatsmonkeyScheduledTaskController._logger.error(ex.message)
    if len(points) == 0:
        StatsmonkeyScheduledTaskController._logger.info("No statistics found")
        return None
    StatsmonkeyScheduledTaskController._send_stats(points)
    return points