def register(node_id):
        """
        Adds a Node with a given node_id to the model
        :param node_id: ID of the ALBA node
        :type node_id: str

        :return: None
        """
        node = AlbaNodeList.get_albanode_by_node_id(node_id)
        if node is None:
            main_config = EtcdConfiguration.get('/ovs/alba/asdnodes/{0}/config/main'.format(node_id))
            node = AlbaNode()
            node.ip = main_config['ip']
            node.port = main_config['port']
            node.username = main_config['username']
            node.password = main_config['password']
            node.storagerouter = StorageRouterList.get_by_ip(main_config['ip'])
        data = node.client.get_metadata()
        if data['_success'] is False and data['_error'] == 'Invalid credentials':
            raise RuntimeError('Invalid credentials')
        if data['node_id'] != node_id:
            AlbaNodeController._logger.error('Unexpected node_id: {0} vs {1}'.format(data['node_id'], node_id))
            raise RuntimeError('Unexpected node identifier')
        node.node_id = node_id
        node.type = 'ASD'
        node.save()

        # increase maintenance agents count for all nodes by 1
        for backend in AlbaBackendList.get_albabackends():
            nr_of_agents_key = AlbaNodeController.NR_OF_AGENTS_ETCD_TEMPLATE.format(backend.guid)
            if EtcdConfiguration.exists(nr_of_agents_key):
                EtcdConfiguration.set(nr_of_agents_key, int(EtcdConfiguration.get(nr_of_agents_key) + 1))
            else:
                EtcdConfiguration.set(nr_of_agents_key, 1)
        AlbaNodeController.checkup_maintenance_agents()
Example #2
0
    def restart_slot(node_guid, slot_id):
        """
        Restarts a slot
        :param node_guid: Guid of the ALBA Node to restart a slot on
        :type node_guid: str
        :param slot_id: ID of the slot (eg. pci-0000:03:00.0-sas-0x5000c29f4cf04566-lun-0)
        :type slot_id: str
        :return: None
        :rtype: NoneType
        """
        node = AlbaNode(node_guid)
        AlbaNodeController._logger.debug(
            'Restarting slot {0} on node {1}'.format(slot_id, node.ip))
        try:
            if slot_id not in node.client.get_stack():
                AlbaNodeController._logger.exception(
                    'Slot {0} not available for restart on ALBA Node {1}'.
                    format(slot_id, node.ip))
                raise RuntimeError('Could not find slot')
        except (requests.ConnectionError, requests.Timeout):
            AlbaNodeController._logger.warning(
                'Could not connect to node {0} to validate slot'.format(
                    node.guid))
            raise

        result = node.client.restart_slot(slot_id=slot_id)
        if result['_success'] is False:
            raise RuntimeError('Error restarting slot: {0}'.format(
                result['_error']))
        for backend in AlbaBackendList.get_albabackends():
            backend.invalidate_dynamics()
    def restart_disk(node_guid, disk):
        """
        Restarts a disk
        :param node_guid: Guid of the node to restart a disk of
        :type node_guid: str

        :param disk: Disk name to be restarted
        :type disk: str

        :return: None
        """
        node = AlbaNode(node_guid)
        AlbaNodeController._logger.debug('Restarting disk {0} at node {1}'.format(disk, node.ip))
        try:
            disks = node.client.get_disks()
            if disk not in disks:
                AlbaNodeController._logger.exception('Disk {0} not available for restart on node {1}'.format(disk, node.ip))
                raise RuntimeError('Could not find disk')
        except (requests.ConnectionError, requests.Timeout):
            AlbaNodeController._logger.warning('Could not connect to node {0} to validate disk'.format(node.guid))
            raise

        result = node.client.restart_disk(disk)
        if result['_success'] is False:
            raise RuntimeError('Error restarting disk: {0}'.format(result['_error']))
        for backend in AlbaBackendList.get_albabackends():
            backend.invalidate_dynamics()
 def _all_disks(self):
     """
     Returns a live list of all disks on this node
     """
     try:
         disks = self.client.get_disks(reraise=True)
     except (requests.ConnectionError, requests.Timeout):
         from ovs.dal.lists.albabackendlist import AlbaBackendList
         disks = []
         asd_ids = []
         for backend in AlbaBackendList.get_albabackends():
             # All backends of this node
             config = 'etcd://127.0.0.1:2379/ovs/arakoon/{0}-abm/config'.format(backend.name)
             osds = AlbaCLI.run('list-all-osds', config=config, as_json=True)
             for osd in osds:
                 node_id = osd.get('node_id')
                 asd_id = osd.get('long_id')
                 decommissioned = osd.get('decommissioned')
                 if node_id == self.node_id and asd_id not in asd_ids and decommissioned is False:
                     asd_ids.append(asd_id)
                     disks.append({'asd_id': asd_id,
                                   'node_id': osd.get('node_id'),
                                   'port': osd.get('port'),
                                   'ips': osd.get('ips'),
                                   'available': False,
                                   'state': {'state': 'error', 'detail': 'nodedown'},
                                   'log_level': 'info',
                                   'device': asd_id,
                                   'home': asd_id,
                                   'mountpoint': asd_id,
                                   'name': asd_id,
                                   'usage': {'available': 0, 'size': 0, 'used': 0}})
     return disks
    def restart_disk(node_guid, device_alias):
        """
        Restarts a disk
        :param node_guid: Guid of the node to restart a disk of
        :type node_guid: str
        :param device_alias: Alias of the device to restart  (eg: /dev/disk/by-path/pci-0000:03:00.0-sas-0x5000c29f4cf04566-lun-0)
        :type device_alias: str
        :return: None
        """
        node = AlbaNode(node_guid)
        device_id = device_alias.split('/')[-1]
        AlbaNodeController._logger.debug('Restarting disk {0} at node {1}'.format(device_alias, node.ip))
        try:
            if device_id not in node.client.get_disks():
                AlbaNodeController._logger.exception('Disk {0} not available for restart on node {1}'.format(device_alias, node.ip))
                raise RuntimeError('Could not find disk')
        except (requests.ConnectionError, requests.Timeout):
            AlbaNodeController._logger.warning('Could not connect to node {0} to validate disk'.format(node.guid))
            raise

        result = node.client.restart_disk(disk_id=device_id)
        if result['_success'] is False:
            raise RuntimeError('Error restarting disk: {0}'.format(result['_error']))
        for backend in AlbaBackendList.get_albabackends():
            backend.invalidate_dynamics()
Example #6
0
    def remove_node(node_guid):
        """
        Removes an ALBA node
        :param node_guid: Guid of the ALBA node to remove
        :type node_guid: str
        :return: None
        :rtype: NoneType
        """
        node = AlbaNode(node_guid)
        if node.type == AlbaNode.NODE_TYPES.ASD:
            for slot_id, slot_info in node.stack.iteritems():
                for osd_id, osd_info in slot_info['osds'].iteritems():
                    if AlbaOSDList.get_by_osd_id(osd_id=osd_id) is not None:
                        AlbaNodeController.remove_osd(node_guid=node.guid,
                                                      osd_id=osd_id,
                                                      expected_safety=None)
                if slot_info['available'] is False:
                    AlbaNodeController.remove_slot(node_guid=node.guid,
                                                   slot_id=slot_id)

            name_guid_map = dict(
                (alba_backend.name, alba_backend.guid)
                for alba_backend in AlbaBackendList.get_albabackends())
            try:
                # This loop will delete the services AND their configuration from the configuration management
                node.invalidate_dynamics('maintenance_services')
                for alba_backend_name, service_info in node.maintenance_services.iteritems(
                ):
                    for service_name, status in service_info:
                        node.client.remove_maintenance_service(
                            name=service_name,
                            alba_backend_guid=name_guid_map.get(
                                alba_backend_name))
            except (requests.ConnectionError, requests.Timeout):
                AlbaNodeController._logger.exception(
                    'Could not connect to node {0} to retrieve the maintenance services'
                    .format(node.guid))
            except InvalidCredentialsError:
                AlbaNodeController._logger.warning(
                    'Failed to retrieve the maintenance services for ALBA node {0}'
                    .format(node.node_id))

        node.delete()
        for alba_backend in AlbaBackendList.get_albabackends():
            alba_backend.invalidate_dynamics(['live_status'])
            alba_backend.backend.invalidate_dynamics(['live_status'])
        AlbaController.checkup_maintenance_agents.delay()
    def remove_disk(node_guid, device_alias):
        """
        Removes a disk
        :param node_guid: Guid of the node to remove a disk from
        :type node_guid: str
        :param device_alias: Alias of the device to remove  (eg: /dev/disk/by-path/pci-0000:03:00.0-sas-0x5000c29f4cf04566-lun-0)
        :type device_alias: str
        :return: None
        """
        asds = {}
        node = AlbaNode(node_guid)
        node_id = node.node_id
        device_id = device_alias.split('/')[-1]
        offline_node = False

        # Verify client connectivity
        try:
            _ = node.client.get_disks()
        except (requests.ConnectionError, requests.Timeout, InvalidCredentialsError):
            AlbaNodeController._logger.warning('Could not connect to node {0} to validate disks'.format(node.guid))
            offline_node = True

        # Retrieve ASD information for the ALBA Disk
        for backend in AlbaBackendList.get_albabackends():
            local_stack = backend.local_stack
            if node_id in local_stack and device_id in local_stack[node_id]:
                asds.update(local_stack[node_id][device_id]['asds'])
        for asd_info in asds.values():
            if (offline_node is False and asd_info.get('status') != 'available') or (offline_node is True and asd_info.get('status_detail') == 'nodedown'):
                AlbaNodeController._logger.error('Disk {0} has still non-available ASDs on node {1}'.format(device_alias, node.ip))
                raise RuntimeError('Disk {0} on ALBA node {1} has still some non-available ASDs'.format(device_alias, node_id))

        # Retrieve the Disk from the framework model matching the ALBA Disk
        disk_to_clear = None
        for disk in DiskList.get_disks():
            if device_alias in disk.aliases:
                disk_to_clear = disk
                break

        # Remove the ALBA Disk making use of the ASD Manager Client
        if offline_node is False:
            result = node.client.remove_disk(disk_id=device_id, partition_aliases=disk_to_clear.partitions[0].aliases if len(disk_to_clear.partitions) > 0 else [])
            if result['_success'] is False:
                raise RuntimeError('Error removing disk {0}: {1}'.format(device_alias, result['_error']))

        # Clean the model
        for model_disk in node.disks:
            if device_alias in model_disk.aliases:
                for osd in model_disk.osds:
                    osd.delete()
                model_disk.delete()
        if disk_to_clear is not None:
            for partition in disk_to_clear.partitions:
                partition.roles = []
                partition.mountpoint = None
                partition.save()
        node.invalidate_dynamics()
        if node.storagerouter is not None:
            DiskController.sync_with_reality(storagerouter_guid=node.storagerouter_guid)
    def get_albabackends():
        """
        Fetches all the alba backends on the cluster

        :return: alba backends
        :rtype: list
        """

        return AlbaBackendList.get_albabackends()
Example #9
0
    def get_stats_nsms(cls):
        """
        Retrieve the amount of NSMs deployed and their statistics
        """
        if cls._config is None:
            cls.validate_and_retrieve_config()

        stats = []
        errors = False
        environment = cls._config['environment']
        for alba_backend in AlbaBackendList.get_albabackends():
            for nsm in alba_backend.nsm_clusters:
                stats.append({
                    'tags': {
                        'nsm_number': nsm.number,
                        'environment': environment,
                        'backend_name': alba_backend.name,
                        'abm_service_name': alba_backend.abm_cluster.name
                    },
                    'fields': {
                        'load': float(AlbaArakoonController.get_load(nsm))
                    },
                    'measurement': 'nsm'
                })

            config_path = Configuration.get_configuration_path(
                alba_backend.abm_cluster.config_location)
            try:
                nsm_host_ids = [
                    nsm_host['id']
                    for nsm_host in AlbaCLI.run(command='list-nsm-hosts',
                                                config=config_path)
                ]
                nsm_hosts_statistics = AlbaCLI.run(
                    command='nsm-hosts-statistics',
                    config=config_path,
                    named_params={'nsm-hosts': ','.join(nsm_host_ids)})
                for nsm_host_id, statistics in nsm_hosts_statistics.iteritems(
                ):
                    stats.append({
                        'tags': {
                            'nsm_name': nsm_host_id,
                            'environment': environment,
                            'backend_name': alba_backend.name
                        },
                        'fields':
                        cls._convert_to_float_values(statistics['statistics']),
                        'measurement':
                        'nsm_statistic'
                    })
            except Exception:
                errors = True
                cls._logger.exception(
                    'Retrieving NSM statistics for ALBA Backend {0} failed'.
                    format(alba_backend.name))
        return errors, stats
    def get_by_name(name):
        """
        Retrieve an ALBA backend object based on its name
        :param name: Name of the ALBA backend
        :type name: str

        :return: ALBA backend or None
         :rtype: AlbaBackend
        """
        for alba_backend in AlbaBackendList.get_albabackends():
            if alba_backend.name == name:
                return alba_backend
Example #11
0
    def get_stats_osds(cls):
        """
        Retrieve the OSD statistics for all ALBA Backends
        """
        def _get_stats_osds_for_alba_backend(alba_backend, statistics,
                                             errored_calls):
            try:
                for osd_id, result in alba_backend.osd_statistics.iteritems():
                    # Remove the 'version' key as it is a non-numeric value
                    result.pop('version', None)
                    statistics.append({
                        'tags': {
                            'guid': alba_backend.guid,
                            'long_id': osd_id,
                            'environment': environment,
                            'backend_name': alba_backend.name
                        },
                        'fields':
                        cls._convert_to_float_values(result),
                        'measurement':
                        'asd'
                    })
            except Exception:
                errored_calls.append(alba_backend.name)
                cls._logger.exception(
                    'Retrieving OSD statistics failed for ALBA Backend {0}'.
                    format(alba_backend.name))

        if cls._config is None:
            cls.validate_and_retrieve_config()

        stats = []
        errors = []
        threads = []
        environment = cls._config['environment']
        for ab in AlbaBackendList.get_albabackends():
            thread = Thread(name=ab.name,
                            target=_get_stats_osds_for_alba_backend,
                            args=(ab, stats, errors))
            thread.start()
            threads.append(thread)

        for thr in threads:
            thr.join(timeout=20)

        if len(errors) > 0:
            raise Exception(
                'Retrieving OSD statistics failed for ALBA Backends:\n * {0}'.
                format('\n * '.join(errors)))
        return False, stats
 def list(self, request):
     """
     Lists all available ALBA Backends:
     :param request: The raw request
     :type request: Request
     """
     backends = AlbaBackendList.get_albabackends()
     allowed_backends = []
     for alba_backend in backends:
         if Toolbox.access_granted(request.client,
                                   user_rights=alba_backend.backend.user_rights,
                                   client_rights=alba_backend.backend.client_rights):
             allowed_backends.append(alba_backend)
     return allowed_backends
Example #13
0
    def get_stats_alba_backends(cls):
        """
        Retrieve statistics about all ALBA Backends and their maintenance work
        """
        if cls._config is None:
            cls.validate_and_retrieve_config()

        stats = []
        errors = False
        environment = cls._config['environment']
        for alba_backend in AlbaBackendList.get_albabackends():
            try:
                local_summary = alba_backend.local_summary
                sizes = local_summary['sizes']
                devices = local_summary['devices']
                stats.append({
                    'tags': {
                        'environment': environment,
                        'backend_name': alba_backend.name
                    },
                    'fields': {
                        'red':
                        int(devices['red']),
                        'free':
                        float(sizes['size'] - sizes['used']),
                        'used':
                        float(sizes['used']),
                        'green':
                        int(devices['green']),
                        'orange':
                        int(devices['orange']),
                        'maintenance_work':
                        int(
                            AlbaCLI.run(
                                command='list-work',
                                config=Configuration.get_configuration_path(
                                    alba_backend.abm_cluster.config_location))
                            ['count'])
                    },
                    'measurement': 'backend'
                })
            except Exception:
                errors = True
                cls._logger.exception(
                    'Retrieving statistics for ALBA Backend {0} failed'.format(
                        alba_backend.name))
        return errors, stats
Example #14
0
 def list(self, request):
     """
     Lists all available ALBA Backends:
     :param request: The raw request
     :type request: Request
     :return: All ALBA Backends for which the current user has permissions
     :rtype: list
     """
     backends = AlbaBackendList.get_albabackends()
     allowed_backends = []
     for alba_backend in backends:
         if ApiToolbox.access_granted(
                 request.client,
                 user_rights=alba_backend.backend.user_rights,
                 client_rights=alba_backend.backend.client_rights):
             allowed_backends.append(alba_backend)
     return allowed_backends
    def remove_disk(node_guid, disk):
        """
        Removes a disk
        :param node_guid: Guid of the node to remove a disk from
        :type node_guid: str

        :param disk: Disk name to remove
        :type disk: str

        :return: None
        """
        node = AlbaNode(node_guid)
        offline_node = False
        try:
            if disk not in node.client.get_disks():
                raise RuntimeError('Disk {0} not available on node {1}'.format(disk, node.guid))
        except (requests.ConnectionError, requests.Timeout):
            AlbaNodeController._logger.warning('Could not connect to node {0} to validate disks'.format(node.guid))
            offline_node = True
        node_id = node.node_id
        asds = {}
        for backend in AlbaBackendList.get_albabackends():
            storage_stack = backend.storage_stack
            if node_id in storage_stack and disk in storage_stack[node_id]:
                asds.update(storage_stack[node_id][disk]['asds'])
        for asd_info in asds.values():
            if (offline_node is False and asd_info['status'] != 'available') or (offline_node is True and asd_info['status_detail'] == 'nodedown'):
                AlbaNodeController._logger.error('Disk {0} has still non-available ASDs on node {1}'.format(disk, node.ip))
                raise RuntimeError('Disk {0} has still some non-available ASDs'.format(disk))
        if offline_node is False:
            result = node.client.remove_disk(disk)
            if result['_success'] is False:
                raise RuntimeError('Error removing disk {0}: {1}'.format(disk, result['_error']))
        for model_disk in node.disks:
            if model_disk.name == disk:
                for asd in model_disk.asds:
                    asd.delete()
                model_disk.delete()
        node.invalidate_dynamics()
        if node.storagerouter is not None:
            DiskController.sync_with_reality(node.storagerouter_guid)
    def verify_namespaces():
        """
        Verify namespaces for all backends
        """
        logger.info('verify namespace task scheduling started')

        job_factor = 10
        job_factor_key = '/ovs/alba/backends/job_factor'
        if EtcdConfiguration.exists(job_factor_key):
            job_factor = EtcdConfiguration.get(job_factor_key)
        else:
            EtcdConfiguration.set(job_factor_key, job_factor)

        for albabackend in AlbaBackendList.get_albabackends():
            config = 'etcd://127.0.0.1:2379/ovs/arakoon/{0}-abm/config'.format(albabackend.backend.name)
            namespaces = AlbaCLI.run('list-namespaces', config=config, as_json=True)
            for namespace in namespaces:
                logger.info('verifying namespace: {0} scheduled ...'.format(namespace['name']))
                AlbaCLI.run('verify-namespace {0} --factor={1}'.format(namespace['name'], job_factor))

        logger.info('verify namespace task scheduling finished')
Example #17
0
    def get_albabackend_by_name(albabackend_name):
        """
        Get a Albabackend by name

        :param albabackend_name: albabackend name
        :type albabackend_name: str
        :return: alba backend object
        :rtype: ovs.dal.hybrids.albabackend
        """

        try:
            return [
                alba_backend
                for alba_backend in AlbaBackendList.get_albabackends()
                if alba_backend.name == albabackend_name
            ][0]
        except IndexError:
            error_msg = "No Alba backend found with name: {0}".format(
                albabackend_name)
            BackendHelper.LOGGER.error(error_msg)
            raise NameError(error_msg)
    def verify_namespaces():
        """
        Verify namespaces for all backends
        """
        AlbaScheduledTaskController._logger.info('verify namespace task scheduling started')

        verification_factor = 10
        verification_factor_key = '/ovs/alba/backends/verification_factor'
        if EtcdConfiguration.exists(verification_factor_key):
            verification_factor = EtcdConfiguration.get(verification_factor_key)
        else:
            EtcdConfiguration.set(verification_factor_key, verification_factor)

        for albabackend in AlbaBackendList.get_albabackends():
            backend_name = albabackend.abm_services[0].service.name if albabackend.abm_services else albabackend.name + '-abm'
            config = 'etcd://127.0.0.1:2379/ovs/arakoon/{0}/config'.format(backend_name)
            namespaces = AlbaCLI.run('list-namespaces', config=config, as_json=True)
            for namespace in namespaces:
                AlbaScheduledTaskController._logger.info('verifying namespace: {0} scheduled ...'.format(namespace['name']))
                AlbaCLI.run('verify-namespace {0} --factor={1}'.format(namespace['name'], verification_factor))

        AlbaScheduledTaskController._logger.info('verify namespace task scheduling finished')
    def _stack(self):
        """
        Returns an overview of this node's storage stack
        """
        from ovs.dal.hybrids.albabackend import AlbaBackend
        from ovs.dal.lists.albabackendlist import AlbaBackendList

        def _move(info):
            for move in [('state', 'status'),
                         ('state_detail', 'status_detail')]:
                if move[0] in info:
                    info[move[1]] = info[move[0]]
                    del info[move[0]]

        stack = {}
        node_down = False
        # Fetch stack from asd-manager
        try:
            remote_stack = self.client.get_stack()
            for slot_id, slot_data in remote_stack.iteritems():
                stack[slot_id] = {'status': 'ok'}
                stack[slot_id].update(slot_data)
                # Migrate state > status
                _move(stack[slot_id])
                for osd_data in slot_data.get('osds', {}).itervalues():
                    _move(osd_data)
        except (requests.ConnectionError, requests.Timeout,
                InvalidCredentialsError):
            self._logger.warning(
                'Error during stack retrieval. Assuming that the node is down')
            node_down = True

        model_osds = {}
        found_osds = {}
        # Apply own model to fetched stack
        for osd in self.osds:
            model_osds[osd.osd_id] = osd  # Initially set the info
            if osd.slot_id not in stack:
                stack[osd.slot_id] = {
                    'status':
                    self.OSD_STATUSES.UNKNOWN
                    if node_down is True else self.OSD_STATUSES.MISSING,
                    'status_detail':
                    self.OSD_STATUS_DETAILS.NODEDOWN
                    if node_down is True else '',
                    'osds': {}
                }
            osd_data = stack[osd.slot_id]['osds'].get(osd.osd_id, {})
            stack[osd.slot_id]['osds'][
                osd.osd_id] = osd_data  # Initially set the info in the stack
            osd_data.update(osd.stack_info)
            if node_down is True:
                osd_data['status'] = self.OSD_STATUSES.UNKNOWN
                osd_data['status_detail'] = self.OSD_STATUS_DETAILS.NODEDOWN
            elif osd.alba_backend_guid is not None:  # Osds has been claimed
                # Load information from alba
                if osd.alba_backend_guid not in found_osds:
                    found_osds[osd.alba_backend_guid] = {}
                    if osd.alba_backend.abm_cluster is not None:
                        config = Configuration.get_configuration_path(
                            osd.alba_backend.abm_cluster.config_location)
                        try:
                            for found_osd in AlbaCLI.run(
                                    command='list-all-osds', config=config):
                                found_osds[osd.alba_backend_guid][
                                    found_osd['long_id']] = found_osd
                        except (AlbaError, RuntimeError):
                            self._logger.exception(
                                'Listing all osds has failed')
                            osd_data['status'] = self.OSD_STATUSES.UNKNOWN
                            osd_data[
                                'status_detail'] = self.OSD_STATUS_DETAILS.ALBAERROR
                            continue

                if osd.osd_id not in found_osds[osd.alba_backend_guid]:
                    # Not claimed by any backend thus not in use
                    continue
                found_osd = found_osds[osd.alba_backend_guid][osd.osd_id]
                if found_osd['decommissioned'] is True:
                    osd_data['status'] = self.OSD_STATUSES.UNAVAILABLE
                    osd_data[
                        'status_detail'] = self.OSD_STATUS_DETAILS.DECOMMISSIONED
                    continue

                backend_interval_key = '/ovs/alba/backends/{0}/gui_error_interval'.format(
                    osd.alba_backend_guid)
                if Configuration.exists(backend_interval_key):
                    interval = Configuration.get(backend_interval_key)
                else:
                    interval = Configuration.get(
                        '/ovs/alba/backends/global_gui_error_interval')
                read = found_osd['read'] or [0]
                write = found_osd['write'] or [0]
                errors = found_osd['errors']
                osd_data['status'] = self.OSD_STATUSES.WARNING
                osd_data['status_detail'] = self.OSD_STATUS_DETAILS.ERROR
                if len(errors) == 0 or (len(read + write) > 0
                                        and max(min(read), min(write)) >
                                        max(error[0]
                                            for error in errors) + interval):
                    osd_data['status'] = self.OSD_STATUSES.OK
                    osd_data['status_detail'] = ''

        statistics = {}
        for slot_info in stack.itervalues():
            for osd_id, osd in slot_info['osds'].iteritems():
                if osd.get(
                        'status_detail') == self.OSD_STATUS_DETAILS.ACTIVATING:
                    osd['claimed_by'] = 'unknown'  # We won't be able to connect to it just yet
                    continue
                if osd_id not in model_osds:
                    # The osd is known by the remote node but not in the model
                    # In that case, let's connect to the OSD to see whether we get some info from it
                    try:
                        ips = osd['hosts'] if 'hosts' in osd and len(
                            osd['hosts']) > 0 else osd.get('ips', [])
                        port = osd['port']
                        claimed_by = 'unknown'
                        for ip in ips:
                            try:
                                # Output will be None if it is not claimed
                                claimed_by = AlbaCLI.run('get-osd-claimed-by',
                                                         named_params={
                                                             'host': ip,
                                                             'port': port
                                                         })
                                break
                            except (AlbaError, RuntimeError):
                                self._logger.warning(
                                    'get-osd-claimed-by failed for IP:port {0}:{1}'
                                    .format(ip, port))
                        alba_backend = AlbaBackendList.get_by_alba_id(
                            claimed_by)
                        osd['claimed_by'] = alba_backend.guid if alba_backend is not None else claimed_by
                    except KeyError:
                        osd['claimed_by'] = 'unknown'
                    except:
                        self._logger.exception(
                            'Could not load OSD info: {0}'.format(osd_id))
                        osd['claimed_by'] = 'unknown'
                        if osd.get('status') not in ['error', 'warning']:
                            osd['status'] = self.OSD_STATUSES.ERROR
                            osd['status_detail'] = self.OSD_STATUS_DETAILS.UNREACHABLE
                claimed_by = osd.get('claimed_by', 'unknown')
                if claimed_by == 'unknown':
                    continue
                try:
                    alba_backend = AlbaBackend(claimed_by)
                except ObjectNotFoundException:
                    continue
                # Add usage information
                if alba_backend not in statistics:
                    statistics[alba_backend] = alba_backend.osd_statistics
                osd_statistics = statistics[alba_backend]
                if osd_id not in osd_statistics:
                    continue
                stats = osd_statistics[osd_id]
                osd['usage'] = {
                    'size': int(stats['capacity']),
                    'used': int(stats['disk_usage']),
                    'available': int(stats['capacity'] - stats['disk_usage'])
                }
        return stack
 def list(self):
     """
     Lists all available ALBA Backends
     """
     return AlbaBackendList.get_albabackends()
 def monitor_arakoon_clusters(cls):
     """
     Get an overview of where the Arakoon clusters for each ALBA Backend have been deployed
     The overview is printed on stdout
     :return: None
     """
     try:
         while True:
             output = [
                 '', 'Open vStorage - NSM/ABM debug information',
                 '=========================================',
                 'timestamp: {0}'.format(datetime.datetime.now()), ''
             ]
             alba_backends = sorted(AlbaBackendList.get_albabackends(),
                                    key=lambda k: k.name)
             for storagerouter in sorted(
                     StorageRouterList.get_storagerouters(),
                     key=lambda k: k.name):
                 if len([
                         service for service in storagerouter.services
                         if service.type.name in [
                             ServiceType.SERVICE_TYPES.NS_MGR,
                             ServiceType.SERVICE_TYPES.ALBA_MGR
                         ] and service.storagerouter == storagerouter
                 ]) == 0:
                     continue
                 output.append('+ {0} ({1})'.format(storagerouter.name,
                                                    storagerouter.ip))
                 for alba_backend in alba_backends:
                     is_internal = alba_backend.abm_cluster.abm_services[
                         0].service.is_internal
                     if is_internal is False:
                         output.append('    + ABM (externally managed)')
                     else:
                         abm_service = [
                             abm_service for abm_service in
                             alba_backend.abm_cluster.abm_services
                             if abm_service.service.storagerouter ==
                             storagerouter
                         ]
                         nsm_clusters = [
                             nsm_cluster
                             for nsm_cluster in alba_backend.nsm_clusters
                             for nsm_service in nsm_cluster.nsm_services
                             if nsm_service.service.storagerouter ==
                             storagerouter
                         ]
                         if len(abm_service) > 0 or len(nsm_clusters) > 0:
                             output.append('  + {0}'.format(
                                 alba_backend.name))
                             if len(abm_service) > 0:
                                 output.append(
                                     '    + ABM - port {0}'.format(
                                         abm_service[0].service.ports))
                         for nsm_cluster in sorted(nsm_clusters,
                                                   key=lambda k: k.number):
                             load = None
                             try:
                                 load = AlbaArakoonController.get_load(
                                     nsm_cluster)
                             except:
                                 pass  # Don't print load when Arakoon unreachable
                             load = 'infinite' if load == float(
                                 'inf') else '{0}%'.format(round(
                                     load,
                                     2)) if load is not None else 'unknown'
                             capacity = 'infinite' if float(
                                 nsm_cluster.capacity) < 0 else float(
                                     nsm_cluster.capacity)
                             for nsm_service in nsm_cluster.nsm_services:
                                 if nsm_service.service.storagerouter != storagerouter:
                                     continue
                                 if is_internal is True:
                                     output.append(
                                         '    + NSM {0} - port {1} - capacity: {2}, load: {3}'
                                         .format(nsm_cluster.number,
                                                 nsm_service.service.ports,
                                                 capacity, load))
                                 else:
                                     output.append(
                                         '    + NSM {0} (externally managed) - capacity: {1}, load: {2}'
                                         .format(nsm_cluster.number,
                                                 capacity, load))
             output += ['', 'Press ^C to exit', '']
             print '\x1b[2J\x1b[H' + '\n'.join(output)
             time.sleep(1)
     except KeyboardInterrupt:
         pass
 def check_nsm_load(cls, result_handler, max_load=None, use_total_capacity=False, total_capacity_warning=None, total_capacity_error=None):
     """
     Checks all NSM services registered within the Framework and will report their load
     :param result_handler: logging object
     :type result_handler: ovs.extensions.healthcheck.result.HCResults
     :param max_load: Maximum load percentage before marking it as overloaded. Defaults to ovs/framework/plugins/alba/config|nsm.maxload
     :type max_load: float
     :param use_total_capacity: Base NSM load of the total possible capacity (capacity of NSMs before they are marked as overloaded)
     instead of checking the least filled NSM. Use threshold arguments for tuning'
     :type use_total_capacity: bool
     :param total_capacity_warning: Number of remaining namespaces threshold before throwing a warning. Defaults 20% of the total namespaces
     :type total_capacity_warning: int
     :param total_capacity_error: Number of remaining namespaces threshold before throwing an error. Defaults to 5% of the total namespaces
     :type total_capacity_error: int
     :return: None
     :rtype: NoneType
     """
     max_nsm_load_config = Configuration.get('ovs/framework/plugins/alba/config|nsm.maxload')
     max_load = max_load or max_nsm_load_config
     for alba_backend in AlbaBackendList.get_albabackends():
         if alba_backend.abm_cluster is None:
             result_handler.failure('No ABM cluster found for ALBA Backend {0}'.format(alba_backend.name))
             continue
         if len(alba_backend.abm_cluster.abm_services) == 0:
             result_handler.failure('ALBA Backend {0} does not have any registered ABM services'.format(alba_backend.name))
             continue
         if len(alba_backend.nsm_clusters) == 0:
             result_handler.failure('ALBA Backend {0} does not have any registered NSM services'.format(alba_backend.name))
             continue
         internal = alba_backend.abm_cluster.abm_services[0].service.is_internal
         if use_total_capacity:
             maximum_capacity_before_overload = AlbaHealthCheck._get_nsm_max_capacity_before_overload(alba_backend, max_nsm_load_config)
             total_capacity_warning = total_capacity_warning or math.ceil(maximum_capacity_before_overload * 1.0/5)
             total_capacity_error = total_capacity_error or math.ceil(maximum_capacity_before_overload * 1.0/20)
             config = Configuration.get_configuration_path(key=alba_backend.abm_cluster.config_location)
             hosts_data = AlbaCLI.run(command='list-nsm-hosts', config=config)
             current_capacity = sum([host['namespaces_count'] for host in hosts_data if not host['lost']])
             remaining_capacity = maximum_capacity_before_overload - current_capacity
             if remaining_capacity > total_capacity_warning and remaining_capacity > total_capacity_error:  # Only error could be specified
                 result_handler.success('NSMs for backend {0} have enough capacity remaining ({1}/{2} used)'.format(alba_backend.name, current_capacity, maximum_capacity_before_overload),
                                        code=ErrorCodes.nsm_load_ok)
             elif total_capacity_warning >= remaining_capacity > total_capacity_error:
                 result_handler.warning('NSMs for backend {0} have reached the warning threshold '
                                        '({1} namespaces had to be remaining, {2}/{3} used)'.format(alba_backend.name, total_capacity_warning, current_capacity, maximum_capacity_before_overload),
                                        code=ErrorCodes.nsm_load_ok)
             else:
                 result_handler.failure('NSMs for backend {0} have reached the error threshold '
                                        '({1} namespaces had to be remaining, ({2}/{3} used)'.format(alba_backend.name, total_capacity_error, current_capacity, maximum_capacity_before_overload),
                                        code=ErrorCodes.nsm_load_ok)
         else:
             nsm_loads = {}
             sorted_nsm_clusters = sorted(alba_backend.nsm_clusters, key=lambda k: k.number)
             for nsm_cluster in sorted_nsm_clusters:
                 nsm_loads[nsm_cluster.number] = AlbaController.get_load(nsm_cluster)
             overloaded = min(nsm_loads.values()) >= max_load
             if overloaded is False:
                 result_handler.success('NSMs for backend {0} are not overloaded'.format(alba_backend.name),
                                        code=ErrorCodes.nsm_load_ok)
             else:
                 if internal is True:
                     result_handler.warning('NSMs for backend {0} are overloaded. The NSM checkup will take care of this'.format(alba_backend.name),
                                            code=ErrorCodes.nsm_load_warn)
                 else:
                     result_handler.failure('NSMs for backend {0} are overloaded. Please add your own NSM clusters to the backend'.format(alba_backend.name),
                                            code=ErrorCodes.nsm_load_failure)
    def _all_disks(self):
        """
        Returns a live list of all disks known to this AlbaBackend
        """
        from ovs.dal.lists.albanodelist import AlbaNodeList
        from ovs.dal.lists.albabackendlist import AlbaBackendList

        alba_backend_map = {}
        for a_backend in AlbaBackendList.get_albabackends():
            alba_backend_map[a_backend.alba_id] = a_backend
        node_disk_map = {}
        alba_nodes = AlbaNodeList.get_albanodes()
        for node in alba_nodes:
            node_disk_map[node.node_id] = []

        # Load OSDs
        config = 'etcd://127.0.0.1:2379/ovs/arakoon/{0}-abm/config'.format(self.backend.name)
        for found_osd in AlbaCLI.run('list-all-osds', config=config, as_json=True):
            node_id = found_osd['node_id']
            if node_id in node_disk_map:
                node_disk_map[node_id].append({'osd': found_osd})

        # Load all_disk information
        def _load_disks(_node, _list):
            for _disk in _node.all_disks:
                found = False
                for container in _list:
                    if 'osd' in container and container['osd']['long_id'] == _disk.get('asd_id'):
                        container['disk'] = _disk
                        found = True
                        break
                if found is False:
                    _list.append({'disk': _disk})
        threads = []
        for node in alba_nodes:
            thread = Thread(target=_load_disks, args=(node, node_disk_map[node.node_id]))
            thread.start()
            threads.append(thread)
        for thread in threads:
            thread.join()

        # Make mapping between node IDs and the relevant OSDs and disks
        def _process_disk(_info, _disks, _node):
            disk = _info.get('disk')
            if disk is None:
                return
            disk_status = 'uninitialized'
            disk_status_detail = ''
            disk_alba_backend_guid = ''
            if disk['available'] is False:
                osd = _info.get('osd')
                disk_alba_state = disk['state']['state']
                if disk_alba_state == 'ok':
                    if osd is None:
                        disk_status = 'initialized'
                    elif osd['id'] is None:
                        alba_id = osd['alba_id']
                        if alba_id is None:
                            disk_status = 'available'
                        else:
                            disk_status = 'unavailable'
                            alba_backend = alba_backend_map.get(alba_id)
                            if alba_backend is not None:
                                disk_alba_backend_guid = alba_backend.guid
                    else:
                        disk_status = 'error'
                        disk_status_detail = 'communicationerror'
                        disk_alba_backend_guid = self.guid

                        for asd in _node.asds:
                            if asd.asd_id == disk['asd_id'] and asd.statistics != {}:
                                disk_status = 'warning'
                                disk_status_detail = 'recenterrors'

                                read = osd['read'] or [0]
                                write = osd['write'] or [0]
                                errors = osd['errors']
                                global_interval_key = '/ovs/alba/backends/global_gui_error_interval'
                                backend_interval_key = '/ovs/alba/backends/{0}/gui_error_interval'.format(self.guid)
                                interval = EtcdConfiguration.get(global_interval_key)
                                if EtcdConfiguration.exists(backend_interval_key):
                                    interval = EtcdConfiguration.get(backend_interval_key)
                                if len(errors) == 0 or (len(read + write) > 0 and max(min(read), min(write)) > max(error[0] for error in errors) + interval):
                                    disk_status = 'claimed'
                                    disk_status_detail = ''
                elif disk_alba_state == 'decommissioned':
                    disk_status = 'unavailable'
                    disk_status_detail = 'decommissioned'
                else:
                    disk_status = 'error'
                    disk_status_detail = disk['state']['detail']
                    alba_backend = alba_backend_map.get(osd.get('alba_id'))
                    if alba_backend is not None:
                        disk_alba_backend_guid = alba_backend.guid
            disk['status'] = disk_status
            disk['status_detail'] = disk_status_detail
            disk['alba_backend_guid'] = disk_alba_backend_guid
            _disks.append(disk)

        def _worker(_queue, _disks):
            while True:
                try:
                    item = _queue.get(False)
                    _process_disk(item['info'], _disks, item['node'])
                except Empty:
                    return

        queue = Queue()
        for node in alba_nodes:
            for info in node_disk_map[node.node_id]:
                queue.put({'info': info,
                           'node': node})
        disks = []
        threads = []
        for i in range(5):
            thread = Thread(target=_worker, args=(queue, disks))
            thread.start()
            threads.append(thread)
        for thread in threads:
            thread.join()
        return disks
Example #24
0
    def migrate():
        """
        Executes async migrations. It doesn't matter too much when they are executed, as long as they get eventually
        executed. This code will typically contain:
        * "dangerous" migration code (it needs certain running services)
        * Migration code depending on a cluster-wide state
        * ...
        """
        AlbaMigrationController._logger.info(
            'Preparing out of band migrations...')

        from ovs.dal.hybrids.diskpartition import DiskPartition
        from ovs.dal.lists.albabackendlist import AlbaBackendList
        from ovs.dal.lists.albanodelist import AlbaNodeList
        from ovs.dal.lists.albaosdlist import AlbaOSDList
        from ovs.dal.lists.storagerouterlist import StorageRouterList
        from ovs.extensions.generic.configuration import Configuration
        from ovs.extensions.generic.sshclient import SSHClient, UnableToConnectException
        from ovs.extensions.migration.migration.albamigrator import ExtensionMigrator
        from ovs.extensions.packages.albapackagefactory import PackageFactory
        from ovs.extensions.services.albaservicefactory import ServiceFactory
        from ovs.extensions.plugins.albacli import AlbaCLI, AlbaError
        from ovs.lib.alba import AlbaController
        from ovs.lib.disk import DiskController

        AlbaMigrationController._logger.info('Start out of band migrations...')

        #############################################
        # Introduction of IP:port combination on OSDs
        osd_info_map = {}
        alba_backends = AlbaBackendList.get_albabackends()
        for alba_backend in alba_backends:
            AlbaMigrationController._logger.info(
                'Verifying ALBA Backend {0}'.format(alba_backend.name))
            if alba_backend.abm_cluster is None:
                AlbaMigrationController._logger.warning(
                    'ALBA Backend {0} does not have an ABM cluster registered'.
                    format(alba_backend.name))
                continue

            AlbaMigrationController._logger.debug(
                'Retrieving configuration path for ALBA Backend {0}'.format(
                    alba_backend.name))
            try:
                config = Configuration.get_configuration_path(
                    alba_backend.abm_cluster.config_location)
            except:
                AlbaMigrationController._logger.exception(
                    'Failed to retrieve the configuration path for ALBA Backend {0}'
                    .format(alba_backend.name))
                continue

            AlbaMigrationController._logger.info(
                'Retrieving OSD information for ALBA Backend {0}'.format(
                    alba_backend.name))
            try:
                osd_info = AlbaCLI.run(command='list-all-osds', config=config)
            except (AlbaError, RuntimeError):
                AlbaMigrationController._logger.exception(
                    'Failed to retrieve OSD information for ALBA Backend {0}'.
                    format(alba_backend.name))
                continue

            for osd_info in osd_info:
                if osd_info.get('long_id'):
                    osd_info_map[osd_info['long_id']] = {
                        'ips': osd_info.get('ips', []),
                        'port': osd_info.get('port')
                    }

        for osd in AlbaOSDList.get_albaosds():
            if osd.osd_id not in osd_info_map:
                AlbaMigrationController._logger.warning(
                    'OSD with ID {0} is modelled but could not be found through ALBA'
                    .format(osd.osd_id))
                continue

            ips = osd_info_map[osd.osd_id]['ips']
            port = osd_info_map[osd.osd_id]['port']
            changes = False
            if osd.ips is None:
                changes = True
                osd.ips = ips
            if osd.port is None:
                changes = True
                osd.port = port
            if changes is True:
                AlbaMigrationController._logger.info(
                    'Updating OSD with ID {0} with IPS {1} and port {2}'.
                    format(osd.osd_id, ips, port))
                osd.save()

        ###################################################
        # Read preference for GLOBAL ALBA Backends (1.10.3)  (https://github.com/openvstorage/framework-alba-plugin/issues/452)
        if Configuration.get(key='/ovs/framework/migration|read_preference',
                             default=False) is False:
            try:
                name_backend_map = dict((alba_backend.name, alba_backend)
                                        for alba_backend in alba_backends)
                for alba_node in AlbaNodeList.get_albanodes():
                    AlbaMigrationController._logger.info(
                        'Processing maintenance services running on ALBA Node {0} with ID {1}'
                        .format(alba_node.ip, alba_node.node_id))
                    alba_node.invalidate_dynamics('maintenance_services')
                    for alba_backend_name, services in alba_node.maintenance_services.iteritems(
                    ):
                        if alba_backend_name not in name_backend_map:
                            AlbaMigrationController._logger.error(
                                'ALBA Node {0} has services for an ALBA Backend {1} which is not modelled'
                                .format(alba_node.ip, alba_backend_name))
                            continue

                        alba_backend = name_backend_map[alba_backend_name]
                        AlbaMigrationController._logger.info(
                            'Processing {0} ALBA Backend {1} with GUID {2}'.
                            format(alba_backend.scaling, alba_backend.name,
                                   alba_backend.guid))
                        if alba_backend.scaling == alba_backend.SCALINGS.LOCAL:
                            read_preferences = [alba_node.node_id]
                        else:
                            read_preferences = AlbaController.get_read_preferences_for_global_backend(
                                alba_backend=alba_backend,
                                alba_node_id=alba_node.node_id,
                                read_preferences=[])

                        for service_name, _ in services:
                            AlbaMigrationController._logger.info(
                                'Processing service {0}'.format(service_name))
                            old_config_key = '/ovs/alba/backends/{0}/maintenance/config'.format(
                                alba_backend.guid)
                            new_config_key = '/ovs/alba/backends/{0}/maintenance/{1}/config'.format(
                                alba_backend.guid, service_name)
                            if Configuration.exists(key=old_config_key):
                                new_config = Configuration.get(
                                    key=old_config_key)
                                new_config[
                                    'read_preference'] = read_preferences
                                Configuration.set(key=new_config_key,
                                                  value=new_config)
                for alba_backend in alba_backends:
                    Configuration.delete(
                        key='/ovs/alba/backends/{0}/maintenance/config'.format(
                            alba_backend.guid))
                AlbaController.checkup_maintenance_agents.delay()

                Configuration.set(
                    key='/ovs/framework/migration|read_preference', value=True)
            except Exception:
                AlbaMigrationController._logger.exception(
                    'Updating read preferences for ALBA Backends failed')

        #######################################################
        # Storing actual package name in version files (1.11.0) (https://github.com/openvstorage/framework/issues/1876)
        changed_clients = set()
        storagerouters = StorageRouterList.get_storagerouters()
        if Configuration.get(
                key=
                '/ovs/framework/migration|actual_package_name_in_version_file_alba',
                default=False) is False:
            try:
                service_manager = ServiceFactory.get_manager()
                alba_pkg_name, alba_version_cmd = PackageFactory.get_package_and_version_cmd_for(
                    component=PackageFactory.COMP_ALBA)
                for storagerouter in storagerouters:
                    try:
                        root_client = SSHClient(
                            endpoint=storagerouter.ip, username='******'
                        )  # Use '.ip' instead of StorageRouter object because this code is executed during post-update at which point the heartbeat has not been updated for some time
                    except UnableToConnectException:
                        AlbaMigrationController._logger.exception(
                            'Updating actual package name for version files failed on StorageRouter {0}'
                            .format(storagerouter.ip))
                        continue

                    for file_name in root_client.file_list(
                            directory=ServiceFactory.RUN_FILE_DIR):
                        if not file_name.endswith('.version'):
                            continue
                        file_path = '{0}/{1}'.format(
                            ServiceFactory.RUN_FILE_DIR, file_name)
                        contents = root_client.file_read(filename=file_path)
                        if alba_pkg_name == PackageFactory.PKG_ALBA_EE and '{0}='.format(
                                PackageFactory.PKG_ALBA) in contents:
                            # Rewrite the version file in the RUN_FILE_DIR
                            contents = contents.replace(
                                PackageFactory.PKG_ALBA,
                                PackageFactory.PKG_ALBA_EE)
                            root_client.file_write(filename=file_path,
                                                   contents=contents)

                            # Regenerate the service and update the EXTRA_VERSION_CMD in the configuration management
                            service_name = file_name.split('.')[0]
                            service_config_key = ServiceFactory.SERVICE_CONFIG_KEY.format(
                                storagerouter.machine_id, service_name)
                            if Configuration.exists(key=service_config_key):
                                service_config = Configuration.get(
                                    key=service_config_key)
                                if 'EXTRA_VERSION_CMD' in service_config:
                                    service_config[
                                        'EXTRA_VERSION_CMD'] = '{0}=`{1}`'.format(
                                            alba_pkg_name, alba_version_cmd)
                                    Configuration.set(key=service_config_key,
                                                      value=service_config)
                                    service_manager.regenerate_service(
                                        name='ovs-arakoon',
                                        client=root_client,
                                        target_name='ovs-{0}'.format(
                                            service_name)
                                    )  # Leave out .version
                                    changed_clients.add(root_client)
                Configuration.set(
                    key=
                    '/ovs/framework/migration|actual_package_name_in_version_file_alba',
                    value=True)
            except Exception:
                AlbaMigrationController._logger.exception(
                    'Updating actual package name for version files failed')

        for root_client in changed_clients:
            try:
                root_client.run(['systemctl', 'daemon-reload'])
            except Exception:
                AlbaMigrationController._logger.exception(
                    'Executing command "systemctl daemon-reload" failed')

        ####################################
        # Fix for migration version (1.11.0)
        # Previous code could potentially store a higher version number in the config management than the actual version number
        if Configuration.get(
                key='/ovs/framework/migration|alba_migration_version_fix',
                default=False) is False:
            try:
                for storagerouter in storagerouters:
                    config_key = '/ovs/framework/hosts/{0}/versions'.format(
                        storagerouter.machine_id)
                    if Configuration.exists(key=config_key):
                        versions = Configuration.get(key=config_key)
                        if versions.get(PackageFactory.COMP_MIGRATION_ALBA,
                                        0) > ExtensionMigrator.THIS_VERSION:
                            versions[
                                PackageFactory.
                                COMP_MIGRATION_ALBA] = ExtensionMigrator.THIS_VERSION
                            Configuration.set(key=config_key, value=versions)
                Configuration.set(
                    key='/ovs/framework/migration|alba_migration_version_fix',
                    value=True)
            except Exception:
                AlbaMigrationController._logger.exception(
                    'Updating migration version failed')

        ####################################
        # Enable auto-cleanup
        migration_auto_cleanup_key = '/ovs/framework/migration|alba_auto_cleanup'
        if Configuration.get(key=migration_auto_cleanup_key,
                             default=False) is False:
            try:
                for storagerouter in StorageRouterList.get_storagerouters():
                    storagerouter.invalidate_dynamics(
                        'features')  # New feature was added
                errors = []
                for alba_backend in AlbaBackendList.get_albabackends():
                    try:
                        AlbaController.set_auto_cleanup(alba_backend.guid)
                    except Exception as ex:
                        AlbaMigrationController._logger.exception(
                            'Failed to set the auto-cleanup for ALBA Backend {0}'
                            .format(alba_backend.name))
                        errors.append(ex)
                if len(errors) == 0:
                    Configuration.set(key=migration_auto_cleanup_key,
                                      value=True)
            except Exception:
                AlbaMigrationController._logger.exception(
                    'Updating auto cleanup failed')

        ####################################
        # Change cache eviction
        migration_random_eviction_key = '/ovs/framework/migration|alba_cache_eviction_random'
        if Configuration.get(key=migration_random_eviction_key,
                             default=False) is False:
            try:
                errors = []
                for alba_backend in AlbaBackendList.get_albabackends():
                    try:
                        AlbaController.set_cache_eviction(alba_backend.guid)
                    except Exception as ex:
                        AlbaMigrationController._logger.exception(
                            'Failed to set the auto-cleanup for ALBA Backend {0}'
                            .format(alba_backend.name))
                        errors.append(ex)
                if len(errors) == 0:
                    Configuration.set(key=migration_random_eviction_key,
                                      value=True)
            except Exception:
                AlbaMigrationController._logger.exception(
                    'Updating auto cleanup failed')

        ###################################################
        # Sync all disks and apply the backend role. Backend role was removed with the AD (since 1.10)
        albanode_backend_role_sync_key = '/ovs/framework/migration|albanode_backend_role_sync'
        if not Configuration.get(key=albanode_backend_role_sync_key,
                                 default=False):
            try:
                errors = []
                for alba_node in AlbaNodeList.get_albanodes():
                    try:
                        if not alba_node.storagerouter:
                            continue
                        stack = alba_node.client.get_stack()  # type: dict
                        for slot_id, slot_information in stack.iteritems():
                            osds = slot_information.get('osds',
                                                        {})  # type: dict
                            slot_aliases = slot_information.get(
                                'aliases', [])  # type: list
                            if not osds:  # No osds means no partition was made
                                continue
                            # Sync to add all potential partitions that will need a backend role
                            DiskController.sync_with_reality(
                                storagerouter_guid=alba_node.storagerouter_guid
                            )
                            for disk in alba_node.storagerouter.disks:
                                if set(disk.aliases).intersection(
                                        set(slot_aliases)):
                                    partition = disk.partitions[0]
                                    if DiskPartition.ROLES.BACKEND not in partition.roles:
                                        partition.roles.append(
                                            DiskPartition.ROLES.BACKEND)
                                        partition.save()
                    except Exception as ex:
                        AlbaMigrationController._logger.exception(
                            'Syncing for storagerouter/albanode {0} failed'.
                            format(alba_node.storagerouter.ip))
                        errors.append(ex)
                if not errors:
                    Configuration.set(key=albanode_backend_role_sync_key,
                                      value=True)
            except Exception:
                AlbaMigrationController._logger.exception(
                    'Syncing up the disks for backend roles failed')

        AlbaMigrationController._logger.info('Finished out of band migrations')
    def nsm_checkup(alba_backend_guid=None,
                    min_internal_nsms=1,
                    external_nsm_cluster_names=None):
        # type: (Optional[str], Optional[int], Optional[List[str]]) -> None
        """
        Validates the current NSM setup/configuration and takes actions where required.
        Assumptions:
        * A 2 node NSM is considered safer than a 1 node NSM.
        * When adding an NSM, the nodes with the least amount of NSM participation are preferred

        :param alba_backend_guid: Run for a specific ALBA Backend
        :type alba_backend_guid: str
        :param min_internal_nsms: Minimum amount of NSM hosts that need to be provided
        :type min_internal_nsms: int
        :param external_nsm_cluster_names: Information about the additional clusters to claim (only for externally managed Arakoon clusters)
        :type external_nsm_cluster_names: list
        :return: None
        :rtype: NoneType
        """
        ###############
        # Validations #
        ###############
        if external_nsm_cluster_names is None:
            external_nsm_cluster_names = []
        AlbaArakoonController._logger.info('NSM checkup started')
        if min_internal_nsms < 1:
            raise ValueError(
                'Minimum amount of NSM clusters must be 1 or more')

        if not isinstance(external_nsm_cluster_names, list):
            raise ValueError(
                "'external_nsm_cluster_names' must be of type 'list'")

        if len(external_nsm_cluster_names) > 0:
            if alba_backend_guid is None:
                raise ValueError(
                    'Additional NSMs can only be configured for a specific ALBA Backend'
                )
            if min_internal_nsms > 1:
                raise ValueError(
                    "'min_internal_nsms' and 'external_nsm_cluster_names' are mutually exclusive"
                )

            external_nsm_cluster_names = list(set(
                external_nsm_cluster_names))  # Remove duplicate cluster names
            for cluster_name in external_nsm_cluster_names:
                try:
                    ArakoonInstaller.get_arakoon_metadata_by_cluster_name(
                        cluster_name=cluster_name)
                except NotFoundException:
                    raise ValueError(
                        'Arakoon cluster with name {0} does not exist'.format(
                            cluster_name))

        if alba_backend_guid is None:
            alba_backends = [
                alba_backend
                for alba_backend in AlbaBackendList.get_albabackends()
                if alba_backend.backend.status == 'RUNNING'
            ]
        else:
            alba_backends = [AlbaBackend(alba_backend_guid)]

        masters = StorageRouterList.get_masters()
        storagerouters = set()
        for alba_backend in alba_backends:
            if alba_backend.abm_cluster is None:
                raise ValueError(
                    'No ABM cluster found for ALBA Backend {0}'.format(
                        alba_backend.name))
            if len(alba_backend.abm_cluster.abm_services) == 0:
                raise ValueError(
                    'ALBA Backend {0} does not have any registered ABM services'
                    .format(alba_backend.name))
            if len(alba_backend.nsm_clusters) + len(
                    external_nsm_cluster_names) > MAX_NSM_AMOUNT:
                raise ValueError(
                    'The maximum of {0} NSM Arakoon clusters will be exceeded. Amount of clusters that can be deployed for this ALBA Backend: {1}'
                    .format(MAX_NSM_AMOUNT,
                            MAX_NSM_AMOUNT - len(alba_backend.nsm_clusters)))
            # Validate enough externally managed Arakoon clusters are available
            if alba_backend.abm_cluster.abm_services[
                    0].service.is_internal is False:
                unused_cluster_names = set([
                    cluster_info['cluster_name'] for cluster_info in
                    ArakoonInstaller.get_unused_arakoon_clusters(
                        cluster_type=ServiceType.ARAKOON_CLUSTER_TYPES.NSM)
                ])
                if set(external_nsm_cluster_names).difference(
                        unused_cluster_names):
                    raise ValueError(
                        'Some of the provided cluster_names have already been claimed before'
                    )
                storagerouters.update(
                    set(masters)
                )  # For externally managed we need an available master node
            else:
                for abm_service in alba_backend.abm_cluster.abm_services:  # For internally managed we need all StorageRouters online
                    storagerouters.add(abm_service.service.storagerouter)
                for nsm_cluster in alba_backend.nsm_clusters:  # For internally managed we need all StorageRouters online
                    for nsm_service in nsm_cluster.nsm_services:
                        storagerouters.add(nsm_service.service.storagerouter)

        ssh_clients = {}
        for storagerouter in storagerouters:
            try:
                ssh_clients[storagerouter] = SSHClient(endpoint=storagerouter)
            except UnableToConnectException:
                raise RuntimeError(
                    'StorageRouter {0} with IP {1} is not reachable'.format(
                        storagerouter.name, storagerouter.ip))

        version_str = AlbaArakoonInstaller.get_alba_version_string()
        nsm_installer = NSMInstaller(version_str=version_str,
                                     ssh_clients=ssh_clients)

        ##################
        # Check Clusters #
        ##################
        safety = Configuration.get(
            '/ovs/framework/plugins/alba/config|nsm.safety')
        maxload = Configuration.get(
            '/ovs/framework/plugins/alba/config|nsm.maxload')

        AlbaArakoonController._logger.debug(
            'NSM safety is configured at: {0}'.format(safety))
        AlbaArakoonController._logger.debug(
            'NSM max load is configured at: {0}'.format(maxload))

        master_client = None
        failed_backends = []
        for alba_backend in alba_backends:
            try:
                # Gather information
                AlbaArakoonController._logger.info(
                    'ALBA Backend {0} - Ensuring NSM safety'.format(
                        alba_backend.name))

                internal = AlbaArakoonInstaller.is_internally_managed(
                    alba_backend)
                nsm_loads = AlbaArakoonController.get_nsm_loads(alba_backend)
                nsm_storagerouters = AlbaArakoonController.get_nsms_per_storagerouter(
                    alba_backend)
                sorted_nsm_clusters = sorted(alba_backend.nsm_clusters,
                                             key=lambda k: k.number)

                if not internal and len(external_nsm_cluster_names) > 0:
                    for sr, cl in ssh_clients.iteritems():
                        if sr.node_type == 'MASTER':
                            master_client = cl
                            break
                    if master_client is None:
                        # Internal is False and we specified the NSM clusters to claim, but no MASTER nodes online
                        raise ValueError(
                            'Could not find an online master node')

                AlbaArakoonController._logger.debug(
                    'ALBA Backend {0} - Arakoon clusters are {1} managed'.
                    format(alba_backend.name,
                           'internally' if internal is True else 'externally'))
                for nsm_number, nsm_load in nsm_loads.iteritems():
                    AlbaArakoonController._logger.debug(
                        'ALBA Backend {0} - NSM Cluster {1} - Load {2}'.format(
                            alba_backend.name, nsm_number, nsm_load))
                for sr, count in nsm_storagerouters.iteritems():
                    AlbaArakoonController._logger.debug(
                        'ALBA Backend {0} - StorageRouter {1} - NSM Services {2}'
                        .format(alba_backend.name, sr.name, count))

                if internal:
                    # Extend existing NSM clusters if safety not met
                    for nsm_cluster in sorted_nsm_clusters:
                        AlbaArakoonController._logger.debug(
                            'ALBA Backend {0} - Processing NSM {1} - Expected safety {2} - Current safety {3}'
                            .format(alba_backend.name, nsm_cluster.number,
                                    safety, len(nsm_cluster.nsm_services)))
                        AlbaArakoonController.ensure_nsm_cluster_safety(
                            nsm_cluster,
                            nsm_storagerouters,
                            nsm_installer=nsm_installer)
                AlbaArakoonController.ensure_nsm_clusters_load(
                    alba_backend,
                    nsms_per_storagerouter=nsm_storagerouters,
                    ssh_clients=ssh_clients,
                    version_str=version_str,
                    min_internal_nsms=min_internal_nsms,
                    external_nsm_cluster_names=external_nsm_cluster_names)
            except Exception:
                AlbaArakoonController._logger.exception(
                    'NSM Checkup failed for Backend {0}'.format(
                        alba_backend.name))
                failed_backends.append(alba_backend.name)
    def get_disk_safety():
        """
        Send disk safety for each vpool and the amount of namespaces with the lowest disk safety
        """
        points = []
        abms = []

        for service in ServiceList.get_services():
            if service.type.name == ServiceType.SERVICE_TYPES.ALBA_MGR:
                abms.append(service.name)

        abms = list(set(abms))
        abl = AlbaBackendList.get_albabackends()
        for ab in abl:
            service_name = Service(ab.abm_services[0].service_guid).name
            if service_name not in abms:
                continue

            config = "etcd://127.0.0.1:2379/ovs/arakoon/{}/config".format(service_name)

            try:
                disk_safety = AlbaCLI.run('get-disk-safety', config=config, to_json=True)
            except Exception as ex:
                StatsmonkeyScheduledTaskController._logger.error('{0}: {1}'.format(service_name, ex.message))
                continue

            presets = ab.presets
            used_preset = None
            for preset in presets:
                try:
                    policies = preset['policy_metadata']
                    for policy in policies:
                        if policies[policy]['is_active'] and policies[policy]['in_use']:
                            used_preset = policy

                    if used_preset is not None:
                        used_preset = json.loads(used_preset.replace('(', '[').replace(')', ']'))
                        max_disk_safety = used_preset[1]

                        safety = {
                            'measurement': 'disk_safety',
                            'tags': {
                                'backend_name': ab.name,
                                'max_disk_safety': max_disk_safety,
                                'min_disk_safety': max_disk_safety
                            },
                            'fields': {
                                'amount_max_disk_safety': 0,
                                'amount_between_disk_safety': 0,
                                'amount_min_disk_safety': 0
                            }
                        }
                        stats = {}
                        for disk in disk_safety:
                            if disk['safety'] is not None:
                                if disk['safety'] not in stats:
                                    stats[disk['safety']] = 0
                                stats[disk['safety']] += 1
                        min_disk_safety = min(stats.keys())
                        safety['tags']['min_disk_safety'] = min_disk_safety

                        for stat in stats:
                            if stat == max_disk_safety:
                                safety['fields']['amount_max_disk_safety'] = stats[stat]
                            elif stat == min_disk_safety:
                                safety['fields']['amount_min_disk_safety'] = stats[stat]
                            else:
                                safety['fields']['amount_between_disk_safety'] += stats[stat]

                        points.append(safety)
                except Exception as ex:
                    StatsmonkeyScheduledTaskController._logger.error(ex.message)

        if len(points) == 0:
            StatsmonkeyScheduledTaskController._logger.info("No statistics found")
            return

        StatsmonkeyScheduledTaskController._send_stats(points)
        return points
Example #27
0
    def get_stats_vdisks(cls):
        """
        Retrieve statistics about all vDisks on the system.
        Check the safety, storage amount on the Backend, fail-over status and others
        """
        if cls._config is None:
            cls.validate_and_retrieve_config()

        stats = []
        errors = False
        environment = cls._config['environment']
        alba_backend_info = {}
        for alba_backend in AlbaBackendList.get_albabackends():
            config_path = Configuration.get_configuration_path(
                alba_backend.abm_cluster.config_location)
            disk_safety = {}
            namespace_usage = {}

            # Retrieve namespace, preset and disk safety information
            try:
                preset_info = AlbaCLI.run(
                    command='list-presets', config=config_path
                )  # Not using alba_backend.presets, because it takes a whole lot longer to retrieve
                all_namespace_info = AlbaCLI.run(command='show-namespaces',
                                                 config=config_path,
                                                 extra_params=['--max=-1'])[1]
                all_disk_safety_info = AlbaCLI.run(command='get-disk-safety',
                                                   config=config_path)
            except Exception:
                errors = True
                cls._logger.exception(
                    'Retrieving information for ALBA Backend {0} failed'.
                    format(alba_backend.name))
                continue

            alba_backend_info[alba_backend.guid] = {
                'disk_safety': disk_safety,
                'namespace_usage': namespace_usage
            }

            # Parse namespace information
            for namespace_info in all_namespace_info:
                namespace_usage[namespace_info['name']] = float(
                    namespace_info['statistics']['storage'])

            # Parse preset information
            policies = []
            preset_name = None
            for preset in preset_info:
                if preset['in_use'] is not True:
                    continue
                preset_name = preset['name']
                policies.extend(preset['policies'])
            if preset_name is None:
                continue

            # Parse disk safety information
            total_objects = 0
            max_lost_disks = 0
            max_disk_safety = 0
            bucket_overview = {}
            disk_lost_overview = {}
            disk_safety_overview = {}
            for disk_safety_info in all_disk_safety_info:
                safety = disk_safety_info['safety']
                volume_id = disk_safety_info['namespace']
                disk_safety[volume_id] = float(
                    safety) if safety is not None else safety

                for bucket_safety in disk_safety_info['bucket_safety']:
                    bucket = bucket_safety['bucket']
                    objects = bucket_safety['count']
                    remaining_safety = bucket_safety['remaining_safety']

                    if bucket[1] > max_lost_disks:
                        max_lost_disks = bucket[1]
                    if remaining_safety > max_disk_safety:
                        max_disk_safety = remaining_safety

                    for policy in policies:
                        k = policy[0] == bucket[0]
                        m = policy[1] == bucket[1]
                        c = policy[2] <= bucket[2]
                        x = policy[3] >= bucket[3]
                        if k and m and c and x:
                            if preset_name not in bucket_overview:
                                bucket_overview[preset_name] = {
                                    'policy': str(policy),
                                    'presets': {}
                                }

                    bucket[2] -= bucket_safety['applicable_dead_osds']
                    if str(bucket
                           ) not in bucket_overview[preset_name]['presets']:
                        bucket_overview[preset_name]['presets'][str(
                            bucket)] = {
                                'objects': 0,
                                'disk_safety': 0
                            }

                    disk_lost = bucket[0] + bucket[1] - bucket[
                        2]  # Data fragments + parity fragments - amount of fragments to write + dead osds
                    if disk_lost not in disk_lost_overview:
                        disk_lost_overview[disk_lost] = 0
                    if remaining_safety not in disk_safety_overview:
                        disk_safety_overview[remaining_safety] = 0

                    total_objects += objects
                    disk_lost_overview[disk_lost] += objects
                    disk_safety_overview[remaining_safety] += objects
                    bucket_overview[preset_name]['presets'][str(
                        bucket)]['objects'] += objects
                    bucket_overview[preset_name]['presets'][str(
                        bucket)]['disk_safety'] = remaining_safety

            # Create statistics regarding disk safety
            for disk_lost_number in xrange(max_lost_disks + 1):
                stats.append({
                    'tags': {
                        'disk_lost': disk_lost_number,
                        'environment': environment,
                        'backend_name': alba_backend.name
                    },
                    'fields': {
                        'objects': disk_lost_overview.get(disk_lost_number, 0),
                        'total_objects': total_objects
                    },
                    'measurement': 'disk_lost'
                })

            for disk_safety_number in xrange(max_disk_safety + 1):
                stats.append({
                    'tags': {
                        'disk_safety': disk_safety_number,
                        'environment': environment,
                        'backend_name': alba_backend.name
                    },
                    'fields': {
                        'objects':
                        disk_safety_overview.get(disk_safety_number, 0),
                        'total_objects': total_objects
                    },
                    'measurement': 'disk_safety'
                })

            for preset_name, result in bucket_overview.iteritems():
                for bucket_count, bucket_result in result['presets'].iteritems(
                ):
                    stats.append({
                        'tags': {
                            'bucket': bucket_count,
                            'policy': result['policy'],
                            'preset_name': preset_name,
                            'environment': environment,
                            'disk_safety': bucket_result['disk_safety'],
                            'backend_name': alba_backend.name
                        },
                        'fields': {
                            'objects': bucket_result['objects'],
                            'total_objects': total_objects
                        },
                        'measurement': 'bucket'
                    })

        # Integrate namespace and disk safety information in vPool stats
        for vpool in VPoolList.get_vpools():
            alba_backend_guid = vpool.metadata['backend']['backend_info'][
                'alba_backend_guid']
            for vdisk in vpool.vdisks:
                try:
                    metrics = cls._convert_to_float_values(
                        cls._pop_realtime_info(vdisk.statistics))
                    metrics['failover_mode'] = vdisk.dtl_status
                    metrics['frontend_size'] = float(vdisk.size)
                    metrics['failover_mode_status'] = cls._FAILOVER_MAP.get(
                        vdisk.dtl_status, 3)
                    if alba_backend_guid in alba_backend_info:
                        metrics['disk_safety'] = alba_backend_info[
                            alba_backend_guid]['disk_safety'].get(
                                vdisk.volume_id)
                        metrics['backend_stored'] = alba_backend_info[
                            alba_backend_guid]['namespace_usage'].get(
                                vdisk.volume_id)

                    stats.append({
                        'tags': {
                            'disk_name':
                            vdisk.name,
                            'volume_id':
                            vdisk.volume_id,
                            'vpool_name':
                            vdisk.vpool.name,
                            'environment':
                            environment,
                            'storagerouter_name':
                            StorageRouter(vdisk.storagerouter_guid).name
                        },
                        'fields': metrics,
                        'measurement': 'vdisk'
                    })
                except Exception:
                    errors = True
                    cls._logger.exception(
                        'Retrieving statistics for vDisk {0} with guid {1} failed'
                        .format(vdisk.name, vdisk.guid))
        return errors, stats
Example #28
0
    def _bootstrap_dal_models(self):
        """
        Load/hook dal models as snmp oids
        """
        _guids = set()

        enabled_key = "{0}_config_dal_enabled".format(STORAGE_PREFIX)
        self.instance_oid = 0
        try:
            enabled = self.persistent.get(enabled_key)
        except KeyNotFoundException:
            enabled = True  # Enabled by default, can be disabled by setting the key
        if enabled:
            from ovs.dal.lists.vdisklist import VDiskList
            from ovs.dal.lists.storagerouterlist import StorageRouterList
            from ovs.dal.lists.pmachinelist import PMachineList
            from ovs.dal.lists.vmachinelist import VMachineList
            from ovs.dal.lists.vpoollist import VPoolList
            from ovs.dal.lists.storagedriverlist import StorageDriverList

            for storagerouter in StorageRouterList.get_storagerouters():
                _guids.add(storagerouter.guid)
                if not self._check_added(storagerouter):
                    self._register_dal_model(10, storagerouter, 'guid', "0")
                    self._register_dal_model(10, storagerouter, 'name', "1")
                    self._register_dal_model(10, storagerouter, 'pmachine', "3", key = 'host_status')
                    self._register_dal_model(10, storagerouter, 'description', "4")
                    self._register_dal_model(10, storagerouter, 'devicename', "5")
                    self._register_dal_model(10, storagerouter, 'dtl_mode', "6")
                    self._register_dal_model(10, storagerouter, 'ip', "8")
                    self._register_dal_model(10, storagerouter, 'machineid', "9")
                    self._register_dal_model(10, storagerouter, 'status', "10")
                    self._register_dal_model(10, storagerouter, '#vdisks', "11",
                                             func = lambda storagerouter: len([vdisk for vpool_vdisks in [storagedriver.vpool.vdisks for storagedriver in storagerouter.storagedrivers] for vdisk in vpool_vdisks if vdisk.storagedriver_id == storagedriver.storagedriver_id]),
                                             atype = int)
                    self._register_dal_model(10, storagerouter, '#vmachines', "12",
                                             func = lambda storagerouter: len(set([vdisk.vmachine.guid for vpool_vdisks in [storagedriver.vpool.vdisks for storagedriver in storagerouter.storagedrivers] for vdisk in vpool_vdisks if vdisk.storagedriver_id == storagedriver.storagedriver_id])),
                                             atype = int)
                    self._register_dal_model(10, storagerouter, '#stored_data', "13",
                                             func = lambda storagerouter: sum([vdisk.vmachine.stored_data for vpool_vdisks in [storagedriver.vpool.vdisks for storagedriver in storagerouter.storagedrivers] for vdisk in vpool_vdisks if vdisk.storagedriver_id == storagedriver.storagedriver_id]),
                                             atype = int)
                    self.instance_oid += 1

            for vm in VMachineList.get_vmachines():
                _guids.add(vm.guid)
                if not self._check_added(vm):
                    if vm.is_vtemplate:
                        self._register_dal_model(11, vm, 'guid', "0")
                        self._register_dal_model(11, vm, 'name', "1")

                        def _children(vmt):
                            children = 0
                            disks = [vd.guid for vd in vmt.vdisks]
                            for vdisk in [vdisk.parent_vdisk_guid for item in [vm.vdisks for vm in VMachineList.get_vmachines() if not vm.is_vtemplate] for vdisk in item]:
                                for disk in disks:
                                    if vdisk == disk:
                                        children += 1
                            return children
                        self._register_dal_model(11, vm, '#children', 2, func = _children, atype = int)
                        self.instance_oid += 1

            for vm in VMachineList.get_vmachines():
                _guids.add(vm.guid)
                if not self._check_added(vm):
                    if not vm.is_vtemplate:
                        self._register_dal_model(0, vm, 'guid', "0")
                        self._register_dal_model(0, vm, 'name', "1")
                        self._register_dal_model(0, vm, 'statistics', "2.0", key = "operations", atype = int)
                        self._register_dal_model(0, vm, 'statistics', "2.1", key = "cluster_cache_misses_ps", atype = int)
                        self._register_dal_model(0, vm, 'statistics', "2.2", key = "data_read", atype = int)
                        self._register_dal_model(0, vm, 'statistics', "2.3", key = "sco_cache_misses", atype = int)
                        self._register_dal_model(0, vm, 'statistics', "2.4", key = "sco_cache_hits_ps", atype = int)
                        self._register_dal_model(0, vm, 'statistics', "2.5", key = "sco_cache_hits", atype = int)
                        self._register_dal_model(0, vm, 'statistics', "2.6", key = "write_operations", atype = int)
                        self._register_dal_model(0, vm, 'statistics', "2.7", key = "cluster_cache_misses", atype = int)
                        self._register_dal_model(0, vm, 'statistics', "2.8", key = "read_operations_ps", atype = int)
                        self._register_dal_model(0, vm, 'statistics', "2.9", key = "sco_cache_misses_ps", atype = int)
                        self._register_dal_model(0, vm, 'statistics', "2.10", key = "backend_write_operations", atype = int)
                        self._register_dal_model(0, vm, 'statistics', "2.11", key = "backend_data_read", atype = int)
                        self._register_dal_model(0, vm, 'statistics', "2.12", key = "cache_hits", atype = int)
                        self._register_dal_model(0, vm, 'statistics', "2.13", key = "backend_write_operations_ps", atype = int)
                        self._register_dal_model(0, vm, 'statistics', "2.14", key = "metadata_store_hits_ps", atype = int)
                        self._register_dal_model(0, vm, 'statistics', "2.15", key = "metadata_store_misses", atype = int)
                        self._register_dal_model(0, vm, 'statistics', "2.16", key = "backend_data_written", atype = int)
                        self._register_dal_model(0, vm, 'statistics', "2.17", key = "data_read_ps", atype = int)
                        self._register_dal_model(0, vm, 'statistics', "2.18", key = "read_operations", atype = int)
                        self._register_dal_model(0, vm, 'statistics', "2.19", key = "cluster_cache_hits", atype = int)
                        self._register_dal_model(0, vm, 'statistics', "2.20", key = "data_written_ps", atype = int)
                        self._register_dal_model(0, vm, 'statistics', "2.21", key = "cluster_cache_hits_ps", atype = int)
                        self._register_dal_model(0, vm, 'statistics', "2.22", key = "cache_hits_ps", atype = int)
                        self._register_dal_model(0, vm, 'statistics', "2.23", key = "timestamp", atype = int)
                        self._register_dal_model(0, vm, 'statistics', "2.24", key = "metadata_store_misses_ps", atype = int)
                        self._register_dal_model(0, vm, 'statistics', "2.25", key = "backend_data_written_ps", atype = int)
                        self._register_dal_model(0, vm, 'statistics', "2.26", key = "backend_read_operations", atype = int)
                        self._register_dal_model(0, vm, 'statistics', "2.27", key = "data_written", atype = int)
                        self._register_dal_model(0, vm, 'statistics', "2.28", key = "metadata_store_hits", atype = int)
                        self._register_dal_model(0, vm, 'statistics', "2.29", key = "backend_data_read_ps", atype = int)
                        self._register_dal_model(0, vm, 'statistics', "2.30", key = "operations_ps", atype = int)
                        self._register_dal_model(0, vm, 'statistics', "2.31", key = "backend_read_operations_ps", atype = int)
                        self._register_dal_model(0, vm, 'statistics', "2.32", key = "data_transferred_ps", atype = int)
                        self._register_dal_model(0, vm, 'statistics', "2.33", key = "write_operations_ps", atype = int)
                        self._register_dal_model(0, vm, 'statistics', "2.34", key = "data_transferred", atype = int)
                        self._register_dal_model(0, vm, 'stored_data', "3", atype = int)
                        self._register_dal_model(0, vm, 'description', "4")
                        self._register_dal_model(0, vm, 'devicename', "5")
                        self._register_dal_model(0, vm, 'dtl_mode', "6")
                        self._register_dal_model(0, vm, 'hypervisorid', "7")
                        self._register_dal_model(0, vm, 'ip', "8")
                        self._register_dal_model(0, vm, 'status', "10")
                        self._register_dal_model(0, vm, 'stored_data', "10", atype = int)
                        self._register_dal_model(0, vm, 'snapshots', "11", atype = int)
                        self._register_dal_model(0, vm, 'vdisks', "12", atype = int)
                        self._register_dal_model(0, vm, 'DTL', '13',
                                                 func = lambda vm: 'DEGRADED' if all(item == 'DEGRADED' for item in [vd.info['failover_mode'] for vd in vm.vdisks]) else 'OK')
                    self.instance_oid += 1

            for vd in VDiskList.get_vdisks():
                _guids.add(vd.guid)
                if not self._check_added(vd):
                    self._register_dal_model(1, vd, 'guid', "0")
                    self._register_dal_model(1, vd, 'name', "1")
                    self._register_dal_model(1, vd, 'statistics', "2.0", key = "operations", atype = int)
                    self._register_dal_model(1, vd, 'statistics', "2.1", key = "data_written_ps", atype = int)
                    self._register_dal_model(1, vd, 'statistics', "2.2", key = "data_read", atype = int)
                    self._register_dal_model(1, vd, 'statistics', "2.3", key = "sco_cache_misses", atype = int)
                    self._register_dal_model(1, vd, 'statistics', "2.4", key = "sco_cache_hits_ps", atype = int)
                    self._register_dal_model(1, vd, 'statistics', "2.5", key = "sco_cache_hits", atype = int)
                    self._register_dal_model(1, vd, 'statistics', "2.6", key = "write_operations", atype = int)
                    self._register_dal_model(1, vd, 'statistics', "2.7", key = "cluster_cache_misses", atype = int)
                    self._register_dal_model(1, vd, 'statistics', "2.8", key = "read_operations_ps", atype = int)
                    self._register_dal_model(1, vd, 'statistics', "2.9", key = "sco_cache_misses_ps", atype = int)
                    self._register_dal_model(1, vd, 'statistics', "2.10", key = "backend_write_operations", atype = int)
                    self._register_dal_model(1, vd, 'statistics', "2.11", key = "backend_data_read", atype = int)
                    self._register_dal_model(1, vd, 'statistics', "2.12", key = "cache_hits", atype = int)
                    self._register_dal_model(1, vd, 'statistics', "2.13", key = "backend_write_operations_ps", atype = int)
                    self._register_dal_model(1, vd, 'statistics', "2.14", key = "metadata_store_hits_ps", atype = int)
                    self._register_dal_model(1, vd, 'statistics', "2.15", key = "metadata_store_misses", atype = int)
                    self._register_dal_model(1, vd, 'statistics', "2.16", key = "backend_data_written", atype = int)
                    self._register_dal_model(1, vd, 'statistics', "2.17", key = "data_read_ps", atype = int)
                    self._register_dal_model(1, vd, 'statistics', "2.18", key = "read_operations", atype = int)
                    self._register_dal_model(1, vd, 'statistics', "2.19", key = "cluster_cache_hits", atype = int)
                    self._register_dal_model(1, vd, 'statistics', "2.20", key = "cluster_cache_misses_ps", atype = int)
                    self._register_dal_model(1, vd, 'statistics', "2.21", key = "cluster_cache_hits_ps", atype = int)
                    self._register_dal_model(1, vd, 'statistics', "2.22", key = "cache_hits_ps", atype = int)
                    self._register_dal_model(1, vd, 'statistics', "2.23", key = "timestamp", atype = int)
                    self._register_dal_model(1, vd, 'statistics', "2.24", key = "metadata_store_misses_ps", atype = int)
                    self._register_dal_model(1, vd, 'statistics', "2.25", key = "backend_data_written_ps", atype = int)
                    self._register_dal_model(1, vd, 'statistics', "2.26", key = "backend_read_operations", atype = int)
                    self._register_dal_model(1, vd, 'statistics', "2.27", key = "data_written", atype = int)
                    self._register_dal_model(1, vd, 'statistics', "2.28", key = "metadata_store_hits", atype = int)
                    self._register_dal_model(1, vd, 'statistics', "2.29", key = "backend_data_read_ps", atype = int)
                    self._register_dal_model(1, vd, 'statistics', "2.30", key = "operations_ps", atype = int)
                    self._register_dal_model(1, vd, 'statistics', "2.31", key = "backend_read_operations_ps", atype = int)
                    self._register_dal_model(1, vd, 'statistics', "2.32", key = "data_transferred_ps", atype = int)
                    self._register_dal_model(1, vd, 'statistics', "2.33", key = "write_operations_ps", atype = int)
                    self._register_dal_model(1, vd, 'statistics', "2.34", key = "data_transferred", atype = int)
                    self._register_dal_model(1, vd, 'info', "3", key = 'stored', atype = int)
                    self._register_dal_model(1, vd, 'info', "4", key = 'failover_mode', atype = int)
                    self._register_dal_model(1, vd, 'snapshots', "5", atype = int)
                    self.instance_oid += 1

            for pm in PMachineList.get_pmachines():
                _guids.add(pm.guid)
                if not self._check_added(pm):
                    self._register_dal_model(2, pm, 'guid', "0")
                    self._register_dal_model(2, pm, 'name', "1")
                    self._register_dal_model(2, pm, 'host_status', "2")
                    self.instance_oid += 1

            for vp in VPoolList.get_vpools():
                _guids.add(vp.guid)
                if not self._check_added(vp):
                    self._register_dal_model(3, vp, 'guid', "0")
                    self._register_dal_model(3, vp, 'name', "1")
                    self._register_dal_model(3, vp, 'statistics', "2.0", key = "operations", atype = int)
                    self._register_dal_model(3, vp, 'statistics', "2.1", key = "cluster_cache_misses_ps", atype = int)
                    self._register_dal_model(3, vp, 'statistics', "2.2", key = "data_read", atype = int)
                    self._register_dal_model(3, vp, 'statistics', "2.3", key = "sco_cache_misses", atype = int)
                    self._register_dal_model(3, vp, 'statistics', "2.4", key = "sco_cache_hits_ps", atype = int)
                    self._register_dal_model(3, vp, 'statistics', "2.5", key = "sco_cache_hits", atype = int)
                    self._register_dal_model(3, vp, 'statistics', "2.6", key = "write_operations", atype = int)
                    self._register_dal_model(3, vp, 'statistics', "2.7", key = "cluster_cache_misses", atype = int)
                    self._register_dal_model(3, vp, 'statistics', "2.8", key = "read_operations_ps", atype = int)
                    self._register_dal_model(3, vp, 'statistics', "2.9", key = "sco_cache_misses_ps", atype = int)
                    self._register_dal_model(3, vp, 'statistics', "2.10", key = "backend_write_operations", atype = int)
                    self._register_dal_model(3, vp, 'statistics', "2.11", key = "backend_data_read", atype = int)
                    self._register_dal_model(3, vp, 'statistics', "2.12", key = "cache_hits", atype = int)
                    self._register_dal_model(3, vp, 'statistics', "2.13", key = "backend_write_operations_ps", atype = int)
                    self._register_dal_model(3, vp, 'statistics', "2.14", key = "metadata_store_hits_ps", atype = int)
                    self._register_dal_model(3, vp, 'statistics', "2.15", key = "metadata_store_misses", atype = int)
                    self._register_dal_model(3, vp, 'statistics', "2.16", key = "backend_data_written", atype = int)
                    self._register_dal_model(3, vp, 'statistics', "2.17", key = "data_read_ps", atype = int)
                    self._register_dal_model(3, vp, 'statistics', "2.18", key = "read_operations", atype = int)
                    self._register_dal_model(3, vp, 'statistics', "2.19", key = "cluster_cache_hits", atype = int)
                    self._register_dal_model(3, vp, 'statistics', "2.20", key = "data_written_ps", atype = int)
                    self._register_dal_model(3, vp, 'statistics', "2.21", key = "cluster_cache_hits_ps", atype = int)
                    self._register_dal_model(3, vp, 'statistics', "2.22", key = "cache_hits_ps", atype = int)
                    self._register_dal_model(3, vp, 'statistics', "2.23", key = "timestamp", atype = int)
                    self._register_dal_model(3, vp, 'statistics', "2.24", key = "metadata_store_misses_ps", atype = int)
                    self._register_dal_model(3, vp, 'statistics', "2.25", key = "backend_data_written_ps", atype = int)
                    self._register_dal_model(3, vp, 'statistics', "2.26", key = "backend_read_operations", atype = int)
                    self._register_dal_model(3, vp, 'statistics', "2.27", key = "data_written", atype = int)
                    self._register_dal_model(3, vp, 'statistics', "2.28", key = "metadata_store_hits", atype = int)
                    self._register_dal_model(3, vp, 'statistics', "2.29", key = "backend_data_read_ps", atype = int)
                    self._register_dal_model(3, vp, 'statistics', "2.30", key = "operations_ps", atype = int)
                    self._register_dal_model(3, vp, 'statistics', "2.31", key = "backend_read_operations_ps", atype = int)
                    self._register_dal_model(3, vp, 'statistics', "2.32", key = "data_transferred_ps", atype = int)
                    self._register_dal_model(3, vp, 'statistics', "2.33", key = "write_operations_ps", atype = int)
                    self._register_dal_model(3, vp, 'statistics', "2.34", key = "data_transferred", atype = int)
                    self._register_dal_model(3, vp, 'status', "3")
                    self._register_dal_model(3, vp, 'description', "4")
                    self._register_dal_model(3, vp, 'vdisks', "5", atype = int)
                    self._register_dal_model(3, vp, '#vmachines', "6",
                                             func = lambda vp: len(set([vd.vmachine.guid for vd in vp.vdisks])),
                                             atype = int)
                    self.instance_oid += 1

            for storagedriver in StorageDriverList.get_storagedrivers():
                _guids.add(storagedriver.guid)
                if not self._check_added(storagedriver):
                    self._register_dal_model(4, storagedriver, 'guid', "0")
                    self._register_dal_model(4, storagedriver, 'name', "1")
                    self._register_dal_model(4, storagedriver, 'stored_data', "2", atype = int)
                    self.instance_oid += 1

            try:
                # try to load OVS Backends
                from ovs.dal.lists.albabackendlist import AlbaBackendList
                for backend in AlbaBackendList.get_albabackends():
                    _guids.add(backend.guid)
                    if not self._check_added(backend):
                        self._register_dal_model(5, backend, 'guid', 0)
                        self._register_dal_model(5, backend, 'name', 1)
                        for disk_id in range(len((backend.all_disks))):
                            self._register_dal_model(5, backend, 'all_disks', '2.{0}.0'.format(disk_id), key = "name", index=disk_id)
                            self._register_dal_model(5, backend, 'all_disks', '2.{0}.1'.format(disk_id), key = "usage.size", atype = long, index=disk_id)
                            self._register_dal_model(5, backend, 'all_disks', '2.{0}.2'.format(disk_id), key = "usage.used", atype = long, index=disk_id)
                            self._register_dal_model(5, backend, 'all_disks', '2.{0}.3'.format(disk_id), key = "usage.available", atype = long, index=disk_id)
                            self._register_dal_model(5, backend, 'all_disks', '2.{0}.4'.format(disk_id), key = "state.state", index=disk_id)
                            self._register_dal_model(5, backend, 'all_disks', '2.{0}.5'.format(disk_id), key = "node_id", index=disk_id)

                        self.instance_oid += 1
            except ImportError:
                print('OVS Backend not present')
                pass
            reload = False
            for object_guid in list(self.model_oids):
                if object_guid not in _guids:
                    self.model_oids.remove(object_guid)
                    reload = True
            if reload:
                self._reload_snmp()
    def _local_stack(self):
        """
        Returns a live list of all disks known to this AlbaBackend
        """
        from ovs.dal.lists.albanodelist import AlbaNodeList
        from ovs.dal.lists.albabackendlist import AlbaBackendList

        if len(self.abm_services) == 0:
            return {}  # No ABM services yet, so backend not fully installed yet

        alba_backend_map = {}
        for alba_backend in AlbaBackendList.get_albabackends():
            alba_backend_map[alba_backend.alba_id] = alba_backend

        # Load information based on the model
        asd_map = {}
        storage_map = {}
        alba_nodes = AlbaNodeList.get_albanodes()
        for node in alba_nodes:
            node_id = node.node_id
            storage_map[node_id] = {}
            for disk in node.disks:
                disk_id = disk.aliases[0].split('/')[-1]
                storage_map[node_id][disk_id] = {'asds': {},
                                                 'name': disk_id,
                                                 'guid': disk.guid,
                                                 'status': 'error',
                                                 'aliases': disk.aliases,
                                                 'status_detail': 'unknown'}
                for osd in disk.osds:
                    osd_id = osd.osd_id
                    data = {'asd_id': osd_id,
                            'guid': osd.guid,
                            'status': 'error',
                            'status_detail': 'unknown',
                            'alba_backend_guid': osd.alba_backend_guid}
                    asd_map[osd_id] = data
                    storage_map[node_id][disk_id]['asds'][osd_id] = data

        # Load information from node
        def _load_live_info(_node, _node_data):
            _data = _node.storage_stack
            if _data['status'] != 'ok':
                for disk_entry in _node_data.values():
                    disk_entry['status_detail'] = _data['status']
                    for entry in disk_entry.get('asds', {}).values():
                        entry['status_detail'] = _data['status']
            else:
                for _disk_id, disk_asd_info in _data['stack'].iteritems():
                    if _disk_id not in _node_data:
                        _node_data[_disk_id] = {'asds': {}}
                    entry = _node_data[_disk_id]
                    disk_info = copy.deepcopy(disk_asd_info)
                    del disk_info['asds']
                    entry.update(disk_info)
                    asds_info = disk_asd_info['asds']
                    for _asd_id, asd_info in asds_info.iteritems():
                        if _asd_id not in _node_data[_disk_id]['asds']:
                            _node_data[_disk_id]['asds'][_asd_id] = asd_info
                        else:
                            _node_data[_disk_id]['asds'][_asd_id].update(asd_info)

        threads = []
        for node in alba_nodes:
            thread = Thread(target=_load_live_info, args=(node, storage_map[node.node_id]))
            thread.start()
            threads.append(thread)
        for thread in threads:
            thread.join()

        # Mix in usage information
        for asd_id, stats in self.asd_statistics.iteritems():
            if asd_id in asd_map:
                asd_map[asd_id]['usage'] = {'size': int(stats['capacity']),
                                            'used': int(stats['disk_usage']),
                                            'available': int(stats['capacity'] - stats['disk_usage'])}

        # Load information from alba
        backend_interval_key = '/ovs/alba/backends/{0}/gui_error_interval'.format(self.guid)
        if Configuration.exists(backend_interval_key):
            interval = Configuration.get(backend_interval_key)
        else:
            interval = Configuration.get('/ovs/alba/backends/global_gui_error_interval')
        config = Configuration.get_configuration_path('/ovs/arakoon/{0}-abm/config'.format(self.name))
        asds = {}
        for found_osd in AlbaCLI.run(command='list-all-osds', config=config):
            asds[found_osd['long_id']] = found_osd
        for node_data in storage_map.values():
            for _disk in node_data.values():
                for asd_id, asd_data in _disk['asds'].iteritems():
                    if asd_id not in asds:
                        continue
                    found_osd = asds[asd_id]
                    if 'state' not in asd_data:
                        continue
                    if found_osd.get('decommissioned') is True:
                        asd_data['status'] = 'unavailable'
                        asd_data['status_detail'] = 'decommissioned'
                        continue
                    state = asd_data['state']
                    if state == 'ok':
                        if found_osd['id'] is None:
                            alba_id = found_osd['alba_id']
                            if alba_id is None:
                                asd_data['status'] = 'available'
                            else:
                                asd_data['status'] = 'unavailable'
                                alba_backend = alba_backend_map.get(alba_id)
                                if alba_backend is not None:
                                    asd_data['alba_backend_guid'] = alba_backend.guid
                        else:
                            asd_data['alba_backend_guid'] = self.guid
                            asd_data['status'] = 'warning'
                            asd_data['status_detail'] = 'recenterrors'

                            read = found_osd['read'] or [0]
                            write = found_osd['write'] or [0]
                            errors = found_osd['errors']
                            if len(errors) == 0 or (len(read + write) > 0 and max(min(read), min(write)) > max(error[0] for error in errors) + interval):
                                asd_data['status'] = 'claimed'
                                asd_data['status_detail'] = ''
                    else:
                        asd_data['status'] = 'error'
                        asd_data['status_detail'] = asd_data.get('state_detail', '')
                        alba_backend = alba_backend_map.get(found_osd.get('alba_id'))
                        if alba_backend is not None:
                            asd_data['alba_backend_guid'] = alba_backend.guid
        return storage_map
    def _alba_arakoon_checkup(cls,
                              alba_backend_guid=None,
                              abm_cluster=None,
                              nsm_clusters=None):
        # type: (Optional[str], Optional[str], Optional[List[str]]) -> None
        """
        Creates a new Arakoon cluster if required and extends cluster if possible on all available master nodes
        :param alba_backend_guid: Guid of the ALBA Backend
        :type alba_backend_guid: str
        :param nsm_clusters: NSM clusters for this ALBA Backend
        The code will claim the Arakoon clusters for this backend when provided
        :type nsm_clusters: list[str]
        :param abm_cluster: ABM cluster for this ALBA Backend
        The code will claim the Arakoon cluster for this backend when provided
        :type abm_cluster: str|None
        :return:None
        :rtype: NoneType
        """
        slaves = StorageRouterList.get_slaves()
        masters = StorageRouterList.get_masters()
        clients = {}
        for storagerouter in masters + slaves:
            try:
                clients[storagerouter] = SSHClient(storagerouter)
            except UnableToConnectException:
                cls._logger.warning(
                    'Storage Router with IP {0} is not reachable'.format(
                        storagerouter.ip))
        available_storagerouters = cls.get_available_arakoon_storagerouters(
            clients)

        # Call here, because this potentially raises error, which should happen before actually making changes
        abm_installer = ABMInstaller(ssh_clients=clients)
        nsm_installer = NSMInstaller(version_str=abm_installer.version_str,
                                     ssh_clients=clients)

        # Cluster creation
        if alba_backend_guid is not None:
            alba_backend = AlbaBackend(alba_backend_guid)
            # @todo revisit. This might enforce the ABM name for externals (might be unintended)
            abm_cluster_name = '{0}-abm'.format(alba_backend.name)

            # ABM Arakoon cluster creation
            if alba_backend.abm_cluster is None:
                # Fallback to installing the cluster on an available storagerouter
                storagerouter, partition = available_storagerouters.items()[0]
                abm_installer.deploy_abm_cluster(
                    alba_backend,
                    abm_cluster_name,
                    requested_abm_cluster_name=abm_cluster,
                    storagerouter=storagerouter)

            # NSM Arakoon cluster creation
            if len(alba_backend.nsm_clusters
                   ) == 0 and nsm_clusters is not None:
                storagerouter, partition = available_storagerouters.items()[0]
                nsm_installer.deploy_nsm_cluster(alba_backend,
                                                 storagerouter=storagerouter,
                                                 nsm_clusters=nsm_clusters)

        # ABM Cluster extension
        for alba_backend in AlbaBackendList.get_albabackends():
            if alba_backend.abm_cluster is None:
                AlbaArakoonController._logger.warning(
                    'ALBA Backend {0} does not have an ABM cluster registered'.
                    format(alba_backend.name))
                continue
            cls.ensure_abm_cluster_safety(alba_backend.abm_cluster,
                                          available_storagerouters,
                                          abm_installer=abm_installer)
    def checkup_maintenance_agents():
        """
        Check if requested nr of maintenance agents / backend is actually present
        Add / remove as necessary
        :return: None
        """
        service_template_key = 'alba-maintenance_{0}-{1}'
        maintenance_agents_map = {}
        asd_nodes = AlbaNodeList.get_albanodes()
        nr_of_storage_nodes = len(asd_nodes)

        def _get_node_load(backend_name):
            highest_load = 0
            lowest_load = sys.maxint
            agent_load = {'high_load_node': asd_nodes[0] if asd_nodes else None,
                          'low_load_node': asd_nodes[0] if asd_nodes else None,
                          'total_load': 0}
            for asd_node in asd_nodes:
                actual_nr_of_agents = 0
                maint_services = asd_node.client.list_maintenance_services()
                for service_name in maint_services:
                    if service_template_key.format(backend_name, '') in service_name:
                        actual_nr_of_agents += 1
                if actual_nr_of_agents > highest_load:
                    agent_load['high_load_node'] = asd_node
                    highest_load = actual_nr_of_agents
                if actual_nr_of_agents < lowest_load:
                    agent_load['low_load_node'] = asd_node
                    lowest_load = actual_nr_of_agents
                agent_load['total_load'] += actual_nr_of_agents

            return agent_load

        alba_backends = AlbaBackendList.get_albabackends()
        for alba_backend in alba_backends:
            nr_of_agents_key = AlbaNodeController.NR_OF_AGENTS_ETCD_TEMPLATE.format(alba_backend.guid)
            name = alba_backend.backend.name
            if not EtcdConfiguration.exists(nr_of_agents_key):
                EtcdConfiguration.set(nr_of_agents_key, nr_of_storage_nodes)
            required_nr = EtcdConfiguration.get(nr_of_agents_key)
            maintenance_agents_map[name] = {'required': required_nr,
                                            'actual': _get_node_load(name)['total_load'],
                                            'backend': alba_backend.backend}

        for name, values in maintenance_agents_map.iteritems():
            AlbaNodeController._logger.info('Checking backend: {0}'.format(name))
            to_process = values['required'] - values['actual']

            if to_process == 0:
                AlbaNodeController._logger.info('No action required for: {0}'.format(name))
            elif to_process >= 0:
                AlbaNodeController._logger.info('Adding {0} maintenance agent(s) for {1}'.format(to_process, name))
                for _ in xrange(to_process):
                    unique_hash = ''.join(random.choice(string.ascii_letters + string.digits) for _ in range(16))
                    node = _get_node_load(name)['low_load_node']
                    AlbaNodeController._logger.info('Service to add: ' + service_template_key.format(name, unique_hash))
                    if node and node.client:
                        node.client.add_maintenance_service(service_template_key.format(name, unique_hash),
                                                            values['backend'].alba_backend.guid,
                                                            AlbaController.get_abm_service_name(values['backend']))
                        AlbaNodeController._logger.info('Service added')
            else:
                to_process = abs(to_process)
                AlbaNodeController._logger.info('Removing {0} maintenance agent(s) for {1}'.format(to_process, name))
                for _ in xrange(to_process):
                    node = _get_node_load(name)['high_load_node']
                    services = node.client.list_maintenance_services()
                    if services and node and node.client:
                        for service in services:
                            if 'alba-maintenance_' + name in service:
                                node.client.remove_maintenance_service(service)
                                break
    def _storage_stack(self):
        """
        Returns a live list of all disks known to this AlbaBackend
        """
        from ovs.dal.lists.albanodelist import AlbaNodeList
        from ovs.dal.lists.albabackendlist import AlbaBackendList

        if len(self.abm_services) == 0:
            return {}  # No ABM services yet, so backend not fully installed yet

        storage_map = {}
        asd_map = {}

        alba_backend_map = {}
        for alba_backend in AlbaBackendList.get_albabackends():
            alba_backend_map[alba_backend.alba_id] = alba_backend

        # Load information based on the model
        alba_nodes = AlbaNodeList.get_albanodes()
        for node in alba_nodes:
            node_id = node.node_id
            storage_map[node_id] = {}
            for disk in node.disks:
                disk_id = disk.name
                storage_map[node_id][disk_id] = {'name': disk_id,
                                                 'guid': disk.guid,
                                                 'status': 'error',
                                                 'status_detail': 'unknown',
                                                 'asds': {}}
                for asd in disk.asds:
                    asd_id = asd.asd_id
                    data = {'asd_id': asd_id,
                            'guid': asd.guid,
                            'status': 'error',
                            'status_detail': 'unknown',
                            'alba_backend_guid': asd.alba_backend_guid}
                    asd_map[asd_id] = data
                    storage_map[node_id][disk_id]['asds'][asd_id] = data

        # Load information from node
        def _load_live_info(_node, _node_data):
            # Live disk information
            try:
                disk_data = _node.client.get_disks()
            except (requests.ConnectionError, requests.Timeout):
                for entry in _node_data.values():
                    entry['status_detail'] = 'nodedown'
                disk_data = {}
            for _disk_id, disk_info in disk_data.iteritems():
                if _disk_id in _node_data:
                    entry = _node_data[_disk_id]
                else:
                    entry = {'name': _disk_id,
                             'status': 'unknown',
                             'status_detail': '',
                             'asds': {}}
                    _node_data[_disk_id] = entry
                entry.update(disk_info)
                if disk_info['state'] == 'ok':
                    entry['status'] = 'uninitialized' if disk_info['available'] is True else 'initialized'
                    entry['status_detail'] = ''
                else:
                    entry['status'] = disk_info['state']
                    entry['status_detail'] = disk_info.get('state_detail', '')
            # Live ASD information
            try:
                _asd_data = _node.client.get_asds()
            except (requests.ConnectionError, requests.Timeout):
                for disk_entry in _node_data.values():
                    for entry in disk_entry['asds'].values():
                        entry['status_detail'] = 'nodedown'
                _asd_data = {}
            for _disk_id, asds in _asd_data.iteritems():
                if _disk_id not in _node_data:
                    continue
                for _asd_id, asd_info in asds.iteritems():
                    entry = {'asd_id': _asd_id,
                             'status': 'error' if asd_info['state'] == 'error' else 'initialized',
                             'status_detail': asd_info.get('state_detail', ''),
                             'state': asd_info['state'],
                             'state_detail': asd_info.get('state_detail', '')}
                    if _asd_id not in _node_data[_disk_id]['asds']:
                        _node_data[_disk_id]['asds'][_asd_id] = entry
                        asd_map[_asd_id] = entry
                    else:
                        _node_data[_disk_id]['asds'][_asd_id].update(entry)
        threads = []
        for node in alba_nodes:
            thread = Thread(target=_load_live_info, args=(node, storage_map[node.node_id]))
            thread.start()
            threads.append(thread)
        for thread in threads:
            thread.join()

        # Mix in usage information
        for asd_id, stats in self.asd_statistics.iteritems():
            if asd_id in asd_map:
                asd_map[asd_id]['usage'] = {'size': int(stats['capacity']),
                                            'used': int(stats['disk_usage']),
                                            'available': int(stats['capacity'] - stats['disk_usage'])}

        # Load information from alba
        backend_interval_key = '/ovs/alba/backends/{0}/gui_error_interval'.format(self.guid)
        if EtcdConfiguration.exists(backend_interval_key):
            interval = EtcdConfiguration.get(backend_interval_key)
        else:
            interval = EtcdConfiguration.get('/ovs/alba/backends/global_gui_error_interval')
        config = 'etcd://127.0.0.1:2379/ovs/arakoon/{0}/config'.format(self.abm_services[0].service.name)
        for found_osd in AlbaCLI.run('list-all-osds', config=config, as_json=True):
            node_id = found_osd['node_id']
            asd_id = found_osd['long_id']
            for _disk in storage_map.get(node_id, {}).values():
                asd_data = _disk['asds'].get(asd_id, {})
                if 'state' not in asd_data:
                    continue
                if found_osd.get('decommissioned') is True:
                    asd_data['status'] = 'unavailable'
                    asd_data['status_detail'] = 'decommissioned'
                    continue
                state = asd_data['state']
                if state == 'ok':
                    if found_osd['id'] is None:
                        alba_id = found_osd['alba_id']
                        if alba_id is None:
                            asd_data['status'] = 'available'
                        else:
                            asd_data['status'] = 'unavailable'
                            alba_backend = alba_backend_map.get(alba_id)
                            if alba_backend is not None:
                                asd_data['alba_backend_guid'] = alba_backend.guid
                    else:
                        asd_data['alba_backend_guid'] = self.guid
                        asd_data['status'] = 'warning'
                        asd_data['status_detail'] = 'recenterrors'

                        read = found_osd['read'] or [0]
                        write = found_osd['write'] or [0]
                        errors = found_osd['errors']
                        if len(errors) == 0 or (len(read + write) > 0 and max(min(read), min(write)) > max(error[0] for error in errors) + interval):
                            asd_data['status'] = 'claimed'
                            asd_data['status_detail'] = ''
                else:
                    asd_data['status'] = 'error'
                    asd_data['status_detail'] = asd_data.get('state_detail', '')
                    alba_backend = alba_backend_map.get(found_osd.get('alba_id'))
                    if alba_backend is not None:
                        asd_data['alba_backend_guid'] = alba_backend.guid
        return storage_map
Example #33
0
    def migrate(previous_version):
        """
        Migrates from a given version to the current version. It uses 'previous_version' to be smart
        wherever possible, but the code should be able to migrate any version towards the expected version.
        When this is not possible, the code can set a minimum version and raise when it is not met.
        :param previous_version: The previous version from which to start the migration
        :type previous_version: float
        """

        working_version = previous_version

        if working_version == 0:
            from ovs.dal.hybrids.servicetype import ServiceType
            # Initial version:
            # * Add any basic configuration or model entries

            # Add backends
            for backend_type_info in [('ALBA', 'alba')]:
                code = backend_type_info[1]
                backend_type = BackendTypeList.get_backend_type_by_code(code)
                if backend_type is None:
                    backend_type = BackendType()
                backend_type.name = backend_type_info[0]
                backend_type.code = code
                backend_type.save()

            # Add service types
            for service_type_info in [
                    ServiceType.SERVICE_TYPES.NS_MGR,
                    ServiceType.SERVICE_TYPES.ALBA_MGR,
                    ServiceType.SERVICE_TYPES.ALBA_S3_TRANSACTION
            ]:
                service_type = ServiceType()
                service_type.name = service_type_info
                service_type.save()

        # From here on, all actual migration should happen to get to the expected state for THIS RELEASE
        elif working_version < DALMigrator.THIS_VERSION:
            import hashlib
            from ovs.dal.exceptions import ObjectNotFoundException
            from ovs.dal.helpers import HybridRunner, Descriptor
            from ovs.dal.hybrids.albaabmcluster import ABMCluster
            from ovs.dal.hybrids.albaosd import AlbaOSD
            from ovs.dal.hybrids.albansmcluster import NSMCluster
            from ovs.dal.hybrids.j_abmservice import ABMService
            from ovs.dal.hybrids.j_nsmservice import NSMService
            from ovs.dal.hybrids.service import Service
            from ovs.dal.hybrids.servicetype import ServiceType
            from ovs.dal.lists.albabackendlist import AlbaBackendList
            from ovs.dal.lists.albanodelist import AlbaNodeList
            from ovs.dal.lists.servicetypelist import ServiceTypeList
            from ovs.dal.lists.storagerouterlist import StorageRouterList
            from ovs.extensions.db.arakooninstaller import ArakoonClusterConfig, ArakoonInstaller
            from ovs.extensions.generic.configuration import Configuration, NotFoundException
            from ovs_extensions.generic.toolbox import ExtensionsToolbox
            from ovs.extensions.plugins.albacli import AlbaCLI
            from ovs.extensions.storage.persistentfactory import PersistentFactory

            # Migrate unique constraints & indexes
            client = PersistentFactory.get_client()
            hybrid_structure = HybridRunner.get_hybrids()
            for class_descriptor in hybrid_structure.values():
                cls = Descriptor().load(class_descriptor).get_object()
                classname = cls.__name__.lower()
                unique_key = 'ovs_unique_{0}_{{0}}_'.format(classname)
                index_prefix = 'ovs_index_{0}|{{0}}|'.format(classname)
                index_key = 'ovs_index_{0}|{{0}}|{{1}}'.format(classname)
                uniques = []
                indexes = []
                # noinspection PyProtectedMember
                for prop in cls._properties:
                    if prop.unique is True and len([
                            k for k in client.prefix(
                                unique_key.format(prop.name))
                    ]) == 0:
                        uniques.append(prop.name)
                    if prop.indexed is True and len([
                            k for k in client.prefix(
                                index_prefix.format(prop.name))
                    ]) == 0:
                        indexes.append(prop.name)
                if len(uniques) > 0 or len(indexes) > 0:
                    prefix = 'ovs_data_{0}_'.format(classname)
                    for key, data in client.prefix_entries(prefix):
                        for property_name in uniques:
                            ukey = '{0}{1}'.format(
                                unique_key.format(property_name),
                                hashlib.sha1(str(
                                    data[property_name])).hexdigest())
                            client.set(ukey, key)
                        for property_name in indexes:
                            if property_name not in data:
                                continue  # This is the case when there's a new indexed property added.
                            ikey = index_key.format(
                                property_name,
                                hashlib.sha1(str(
                                    data[property_name])).hexdigest())
                            index = list(
                                client.get_multi([ikey], must_exist=False))[0]
                            transaction = client.begin_transaction()
                            if index is None:
                                client.assert_value(ikey,
                                                    None,
                                                    transaction=transaction)
                                client.set(ikey, [key],
                                           transaction=transaction)
                            elif key not in index:
                                client.assert_value(ikey,
                                                    index[:],
                                                    transaction=transaction)
                                client.set(ikey,
                                           index + [key],
                                           transaction=transaction)
                            client.apply_transaction(transaction)

            #############################################
            # Introduction of ABMCluster and NSMCluster #
            #############################################
            # Verify presence of unchanged ALBA Backends
            alba_backends = AlbaBackendList.get_albabackends()
            changes_required = False
            for alba_backend in alba_backends:
                if alba_backend.abm_cluster is None or len(
                        alba_backend.nsm_clusters) == 0:
                    changes_required = True
                    break

            if changes_required:
                # Retrieve ABM and NSM clusters
                abm_cluster_info = []
                nsm_cluster_info = []
                for cluster_name in Configuration.list('/ovs/arakoon'):
                    try:
                        metadata = ArakoonInstaller.get_arakoon_metadata_by_cluster_name(
                            cluster_name=cluster_name)
                        if metadata[
                                'cluster_type'] == ServiceType.ARAKOON_CLUSTER_TYPES.ABM:
                            abm_cluster_info.append(metadata)
                        elif metadata[
                                'cluster_type'] == ServiceType.ARAKOON_CLUSTER_TYPES.NSM:
                            nsm_cluster_info.append(metadata)
                    except NotFoundException:
                        continue

                # Retrieve NSM Arakoon cluster information
                cluster_arakoon_map = {}
                for cluster_info in abm_cluster_info + nsm_cluster_info:
                    cluster_name = cluster_info['cluster_name']
                    arakoon_config = ArakoonClusterConfig(
                        cluster_id=cluster_name)
                    cluster_arakoon_map[
                        cluster_name] = arakoon_config.export_dict()

                storagerouter_map = dict(
                    (storagerouter.machine_id, storagerouter) for storagerouter
                    in StorageRouterList.get_storagerouters())
                alba_backend_id_map = dict((alba_backend.alba_id, alba_backend)
                                           for alba_backend in alba_backends)
                for cluster_info in abm_cluster_info:
                    internal = cluster_info['internal']
                    cluster_name = cluster_info['cluster_name']
                    config_location = Configuration.get_configuration_path(
                        key=ArakoonClusterConfig.CONFIG_KEY.format(
                            cluster_name))
                    try:
                        alba_id = AlbaCLI.run(command='get-alba-id',
                                              config=config_location,
                                              named_params={'attempts':
                                                            3})['id']
                        nsm_hosts = AlbaCLI.run(command='list-nsm-hosts',
                                                config=config_location,
                                                named_params={'attempts': 3})
                    except RuntimeError:
                        continue

                    alba_backend = alba_backend_id_map.get(alba_id)
                    if alba_backend is None:  # ALBA Backend with ID not found in model
                        continue
                    if alba_backend.abm_cluster is not None and len(
                            alba_backend.nsm_clusters
                    ) > 0:  # Clusters already exist
                        continue

                    # Create ABM Cluster
                    if alba_backend.abm_cluster is None:
                        abm_cluster = ABMCluster()
                        abm_cluster.name = cluster_name
                        abm_cluster.alba_backend = alba_backend
                        abm_cluster.config_location = ArakoonClusterConfig.CONFIG_KEY.format(
                            cluster_name)
                        abm_cluster.save()
                    else:
                        abm_cluster = alba_backend.abm_cluster

                    # Create ABM Services
                    abm_arakoon_config = cluster_arakoon_map[cluster_name]
                    abm_arakoon_config.pop('global')
                    arakoon_nodes = abm_arakoon_config.keys()
                    if internal is False:
                        services_to_create = 1
                    else:
                        if set(arakoon_nodes).difference(
                                set(storagerouter_map.keys())):
                            continue
                        services_to_create = len(arakoon_nodes)
                    for index in range(services_to_create):
                        service = Service()
                        service.name = 'arakoon-{0}-abm'.format(
                            alba_backend.name)
                        service.type = ServiceTypeList.get_by_name(
                            ServiceType.SERVICE_TYPES.ALBA_MGR)
                        if internal is True:
                            arakoon_node_config = abm_arakoon_config[
                                arakoon_nodes[index]]
                            service.ports = [
                                arakoon_node_config['client_port'],
                                arakoon_node_config['messaging_port']
                            ]
                            service.storagerouter = storagerouter_map[
                                arakoon_nodes[index]]
                        else:
                            service.ports = []
                            service.storagerouter = None
                        service.save()

                        abm_service = ABMService()
                        abm_service.service = service
                        abm_service.abm_cluster = abm_cluster
                        abm_service.save()

                    # Create NSM Clusters
                    for cluster_index, nsm_host in enumerate(
                            sorted(nsm_hosts,
                                   key=lambda host: ExtensionsToolbox.
                                   advanced_sort(host['cluster_id'], '_'))):
                        nsm_cluster_name = nsm_host['cluster_id']
                        nsm_arakoon_config = cluster_arakoon_map.get(
                            nsm_cluster_name)
                        if nsm_arakoon_config is None:
                            continue

                        number = cluster_index if internal is False else int(
                            nsm_cluster_name.split('_')[-1])
                        nsm_cluster = NSMCluster()
                        nsm_cluster.name = nsm_cluster_name
                        nsm_cluster.number = number
                        nsm_cluster.alba_backend = alba_backend
                        nsm_cluster.config_location = ArakoonClusterConfig.CONFIG_KEY.format(
                            nsm_cluster_name)
                        nsm_cluster.save()

                        # Create NSM Services
                        nsm_arakoon_config.pop('global')
                        arakoon_nodes = nsm_arakoon_config.keys()
                        if internal is False:
                            services_to_create = 1
                        else:
                            if set(arakoon_nodes).difference(
                                    set(storagerouter_map.keys())):
                                continue
                            services_to_create = len(arakoon_nodes)
                        for service_index in range(services_to_create):
                            service = Service()
                            service.name = 'arakoon-{0}-nsm_{1}'.format(
                                alba_backend.name, number)
                            service.type = ServiceTypeList.get_by_name(
                                ServiceType.SERVICE_TYPES.NS_MGR)
                            if internal is True:
                                arakoon_node_config = nsm_arakoon_config[
                                    arakoon_nodes[service_index]]
                                service.ports = [
                                    arakoon_node_config['client_port'],
                                    arakoon_node_config['messaging_port']
                                ]
                                service.storagerouter = storagerouter_map[
                                    arakoon_nodes[service_index]]
                            else:
                                service.ports = []
                                service.storagerouter = None
                            service.save()

                            nsm_service = NSMService()
                            nsm_service.service = service
                            nsm_service.nsm_cluster = nsm_cluster
                            nsm_service.save()

            # Clean up all junction services no longer linked to an ALBA Backend
            all_nsm_services = [
                service.nsm_service for service in ServiceTypeList.get_by_name(
                    ServiceType.SERVICE_TYPES.NS_MGR).services
                if service.nsm_service.nsm_cluster is None
            ]
            all_abm_services = [
                service.abm_service for service in ServiceTypeList.get_by_name(
                    ServiceType.SERVICE_TYPES.ALBA_MGR).services
                if service.abm_service.abm_cluster is None
            ]
            for abm_service in all_abm_services:
                abm_service.delete()
                abm_service.service.delete()
            for nsm_service in all_nsm_services:
                nsm_service.delete()
                nsm_service.service.delete()

            ################################
            # Introduction of Active Drive #
            ################################
            # Update slot_id and Alba Node relation for all OSDs
            client = PersistentFactory.get_client()
            disk_osd_map = {}
            for key, data in client.prefix_entries('ovs_data_albaosd_'):
                alba_disk_guid = data.get('alba_disk', {}).get('guid')
                if alba_disk_guid is not None:
                    if alba_disk_guid not in disk_osd_map:
                        disk_osd_map[alba_disk_guid] = []
                    disk_osd_map[alba_disk_guid].append(
                        key.replace('ovs_data_albaosd_', ''))
                try:
                    value = client.get(key)
                    value.pop('alba_disk', None)
                    client.set(key=key, value=value)
                except Exception:
                    pass  # We don't care if we would have any leftover AlbaDisk information in _data, but its cleaner not to

            alba_guid_node_map = dict(
                (an.guid, an) for an in AlbaNodeList.get_albanodes())
            for key, data in client.prefix_entries('ovs_data_albadisk_'):
                alba_disk_guid = key.replace('ovs_data_albadisk_', '')
                alba_node_guid = data.get('alba_node', {}).get('guid')
                if alba_disk_guid in disk_osd_map and alba_node_guid in alba_guid_node_map and len(
                        data.get('aliases', [])) > 0:
                    slot_id = data['aliases'][0].split('/')[-1]
                    for osd_guid in disk_osd_map[alba_disk_guid]:
                        try:
                            osd = AlbaOSD(osd_guid)
                        except ObjectNotFoundException:
                            continue
                        osd.slot_id = slot_id
                        osd.alba_node = alba_guid_node_map[alba_node_guid]
                        osd.save()
                client.delete(key=key, must_exist=False)

            # Remove unique constraints for AlbaNode IP
            for key in client.prefix('ovs_unique_albanode_ip_'):
                client.delete(key=key, must_exist=False)

            # Remove relation for all Alba Disks
            for key in client.prefix('ovs_reverseindex_albadisk_'):
                client.delete(key=key, must_exist=False)

            # Remove the relation between AlbaNode and AlbaDisk
            for key in client.prefix('ovs_reverseindex_albanode_'):
                if '|disks|' in key:
                    client.delete(key=key, must_exist=False)

        return DALMigrator.THIS_VERSION
    def get_backend_stats():
        """
        Send backend stats for each backend to InfluxDB
        """
        points = []
        abms = []
        abs = []

        for service in ServiceList.get_services():
            if service.type.name == ServiceType.SERVICE_TYPES.ALBA_MGR:
                abms.append(service.name)

        for ab in AlbaNodeList.get_albanodes():
            abs.append(ab.node_id)

        abms = list(set(abms))

        config = "etcd://127.0.0.1:2379/ovs/arakoon/{}/config".format(abms[0])
        try:
            decommissioning_osds = AlbaCLI.run('list-decommissioning-osds', config=config, to_json=True)
        except Exception as ex:
            StatsmonkeyScheduledTaskController._logger.error('{0}'.format(ex.message))
            return None

        filtered_osds = []

        for ab in abs:
            filtered_osds += [osd for osd in decommissioning_osds if osd['node_id'] == ab]

        abl = AlbaBackendList.get_albabackends()

        for ab in abl:
            try:
                stat = {
                    'measurement': 'backend_stats',
                    'tags': {
                        'backend_name': ab.name
                    },
                    'fields': {
                        'gets': ab.statistics['multi_get']['n'],
                        'puts': ab.statistics['apply']['n']
                    }
                }
                stat_asd = {
                    'decommissioning': len(filtered_osds),
                    'decommissioned': 0,
                    'claimed': 0,
                    'warning': 0,
                    'failure': 0,
                    'error': 0
                }

                for disks in ab.local_stack.values():
                    for disk in disks.values():
                        for asd in disk['asds'].values():
                            if asd['alba_backend_guid'] == ab.guid:
                                status = asd['status']
                                status_detail = asd['status_detail']
                                if status_detail == 'decommissioned':
                                    status = status_detail
                                if status not in stat_asd:
                                    stat_asd[status] = 0
                                stat_asd[status] += 1

                for status in stat_asd:
                    stat['fields'][status] = stat_asd[status]
                points.append(stat)
            except Exception as ex:
                StatsmonkeyScheduledTaskController._logger.error(ex.message)

        if len(points) == 0:
            StatsmonkeyScheduledTaskController._logger.info("No statistics found")
            return None

        StatsmonkeyScheduledTaskController._send_stats(points)
        return points