Example #1
class MigrationController(object):
    """
    This controller contains (part of) the migration code. It runs out of band with the updater to reduce the risk of
    failures during the update.
    """
    _logger = Logger(name='update', forced_target_type='file')

    @staticmethod
    @ovs_task(name='ovs.migration.migrate', schedule=Schedule(minute='0', hour='6'), ensure_single_info={'mode': 'DEFAULT'})
    def migrate():
        """
        Executes async migrations. It doesn't matter too much when they are executed, as long as they get eventually
        executed. This code will typically contain:
        * "dangerous" migration code (it needs certain running services)
        * Migration code depending on a cluster-wide state
        * ...
        * Successfully finishing a piece of migration code, should create an entry in /ovs/framework/migration in case it should not be executed again
        *     Eg: /ovs/framework/migration|stats_monkey_integration: True
        """
        MigrationController._logger.info('Preparing out of band migrations...')

        from ovs.dal.lists.servicetypelist import ServiceTypeList
        from ovs.dal.lists.storagedriverlist import StorageDriverList
        from ovs.dal.lists.storagerouterlist import StorageRouterList
        from ovs.dal.lists.vpoollist import VPoolList
        from ovs.extensions.db.arakooninstaller import ArakoonInstaller
        from ovs.extensions.generic.configuration import Configuration
        from ovs.extensions.generic.sshclient import SSHClient
        from ovs_extensions.generic.toolbox import ExtensionsToolbox
        from ovs.extensions.migration.migration.ovsmigrator import ExtensionMigrator
        from ovs.extensions.packages.packagefactory import PackageFactory
        from ovs_extensions.services.interfaces.systemd import Systemd
        from ovs.extensions.services.servicefactory import ServiceFactory
        from ovs.extensions.storageserver.storagedriver import StorageDriverConfiguration
        from ovs.lib.helpers.storagedriver.installer import StorageDriverInstaller

        MigrationController._logger.info('Start out of band migrations...')
        service_manager = ServiceFactory.get_manager()

        sr_client_map = {}
        for storagerouter in StorageRouterList.get_storagerouters():
            sr_client_map[storagerouter.guid] = SSHClient(endpoint=storagerouter.ip,  # This is also triggered during post-update code, when the ovs-watcher-framework service is still down and thus not refreshing the heartbeat, so use the IP instead of the StorageRouter object
                                                          username='******')

        #########################################################
        # Addition of 'ExecReload' for AlbaProxy SystemD services
        if ServiceFactory.get_service_type() == 'systemd':
            changed_clients = set()
            for storagedriver in StorageDriverList.get_storagedrivers():
                root_client = sr_client_map[storagedriver.storagerouter_guid]
                for alba_proxy in storagedriver.alba_proxies:
                    service = alba_proxy.service
                    service_name = 'ovs-{0}'.format(service.name)
                    if not service_manager.has_service(name=service_name, client=root_client):
                        continue
                    if 'ExecReload=' in root_client.file_read(filename='/lib/systemd/system/{0}.service'.format(service_name)):
                        continue

                    try:
                        service_manager.regenerate_service(name=StorageDriverInstaller.SERVICE_TEMPLATE_PROXY, client=root_client, target_name=service_name)
                        changed_clients.add(root_client)
                    except Exception:
                        MigrationController._logger.exception('Error rebuilding service {0}'.format(service_name))
            for root_client in changed_clients:
                root_client.run(['systemctl', 'daemon-reload'])

        ##################################################################
        # Adjustment of open file descriptors for Arakoon services to 8192
        changed_clients = set()
        for storagerouter in StorageRouterList.get_storagerouters():
            root_client = sr_client_map[storagerouter.guid]
            for service_name in service_manager.list_services(client=root_client):
                if not service_name.startswith('ovs-arakoon-'):
                    continue

                if ServiceFactory.get_service_type() == 'systemd':
                    path = '/lib/systemd/system/{0}.service'.format(service_name)
                    check = 'LimitNOFILE=8192'
                else:
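                    # Non-systemd hosts are assumed to use Upstart-style job files under /etc/init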
                    path = '/etc/init/{0}.conf'.format(service_name)
                    check = 'limit nofile 8192 8192'

                if not root_client.file_exists(path):
                    continue
                if check in root_client.file_read(path):
                    continue

                try:
                    service_manager.regenerate_service(name='ovs-arakoon', client=root_client, target_name=service_name)
                    changed_clients.add(root_client)
                    ExtensionsToolbox.edit_version_file(client=root_client,
                                                        package_name='arakoon',
                                                        old_run_file='{0}/{1}.version'.format(ServiceFactory.RUN_FILE_DIR, service_name))
                except Exception:
                    MigrationController._logger.exception('Error rebuilding service {0}'.format(service_name))
        for root_client in changed_clients:
            root_client.run(['systemctl', 'daemon-reload'])

        #############################
        # Migrate to multiple proxies
        for storagedriver in StorageDriverList.get_storagedrivers():
            vpool = storagedriver.vpool
            root_client = sr_client_map[storagedriver.storagerouter_guid]
            for alba_proxy in storagedriver.alba_proxies:
                # Rename alba_proxy service in model
                service = alba_proxy.service
                old_service_name = 'albaproxy_{0}'.format(vpool.name)
                new_service_name = 'albaproxy_{0}_0'.format(vpool.name)
                if old_service_name != service.name:
                    continue
                service.name = new_service_name
                service.save()

                if not service_manager.has_service(name=old_service_name, client=root_client):
                    continue
                old_configuration_key = '/ovs/framework/hosts/{0}/services/{1}'.format(storagedriver.storagerouter.machine_id, old_service_name)
                if not Configuration.exists(key=old_configuration_key):
                    continue

                # Add '-reboot' to alba_proxy services (because of newly created services and removal of old service)
                ExtensionsToolbox.edit_version_file(client=root_client,
                                                    package_name='alba',
                                                    old_run_file='{0}/{1}.version'.format(ServiceFactory.RUN_FILE_DIR, old_service_name),
                                                    new_run_file='{0}/{1}.version'.format(ServiceFactory.RUN_FILE_DIR, new_service_name))

                # Register new service and remove old service
                service_manager.add_service(name=StorageDriverInstaller.SERVICE_TEMPLATE_PROXY,
                                            client=root_client,
                                            params=Configuration.get(old_configuration_key),
                                            target_name='ovs-{0}'.format(new_service_name))

                # Update scrub proxy config
                proxy_config_key = '/ovs/vpools/{0}/proxies/{1}/config/main'.format(vpool.guid, alba_proxy.guid)
                proxy_config = None if Configuration.exists(key=proxy_config_key) is False else Configuration.get(proxy_config_key)
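                # Proxy cache settings are stored as a 2-item list [cache_type, options],
                # e.g. ['none', {}] or ['alba', {'cache_on_read': ..., 'cache_on_write': ...}] (keys as used in the checks below)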
                if proxy_config is not None:
                    fragment_cache = proxy_config.get(StorageDriverConfiguration.CACHE_FRAGMENT, ['none', {}])
                    if fragment_cache[0] == 'alba' and fragment_cache[1].get('cache_on_write') is True:  # Accelerated ALBA configured
                        fragment_cache_scrub_info = copy.deepcopy(fragment_cache)
                        fragment_cache_scrub_info[1]['cache_on_read'] = False
                        proxy_scrub_config_key = '/ovs/vpools/{0}/proxies/scrub/generic_scrub'.format(vpool.guid)
                        proxy_scrub_config = None if Configuration.exists(key=proxy_scrub_config_key) is False else Configuration.get(proxy_scrub_config_key)
                        if proxy_scrub_config is not None and proxy_scrub_config[StorageDriverConfiguration.CACHE_FRAGMENT] == ['none']:
                            proxy_scrub_config[StorageDriverConfiguration.CACHE_FRAGMENT] = fragment_cache_scrub_info
                            Configuration.set(key=proxy_scrub_config_key, value=proxy_scrub_config)

            # Update 'backend_connection_manager' section
            changes = False
            storagedriver_config = StorageDriverConfiguration(vpool.guid, storagedriver.storagedriver_id)
            if 'backend_connection_manager' not in storagedriver_config.configuration:
                continue

            current_config = storagedriver_config.configuration['backend_connection_manager']
            if current_config.get('backend_type') != 'MULTI':
                changes = True
                backend_connection_manager = {'backend_type': 'MULTI'}
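                # In the 'MULTI' layout each proxy gets its own numbered sub-section ('0', '1', ...),
                # while the backend_interface_* keys are hoisted to the top level of the section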
                for index, proxy in enumerate(sorted(storagedriver.alba_proxies, key=lambda pr: pr.service.ports[0])):
                    backend_connection_manager[str(index)] = copy.deepcopy(current_config)
                    # noinspection PyUnresolvedReferences
                    backend_connection_manager[str(index)]['alba_connection_use_rora'] = True
                    # noinspection PyUnresolvedReferences
                    backend_connection_manager[str(index)]['alba_connection_rora_manifest_cache_capacity'] = 5000
                    # noinspection PyUnresolvedReferences
                    for key, value in backend_connection_manager[str(index)].items():
                        if key.startswith('backend_interface'):
                            backend_connection_manager[key] = value
                            # noinspection PyUnresolvedReferences
                            del backend_connection_manager[str(index)][key]
                for key, value in {'backend_interface_retries_on_error': 5,
                                   'backend_interface_retry_interval_secs': 1,
                                   'backend_interface_retry_backoff_multiplier': 2.0}.iteritems():
                    if key not in backend_connection_manager:
                        backend_connection_manager[key] = value
            else:
                backend_connection_manager = current_config
                for value in backend_connection_manager.values():
                    if isinstance(value, dict):
                        for key, val in value.items():
                            if key.startswith('backend_interface'):
                                backend_connection_manager[key] = val
                                changes = True
                                del value[key]
                for key, value in {'backend_interface_retries_on_error': 5,
                                   'backend_interface_retry_interval_secs': 1,
                                   'backend_interface_retry_backoff_multiplier': 2.0}.iteritems():
                    if key not in backend_connection_manager:
                        changes = True
                        backend_connection_manager[key] = value

            if changes is True:
                storagedriver_config.clear_backend_connection_manager()
                storagedriver_config.configure_backend_connection_manager(**backend_connection_manager)
                storagedriver_config.save(root_client)

                # Add '-reboot' to volumedriver services (because of updated 'backend_connection_manager' section)
                ExtensionsToolbox.edit_version_file(client=root_client,
                                                    package_name='volumedriver',
                                                    old_run_file='{0}/{1}.version'.format(ServiceFactory.RUN_FILE_DIR, 'volumedriver_{0}'.format(vpool.name)))
                if service_manager.__class__ == Systemd:
                    root_client.run(['systemctl', 'daemon-reload'])

        ########################################
        # Update metadata_store_bits information
        vpools = VPoolList.get_vpools()
        for vpool in vpools:
            bits = None
            for storagedriver in vpool.storagedrivers:
                key = '/ovs/framework/hosts/{0}/services/volumedriver_{1}'.format(storagedriver.storagerouter.machine_id, vpool.name)
                if Configuration.exists(key=key) and 'METADATASTORE_BITS' not in Configuration.get(key=key):
                    if bits is None:
                        entries = service_manager.extract_from_service_file(name='ovs-volumedriver_{0}'.format(vpool.name),
                                                                            client=sr_client_map[storagedriver.storagerouter_guid],
                                                                            entries=['METADATASTORE_BITS='])
                        if len(entries) == 1:
                            bits = entries[0].split('=')[-1]
                            bits = int(bits) if bits.isdigit() else 5
                    if bits is not None:
                        try:
                            content = Configuration.get(key=key)
                            content['METADATASTORE_BITS'] = bits
                            Configuration.set(key=key, value=content)
                        except:
                            MigrationController._logger.exception('Error updating volumedriver info for vPool {0} on StorageRouter {1}'.format(vpool.name, storagedriver.storagerouter.name))

            if bits is not None:
                vpool.metadata_store_bits = bits
                vpool.save()

        #####################################
        # Update the vPool metadata structure
        def _update_metadata_structure(metadata):
            metadata = copy.deepcopy(metadata)
            cache_structure = {'read': False,
                               'write': False,
                               'is_backend': False,
                               'quota': None,
                               'backend_info': {'name': None,  # Will be filled in when is_backend is true
                                                'backend_guid': None,
                                                'alba_backend_guid': None,
                                                'policies': None,
                                                'preset': None,
                                                'arakoon_config': None,
                                                'connection_info': {'client_id': None,
                                                                    'client_secret': None,
                                                                    'host': None,
                                                                    'port': None,
                                                                    'local': None}}
                               }
            structure_map = {StorageDriverConfiguration.CACHE_BLOCK: {'read': 'block_cache_on_read',
                                                                      'write': 'block_cache_on_write',
                                                                      'quota': 'quota_bc',
                                                                      'backend_prefix': 'backend_bc_{0}'},
                             StorageDriverConfiguration.CACHE_FRAGMENT: {'read': 'fragment_cache_on_read',
                                                                         'write': 'fragment_cache_on_write',
                                                                         'quota': 'quota_fc',
                                                                         'backend_prefix': 'backend_aa_{0}'}}
            if 'arakoon_config' in metadata['backend']:  # Arakoon config should be placed under the backend info
                metadata['backend']['backend_info']['arakoon_config'] = metadata['backend'].pop('arakoon_config')
            if 'connection_info' in metadata['backend']:  # Connection info should be placed under the backend info
                metadata['backend']['backend_info']['connection_info'] = metadata['backend'].pop('connection_info')
            if 'caching_info' not in metadata:  # Caching info is the new key
                would_be_caching_info = {}
                metadata['caching_info'] = would_be_caching_info
                # Extract all caching data for every storagerouter
                current_caching_info = metadata['backend'].pop('caching_info')  # Pop to mutate metadata
                for storagerouter_guid in current_caching_info.iterkeys():
                    current_cache_data = current_caching_info[storagerouter_guid]
                    storagerouter_caching_info = {}
                    would_be_caching_info[storagerouter_guid] = storagerouter_caching_info
                    for cache_type, cache_type_mapping in structure_map.iteritems():
                        new_cache_structure = copy.deepcopy(cache_structure)
                        storagerouter_caching_info[cache_type] = new_cache_structure
                        for new_structure_key, old_structure_key in cache_type_mapping.iteritems():
                            if new_structure_key == 'backend_prefix':
                                # Get possible backend related info
                                metadata_key = old_structure_key.format(storagerouter_guid)
                                if metadata_key not in metadata:
                                    continue
                                backend_data = metadata.pop(metadata_key)  # Pop to mutate metadata
                                new_cache_structure['is_backend'] = True
                                # Copy over the old data
                                new_cache_structure['backend_info']['arakoon_config'] = backend_data['arakoon_config']
                                new_cache_structure['backend_info'].update(backend_data['backend_info'])
                                new_cache_structure['backend_info']['connection_info'].update(backend_data['connection_info'])
                            else:
                                new_cache_structure[new_structure_key] = current_cache_data.get(old_structure_key)
            return metadata
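
        # The helper above remaps the old flat cache keys into the new 'caching_info' structure, e.g. (sketch):
        #   old: metadata['backend']['caching_info'][<storagerouter_guid>]['fragment_cache_on_read']
        #   new: metadata['caching_info'][<storagerouter_guid>][StorageDriverConfiguration.CACHE_FRAGMENT]['read']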

        vpools = VPoolList.get_vpools()
        for vpool in vpools:
            try:
                new_metadata = _update_metadata_structure(vpool.metadata)
                vpool.metadata = new_metadata
                vpool.save()
            except KeyError:
                MigrationController._logger.exception('Exceptions occurred when updating the metadata for vPool {0}'.format(vpool.name))

        ##############################################
        # Always use indent=4 during Configuration set
        def _resave_all_config_entries(config_path='/ovs'):
            """
            Recursive functions which checks every config management key if its a directory or not.
            If not a directory, we retrieve the config and just save it again using the new indentation logic
            """
            for item in Configuration.list(config_path):
                new_path = config_path + '/' + item
                MigrationController._logger.debug('Re-saving configuration key {0}'.format(new_path))
                if Configuration.dir_exists(new_path) is True:
                    _resave_all_config_entries(config_path=new_path)
                else:
                    try:
                        _config = Configuration.get(new_path)
                        Configuration.set(new_path, _config)
                    except Exception:
                        _config = Configuration.get(new_path, raw=True)
                        Configuration.set(new_path, _config, raw=True)
        if ExtensionMigrator.THIS_VERSION <= 13:  # There is no way of checking whether this new indentation logic has been applied, so we only perform this for version 13 and lower
            MigrationController._logger.info('Re-saving every configuration setting with new indentation rules')
            _resave_all_config_entries()

        ############################
        # Update some default values
        def _update_manifest_cache_size(_proxy_config_key):
            updated = False
            manifest_cache_size = 500 * 1024 * 1024
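            # 500 MiB, the default size enforced by this migration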
            if Configuration.exists(key=_proxy_config_key):
                _proxy_config = Configuration.get(key=_proxy_config_key)
                for cache_type in [StorageDriverConfiguration.CACHE_BLOCK, StorageDriverConfiguration.CACHE_FRAGMENT]:
                    if cache_type in _proxy_config and _proxy_config[cache_type][0] == 'alba':
                        if _proxy_config[cache_type][1]['manifest_cache_size'] != manifest_cache_size:
                            updated = True
                            _proxy_config[cache_type][1]['manifest_cache_size'] = manifest_cache_size
                if _proxy_config['manifest_cache_size'] != manifest_cache_size:
                    updated = True
                    _proxy_config['manifest_cache_size'] = manifest_cache_size

                if updated is True:
                    Configuration.set(key=_proxy_config_key, value=_proxy_config)
            return updated

        for storagedriver in StorageDriverList.get_storagedrivers():
            try:
                vpool = storagedriver.vpool
                root_client = sr_client_map[storagedriver.storagerouter_guid]
                _update_manifest_cache_size('/ovs/vpools/{0}/proxies/scrub/generic_scrub'.format(vpool.guid))  # Generic scrub proxy is deployed every time scrubbing kicks in, so no need to restart these services
                for alba_proxy in storagedriver.alba_proxies:
                    if _update_manifest_cache_size('/ovs/vpools/{0}/proxies/{1}/config/main'.format(vpool.guid, alba_proxy.guid)) is True:
                        # Add '-reboot' to alba_proxy services (because of the updated manifest_cache_size)
                        ExtensionsToolbox.edit_version_file(client=root_client,
                                                            package_name='alba',
                                                            old_run_file='{0}/{1}.version'.format(ServiceFactory.RUN_FILE_DIR, alba_proxy.service.name))

                # Update 'backend_connection_manager' section
                changes = False
                storagedriver_config = StorageDriverConfiguration(vpool.guid, storagedriver.storagedriver_id)
                if 'backend_connection_manager' not in storagedriver_config.configuration:
                    continue

                current_config = storagedriver_config.configuration['backend_connection_manager']
                for key, value in current_config.iteritems():
                    if key.isdigit() is True:
                        if value.get('alba_connection_asd_connection_pool_capacity') != 10:
                            changes = True
                            value['alba_connection_asd_connection_pool_capacity'] = 10
                        if value.get('alba_connection_timeout') != 30:
                            changes = True
                            value['alba_connection_timeout'] = 30
                        if value.get('alba_connection_rora_manifest_cache_capacity') != 25000:
                            changes = True
                            value['alba_connection_rora_manifest_cache_capacity'] = 25000

                if changes is True:
                    storagedriver_config.clear_backend_connection_manager()
                    storagedriver_config.configure_backend_connection_manager(**current_config)
                    storagedriver_config.save(root_client)

                    # Add '-reboot' to volumedriver services (because of updated 'backend_connection_manager' section)
                    ExtensionsToolbox.edit_version_file(client=root_client,
                                                        package_name='volumedriver',
                                                        old_run_file='{0}/{1}.version'.format(ServiceFactory.RUN_FILE_DIR, 'volumedriver_{0}'.format(vpool.name)))
            except Exception:
                MigrationController._logger.exception('Updating default configuration values failed for StorageDriver {0}'.format(storagedriver.storagedriver_id))

        ####################################################
        # Adding proxy fail fast as env variable for proxies
        changed_clients = set()
        for storagerouter in StorageRouterList.get_storagerouters():
            root_client = sr_client_map[storagerouter.guid]
            for service_name in service_manager.list_services(client=root_client):
                if not service_name.startswith('ovs-albaproxy_'):
                    continue

                if ServiceFactory.get_service_type() == 'systemd':
                    path = '/lib/systemd/system/{0}.service'.format(service_name)
                    check = 'Environment=ALBA_FAIL_FAST=true'
                else:
                    path = '/etc/init/{0}.conf'.format(service_name)
                    check = 'env ALBA_FAIL_FAST=true'

                if not root_client.file_exists(path):
                    continue
                if check in root_client.file_read(path):
                    continue

                try:
                    service_manager.regenerate_service(name=StorageDriverInstaller.SERVICE_TEMPLATE_PROXY, client=root_client, target_name=service_name)
                    changed_clients.add(root_client)
                    ExtensionsToolbox.edit_version_file(client=root_client,
                                                        package_name='alba',
                                                        old_run_file='{0}/{1}.version'.format(ServiceFactory.RUN_FILE_DIR, service_name))
                except Exception:
                    MigrationController._logger.exception('Error rebuilding service {0}'.format(service_name))
        for root_client in changed_clients:
            root_client.run(['systemctl', 'daemon-reload'])

        ######################################
        # Integration of stats monkey (2.10.2)
        if Configuration.get(key='/ovs/framework/migration|stats_monkey_integration', default=False) is False:
            try:
                # Get content of old key into new key
                old_stats_monkey_key = '/statsmonkey/statsmonkey'
                if Configuration.exists(key=old_stats_monkey_key) is True:
                    Configuration.set(key='/ovs/framework/monitoring/stats_monkey', value=Configuration.get(key=old_stats_monkey_key))
                    Configuration.delete(key=old_stats_monkey_key)

                # Make sure to disable the stats monkey by default or take over the current schedule if it was configured manually before
                celery_key = '/ovs/framework/scheduling/celery'
                current_value = None
                scheduling_config = Configuration.get(key=celery_key, default={})
                if 'statsmonkey.run_all_stats' in scheduling_config:  # Old celery task name of the stats monkey
                    current_value = scheduling_config.pop('statsmonkey.run_all_stats')
                scheduling_config['ovs.stats_monkey.run_all'] = current_value
                scheduling_config['alba.stats_monkey.run_all'] = current_value
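                # current_value stays None (stats monkey disabled) unless a schedule was configured manually before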
                Configuration.set(key=celery_key, value=scheduling_config)

                support_key = '/ovs/framework/support'
                support_config = Configuration.get(key=support_key)
                support_config['support_agent'] = support_config.pop('enabled', True)
                support_config['remote_access'] = support_config.pop('enablesupport', False)
                Configuration.set(key=support_key, value=support_config)

                # Make sure that once this has finished, it never runs again by setting this key to True
                Configuration.set(key='/ovs/framework/migration|stats_monkey_integration', value=True)
            except Exception:
                MigrationController._logger.exception('Integration of stats monkey failed')

        ######################################################
        # Write away cluster ID to a file for back-up purposes
        try:
            cluster_id = Configuration.get(key='/ovs/framework/cluster_id', default=None)
            with open(Configuration.CONFIG_STORE_LOCATION, 'r') as config_file:
                config = json.load(config_file)
            if cluster_id is not None and config.get('cluster_id', None) is None:
                config['cluster_id'] = cluster_id
                with open(Configuration.CONFIG_STORE_LOCATION, 'w') as config_file:
                    json.dump(config, config_file, indent=4)
        except Exception:
            MigrationController._logger.exception('Writing cluster id to a file failed.')

        #########################################################
        # Additional string formatting in Arakoon services (2.11)
        try:
            if Configuration.get(key='/ovs/framework/migration|arakoon_service_update', default=False) is False:
                arakoon_service_names = [ArakoonInstaller.get_service_name_for_cluster(cluster_name=cluster_name) for cluster_name in Configuration.list(key='ovs/arakoon')]
                for storagerouter in StorageRouterList.get_masters():
                    for service_name in arakoon_service_names:
                        config_key = ServiceFactory.SERVICE_CONFIG_KEY.format(storagerouter.machine_id, service_name)
                        if Configuration.exists(key=config_key):
                            config = Configuration.get(key=config_key)
                            config['RUN_FILE_DIR'] = ServiceFactory.RUN_FILE_DIR
                            config['ARAKOON_PKG_NAME'] = PackageFactory.PKG_ARAKOON
                            config['ARAKOON_VERSION_CMD'] = PackageFactory.VERSION_CMD_ARAKOON
                            Configuration.set(key=config_key, value=config)
                # Make sure that once this has finished, it never runs again by setting this key to True
                Configuration.set(key='/ovs/framework/migration|arakoon_service_update', value=True)
        except Exception:
            MigrationController._logger.exception('Updating the string formatting for the Arakoon services failed')

        ############################################################
        # Additional string formatting in ALBA proxy services (2.11)
        changed_clients = set()
        try:
            if Configuration.get(key='/ovs/framework/migration|alba_proxy_service_update', default=False) is False:
                alba_pkg_name, alba_version_cmd = PackageFactory.get_package_and_version_cmd_for(component=PackageFactory.COMP_ALBA)
                for service in ServiceTypeList.get_by_name('AlbaProxy').services:
                    root_client = sr_client_map[service.storagerouter_guid]
                    config_key = ServiceFactory.SERVICE_CONFIG_KEY.format(service.storagerouter.machine_id, service.name)
                    if Configuration.exists(key=config_key):
                        config = Configuration.get(key=config_key)
                        config['RUN_FILE_DIR'] = ServiceFactory.RUN_FILE_DIR
                        config['ALBA_PKG_NAME'] = alba_pkg_name
                        config['ALBA_VERSION_CMD'] = alba_version_cmd
                        Configuration.set(key=config_key, value=config)
                        service_manager.regenerate_service(name=StorageDriverInstaller.SERVICE_TEMPLATE_PROXY,
                                                           client=root_client,
                                                           target_name='ovs-{0}'.format(service.name))
                        changed_clients.add(root_client)

                # Make sure that once this has finished, it never runs again by setting this key to True
                Configuration.set(key='/ovs/framework/migration|alba_proxy_service_update', value=True)
        except Exception:
            MigrationController._logger.exception('Updating the string formatting for the ALBA proxy services failed')

        ############################################################
        # Additional string formatting in DTL/VOLDRV services (2.11)
        try:
            if Configuration.get(key='/ovs/framework/migration|voldrv_service_update', default=False) is False:
                sd_pkg_name, sd_version_cmd = PackageFactory.get_package_and_version_cmd_for(component=PackageFactory.COMP_SD)
                for vpool in VPoolList.get_vpools():
                    for storagedriver in vpool.storagedrivers:
                        root_client = sr_client_map[storagedriver.storagerouter_guid]
                        for entry in ['dtl', 'volumedriver']:
                            service_name = '{0}_{1}'.format(entry, vpool.name)
                            service_template = StorageDriverInstaller.SERVICE_TEMPLATE_DTL if entry == 'dtl' else StorageDriverInstaller.SERVICE_TEMPLATE_SD
                            config_key = ServiceFactory.SERVICE_CONFIG_KEY.format(storagedriver.storagerouter.machine_id, service_name)
                            if Configuration.exists(key=config_key):
                                config = Configuration.get(key=config_key)
                                config['RUN_FILE_DIR'] = ServiceFactory.RUN_FILE_DIR
                                config['VOLDRV_PKG_NAME'] = sd_pkg_name
                                config['VOLDRV_VERSION_CMD'] = sd_version_cmd
                                Configuration.set(key=config_key, value=config)
                                service_manager.regenerate_service(name=service_template,
                                                                   client=root_client,
                                                                   target_name='ovs-{0}'.format(service_name))
                                changed_clients.add(root_client)

                # Make sure that once this has finished, it never runs again by setting this key to True
                Configuration.set(key='/ovs/framework/migration|voldrv_service_update', value=True)
        except Exception:
            MigrationController._logger.exception('Updating the string formatting for the DTL/Volumedriver services failed')

        #######################################################
        # Storing actual package name in version files (2.11.0) (https://github.com/openvstorage/framework/issues/1876)
        if Configuration.get(key='/ovs/framework/migration|actual_package_name_in_version_file', default=False) is False:
            try:
                voldrv_pkg_name, _ = PackageFactory.get_package_and_version_cmd_for(component=PackageFactory.COMP_SD)
                for storagerouter in StorageRouterList.get_storagerouters():
                    root_client = sr_client_map.get(storagerouter.guid)
                    if root_client is None:
                        continue

                    for file_name in root_client.file_list(directory=ServiceFactory.RUN_FILE_DIR):
                        if not file_name.endswith('.version'):
                            continue
                        file_path = '{0}/{1}'.format(ServiceFactory.RUN_FILE_DIR, file_name)
                        contents = root_client.file_read(filename=file_path)
                        regenerate = False
                        if voldrv_pkg_name == PackageFactory.PKG_VOLDRV_SERVER:
                            if 'volumedriver-server' in contents:
                                regenerate = True
                                contents = contents.replace('volumedriver-server', PackageFactory.PKG_VOLDRV_SERVER)
                                root_client.file_write(filename=file_path, contents=contents)
                        elif voldrv_pkg_name == PackageFactory.PKG_VOLDRV_SERVER_EE:
                            if 'volumedriver-server' in contents or PackageFactory.PKG_VOLDRV_SERVER in contents:
                                regenerate = True
                                contents = contents.replace('volumedriver-server', PackageFactory.PKG_VOLDRV_SERVER_EE)
                                contents = contents.replace(PackageFactory.PKG_VOLDRV_SERVER, PackageFactory.PKG_VOLDRV_SERVER_EE)
                                root_client.file_write(filename=file_path, contents=contents)

                        if regenerate is True:
                            service_manager.regenerate_service(name=StorageDriverInstaller.SERVICE_TEMPLATE_DTL if file_name.startswith('dtl') else StorageDriverInstaller.SERVICE_TEMPLATE_SD,
                                                               client=root_client,
                                                               target_name='ovs-{0}'.format(file_name.split('.')[0]))  # Leave out .version
                            changed_clients.add(root_client)
                Configuration.set(key='/ovs/framework/migration|actual_package_name_in_version_file', value=True)
            except Exception:
                MigrationController._logger.exception('Updating actual package name for version files failed')

        for root_client in changed_clients:
            try:
                root_client.run(['systemctl', 'daemon-reload'])
            except Exception:
                MigrationController._logger.exception('Executing command "systemctl daemon-reload" failed')

        #########################################################
        # Addition of Environment=OCAMLRUNPARAM='b,a=1,s=4096k,O=50' for AlbaProxy SystemD services
        if ServiceFactory.get_service_type() == 'systemd':
            changed_clients = set()
            for storagedriver in StorageDriverList.get_storagedrivers():
                root_client = sr_client_map[storagedriver.storagerouter_guid]
                for alba_proxy in storagedriver.alba_proxies:
                    service = alba_proxy.service
                    service_name = 'ovs-{0}'.format(service.name)
                    if not service_manager.has_service(name=service_name, client=root_client):
                        continue
                    if "Environment=OCAMLRUNPARAM='b,a=1,s=4096k,O=50" in root_client.file_read(filename='/lib/systemd/system/{0}.service'.format(service_name)):
                        continue
                    try:
                        service_manager.regenerate_service(name='ovs-albaproxy', client=root_client, target_name=service_name)
                        changed_clients.add(root_client)
                    except Exception:
                        MigrationController._logger.exception('Error rebuilding service {0}'.format(service_name))
            for root_client in changed_clients:
                root_client.run(['systemctl', 'daemon-reload'])
        #########################################################
        # Addition of Environment=OCAMLRUNPARAM='b,a=1,s=4096k,O=50' for Arakoon SystemD services
        if ServiceFactory.get_service_type() == 'systemd':
            changed_clients = set()
            for storagerouter in StorageRouterList.get_storagerouters():
                root_client = sr_client_map[storagerouter.guid]
                for service_name in service_manager.list_services(client=root_client):
                    if not service_name.startswith('ovs-arakoon-'):
                        continue
                    if not service_manager.has_service(name=service_name, client=root_client):
                        continue
                    if "Environment=OCAMLRUNPARAM='b,a=1,s=4096k,O=50" in root_client.file_read(filename='/lib/systemd/system/{0}.service'.format(service_name)):
                        continue
                    try:
                        service_manager.regenerate_service(name='ovs-arakoon', client=root_client, target_name=service_name)
                        changed_clients.add(root_client)
                    except Exception:
                        MigrationController._logger.exception('Error rebuilding service {0}'.format(service_name))
            for root_client in changed_clients:
                root_client.run(['systemctl', 'daemon-reload'])

        MigrationController._logger.info('Finished out of band migrations')
Example #2
class StorageDriverController(object):
    """
    Contains all BLL related to Storage Drivers
    """
    _logger = LogHandler.get('lib', name='storagedriver')

    @staticmethod
    @celery.task(name='ovs.storagedriver.mark_offline')
    def mark_offline(storagerouter_guid):
        """
        Marks all StorageDrivers on this StorageRouter offline
        :param storagerouter_guid: Guid of the Storage Router
        :type storagerouter_guid: str
        :return: None
        """
        for storagedriver in StorageRouter(storagerouter_guid).storagedrivers:
            vpool = storagedriver.vpool
            if len(vpool.storagedrivers) > 1:
                storagedriver_client = StorageDriverClient.load(vpool, excluded_storagedrivers=[storagedriver])
                storagedriver_client.mark_node_offline(str(storagedriver.storagedriver_id))

    @staticmethod
    @celery.task(name='ovs.storagedriver.volumedriver_error')
    @log('VOLUMEDRIVER_TASK')
    def volumedriver_error(code, volume_id):
        """
        Handles error messages/events from the volumedriver
        :param code: Volumedriver error code
        :type code: int
        :param volume_id: Name of the volume throwing the error
        :type volume_id: str
        :return: None
        """
        if code == VolumeDriverEvents.MDSFailover:
            disk = VDiskList.get_vdisk_by_volume_id(volume_id)
            if disk is not None:
                MDSServiceController.ensure_safety(disk)

    @staticmethod
    @add_hooks('setup', 'demote')
    def on_demote(cluster_ip, master_ip, offline_node_ips=None):
        """
        Handles the demote for the StorageDrivers
        :param cluster_ip: IP of the node to demote
        :type cluster_ip: str
        :param master_ip: IP of the master node
        :type master_ip: str
        :param offline_node_ips: IPs of nodes which are offline
        :type offline_node_ips: list
        :return: None
        """
        _ = master_ip
        if offline_node_ips is None:
            offline_node_ips = []
        client = SSHClient(cluster_ip, username='******') if cluster_ip not in offline_node_ips else None
        servicetype = ServiceTypeList.get_by_name(ServiceType.SERVICE_TYPES.ARAKOON)
        current_service = None
        remaining_ips = []
        for service in servicetype.services:
            if service.name == 'arakoon-voldrv' and service.is_internal is True:  # An externally managed arakoon cluster service does not have a storage router
                if service.storagerouter.ip == cluster_ip:
                    current_service = service
                elif service.storagerouter.ip not in offline_node_ips:
                    remaining_ips.append(service.storagerouter.ip)
        if current_service is not None:
            if len(remaining_ips) == 0:
                raise RuntimeError('Could not find any remaining arakoon nodes for the voldrv cluster')
            StorageDriverController._logger.debug('* Shrink StorageDriver cluster')
            cluster_name = str(Configuration.get('/ovs/framework/arakoon_clusters|voldrv'))
            ArakoonInstaller.shrink_cluster(deleted_node_ip=cluster_ip,
                                            remaining_node_ip=remaining_ips[0],
                                            cluster_name=cluster_name,
                                            offline_nodes=offline_node_ips)
            if client is not None and ServiceManager.has_service(current_service.name, client=client) is True:
                ServiceManager.stop_service(current_service.name, client=client)
                ServiceManager.remove_service(current_service.name, client=client)
            ArakoonInstaller.restart_cluster_remove(cluster_name, remaining_ips, filesystem=False)
            current_service.delete()
            StorageDriverController._configure_arakoon_to_volumedriver(cluster_name=cluster_name)

    @staticmethod
    @add_hooks('setup', 'remove')
    def on_remove(cluster_ip, complete_removal):
        """
        Handles the StorageDriver removal part of a node
        :param cluster_ip: IP of the node which is being removed from the cluster
        :type cluster_ip: str
        :param complete_removal: Unused for StorageDriver, used for AlbaController
        :type complete_removal: bool
        :return: None
        """
        _ = complete_removal
        service_name = 'watcher-volumedriver'
        try:
            client = SSHClient(endpoint=cluster_ip, username='******')
            if ServiceManager.has_service(name=service_name, client=client):
                ServiceManager.stop_service(name=service_name, client=client)
                ServiceManager.remove_service(name=service_name, client=client)
        except UnableToConnectException:
            pass

    @staticmethod
    @celery.task(name='ovs.storagedriver.scheduled_voldrv_arakoon_checkup', schedule=Schedule(minute='15', hour='*'))
    def scheduled_voldrv_arakoon_checkup():
        """
        Makes sure the volumedriver arakoon is on all available master nodes
        :return: None
        """
        StorageDriverController._voldrv_arakoon_checkup(False)

    @staticmethod
    @celery.task(name='ovs.storagedriver.manual_voldrv_arakoon_checkup')
    def manual_voldrv_arakoon_checkup():
        """
        Creates a new Arakoon Cluster if required and extends cluster if possible on all available master nodes
        :return: None
        """
        StorageDriverController._voldrv_arakoon_checkup(True)

    @staticmethod
    @ensure_single(task_name='ovs.storagedriver.voldrv_arakoon_checkup')
    def _voldrv_arakoon_checkup(create_cluster):
        def add_service(service_storagerouter, arakoon_ports):
            """
            Add a service to the storage router
            :param service_storagerouter: Storage Router to add the service to
            :type service_storagerouter: StorageRouter
            :param arakoon_ports: Port information
            :type arakoon_ports: list
            :return: The newly created and added service
            :rtype: Service
            """
            new_service = Service()
            new_service.name = service_name
            new_service.type = service_type
            new_service.ports = arakoon_ports
            new_service.storagerouter = service_storagerouter
            new_service.save()
            return new_service

        service_name = 'arakoon-voldrv'
        service_type = ServiceTypeList.get_by_name(ServiceType.SERVICE_TYPES.ARAKOON)

        current_ips = []
        current_services = []
        for service in service_type.services:
            if service.name == service_name:
                current_services.append(service)
                if service.is_internal is True:
                    current_ips.append(service.storagerouter.ip)

        all_sr_ips = [storagerouter.ip for storagerouter in StorageRouterList.get_slaves()]
        available_storagerouters = {}
        for storagerouter in StorageRouterList.get_masters():
            storagerouter.invalidate_dynamics(['partition_config'])
            if len(storagerouter.partition_config[DiskPartition.ROLES.DB]) > 0:
                available_storagerouters[storagerouter] = DiskPartition(storagerouter.partition_config[DiskPartition.ROLES.DB][0])
            all_sr_ips.append(storagerouter.ip)

        if create_cluster is True and len(current_services) == 0:  # Create new cluster
            metadata = ArakoonInstaller.get_unused_arakoon_metadata_and_claim(cluster_type=ServiceType.ARAKOON_CLUSTER_TYPES.SD)
            if metadata is None:  # No externally managed cluster found, so we create one ourselves
                if not available_storagerouters:
                    raise RuntimeError('Could not find any Storage Router with a DB role')

                storagerouter, partition = available_storagerouters.items()[0]
                result = ArakoonInstaller.create_cluster(cluster_name='voldrv',
                                                         cluster_type=ServiceType.ARAKOON_CLUSTER_TYPES.SD,
                                                         ip=storagerouter.ip,
                                                         base_dir=partition.folder,
                                                         filesystem=False)
                ports = [result['client_port'], result['messaging_port']]
                metadata = result['metadata']
                ArakoonInstaller.restart_cluster_add(cluster_name='voldrv',
                                                     current_ips=current_ips,
                                                     new_ip=storagerouter.ip,
                                                     filesystem=False)
                ArakoonInstaller.claim_cluster(cluster_name='voldrv',
                                               master_ip=storagerouter.ip,
                                               filesystem=False,
                                               metadata=metadata)
                current_ips.append(storagerouter.ip)
            else:
                ports = []
                storagerouter = None

            cluster_name = metadata['cluster_name']
            Configuration.set('/ovs/framework/arakoon_clusters|voldrv', cluster_name)
            StorageDriverController._logger.info('Claiming {0} managed arakoon cluster: {1}'.format('externally' if storagerouter is None else 'internally', cluster_name))
            StorageDriverController._configure_arakoon_to_volumedriver(cluster_name=cluster_name)
            current_services.append(add_service(service_storagerouter=storagerouter, arakoon_ports=ports))

        cluster_name = Configuration.get('/ovs/framework/arakoon_clusters').get('voldrv')
        if cluster_name is None:
            return
        metadata = ArakoonInstaller.get_arakoon_metadata_by_cluster_name(cluster_name=cluster_name)
        if 0 < len(current_services) < len(available_storagerouters) and metadata['internal'] is True:
            for storagerouter, partition in available_storagerouters.iteritems():
                if storagerouter.ip in current_ips:
                    continue
                result = ArakoonInstaller.extend_cluster(master_ip=current_services[0].storagerouter.ip,
                                                         new_ip=storagerouter.ip,
                                                         cluster_name=cluster_name,
                                                         base_dir=partition.folder)
                add_service(storagerouter, [result['client_port'], result['messaging_port']])
                current_ips.append(storagerouter.ip)
                ArakoonInstaller.restart_cluster_add(cluster_name=cluster_name,
                                                     current_ips=current_ips,
                                                     new_ip=storagerouter.ip,
                                                     filesystem=False)
            StorageDriverController._configure_arakoon_to_volumedriver(cluster_name=cluster_name)

    @staticmethod
    def _configure_arakoon_to_volumedriver(cluster_name):
        StorageDriverController._logger.info('Update existing vPools')
        config = ArakoonClusterConfig(cluster_id=cluster_name, filesystem=False)
        config.load_config()
        arakoon_nodes = []
        for node in config.nodes:
            arakoon_nodes.append({'host': node.ip,
                                  'port': node.client_port,
                                  'node_id': node.name})
        if Configuration.dir_exists('/ovs/vpools'):
            for vpool_guid in Configuration.list('/ovs/vpools'):
                for storagedriver_id in Configuration.list('/ovs/vpools/{0}/hosts'.format(vpool_guid)):
                    storagedriver_config = StorageDriverConfiguration('storagedriver', vpool_guid, storagedriver_id)
                    storagedriver_config.load()
                    storagedriver_config.configure_volume_registry(vregistry_arakoon_cluster_id=cluster_name,
                                                                   vregistry_arakoon_cluster_nodes=arakoon_nodes)
                    storagedriver_config.configure_distributed_lock_store(dls_type='Arakoon',
                                                                          dls_arakoon_cluster_id=cluster_name,
                                                                          dls_arakoon_cluster_nodes=arakoon_nodes)
                    storagedriver_config.save(reload_config=True)

    @staticmethod
    def add_storagedriverpartition(storagedriver, partition_info):
        """
        Stores a new storagedriver partition object with the correct number
        :param storagedriver: Storagedriver to create the partition for
        :type storagedriver: StorageDriver
        :param partition_info: Partition information containing role, size, sub_role, disk partition and MDS service
        :type partition_info: dict
        :return: Newly created storage driver partition
        :rtype: StorageDriverPartition
        """
        role = partition_info['role']
        size = partition_info.get('size')
        sub_role = partition_info.get('sub_role')
        partition = partition_info['partition']
        mds_service = partition_info.get('mds_service')
        highest_number = 0
        for existing_sdp in storagedriver.partitions:
            if existing_sdp.partition_guid == partition.guid and existing_sdp.role == role and existing_sdp.sub_role == sub_role:
                highest_number = max(existing_sdp.number, highest_number)
        sdp = StorageDriverPartition()
        sdp.role = role
        sdp.size = size
        sdp.number = highest_number + 1
        sdp.sub_role = sub_role
        sdp.partition = partition
        sdp.mds_service = mds_service
        sdp.storagedriver = storagedriver
        sdp.save()
        return sdp
Example #3
class MDSServiceController(object):
    """
    Contains all BLL related to MDSServices
    """
    _logger = LogHandler.get('lib', name='mds')

    storagerouterclient.Logger.setupLogging(
        LogHandler.load_path('storagerouterclient'))
    # noinspection PyArgumentList
    storagerouterclient.Logger.enableLogging()

    @staticmethod
    def prepare_mds_service(storagerouter, vpool, fresh_only, reload_config):
        """
        Prepares an MDS service:
        * Creates the required configuration
        * Sets up the service files

        Assumes the StorageRouter and VPool are already configured with a StorageDriver and that all model-wise
        configuration regarding both is completed.
        :param storagerouter: Storagerouter on which MDS service will be created
        :type storagerouter: StorageRouter

        :param vpool: The vPool for which the MDS service will be created
        :type vpool: VPool

        :param fresh_only: If True and no MDS services currently exist for this vPool on this StorageRouter, a new one will be created
        :type fresh_only: bool

        :param reload_config: If True, the volumedriver's updated configuration will be reloaded
        :type reload_config: bool

        :return: Newly created service
        :rtype: MDSService
        """
        # Fetch service sequence number based on MDS services for current vPool and current storage router
        service_number = -1
        for mds_service in vpool.mds_services:
            if mds_service.service.storagerouter_guid == storagerouter.guid:
                service_number = max(mds_service.number, service_number)

        if fresh_only is True and service_number >= 0:
            return  # There are already one or more MDS services running, aborting

        # VALIDATIONS
        # 1. Find free port based on MDS services for all vPools on current storage router
        client = SSHClient(storagerouter)
        mdsservice_type = ServiceTypeList.get_by_name(
            ServiceType.SERVICE_TYPES.MD_SERVER)
        occupied_ports = []
        for service in mdsservice_type.services:
            if service.storagerouter_guid == storagerouter.guid:
                occupied_ports.extend(service.ports)

        mds_port_range = Configuration.get(
            '/ovs/framework/hosts/{0}/ports|mds'.format(
                System.get_my_machine_id(client)))
        free_ports = System.get_free_ports(selected_range=mds_port_range,
                                           exclude=occupied_ports,
                                           nr=1,
                                           client=client)
        if not free_ports:
            raise RuntimeError(
                'Failed to find an available port on storage router {0} within range {1}'
                .format(storagerouter.name, mds_port_range))
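        # Illustration (assumed values): the port range is read from Configuration, e.g.
        #     Configuration.get('/ovs/framework/hosts/<machine_id>/ports|mds') -> [26300, 26399]
        # and get_free_ports() picks one port from that range which is not yet claimed by another MDS service.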

        # 2. Partition check
        db_partition = None
        for disk in storagerouter.disks:
            for partition in disk.partitions:
                if DiskPartition.ROLES.DB in partition.roles:
                    db_partition = partition
                    break
        if db_partition is None:
            raise RuntimeError(
                'Could not find DB partition on storage router {0}'.format(
                    storagerouter.name))

        # 3. Verify storage driver configured
        storagedrivers = [
            sd for sd in vpool.storagedrivers
            if sd.storagerouter_guid == storagerouter.guid
        ]
        if not storagedrivers:
            raise RuntimeError(
                'Expected to find a configured storagedriver for vpool {0} on storage router {1}'
                .format(vpool.name, storagerouter.name))
        storagedriver = storagedrivers[0]

        # MODEL UPDATES
        # 1. Service
        service_number += 1
        service = Service()
        service.name = 'metadataserver_{0}_{1}'.format(vpool.name,
                                                       service_number)
        service.type = mdsservice_type
        service.ports = [free_ports[0]]
        service.storagerouter = storagerouter
        service.save()
        mds_service = MDSService()
        mds_service.vpool = vpool
        mds_service.number = service_number
        mds_service.service = service
        mds_service.save()

        # 2. Storage driver partitions
        from ovs.lib.storagedriver import StorageDriverController
        StorageDriverController.add_storagedriverpartition(
            storagedriver, {
                'size': None,
                'role': DiskPartition.ROLES.DB,
                'sub_role': StorageDriverPartition.SUBROLE.MDS,
                'partition': db_partition,
                'mds_service': mds_service
            })

        # CONFIGURATIONS
        # 1. Volumedriver
        mds_nodes = []
        # Use dedicated loop variables so the newly created mds_service (returned below) is not shadowed
        for existing_service in mdsservice_type.services:
            if existing_service.storagerouter_guid == storagerouter.guid:
                existing_mds_service = existing_service.mds_service
                if existing_mds_service is not None and existing_mds_service.vpool_guid == vpool.guid:
                    sdp = [sd_partition for sd_partition in existing_mds_service.storagedriver_partitions
                           if sd_partition.role == DiskPartition.ROLES.DB and sd_partition.sub_role == StorageDriverPartition.SUBROLE.MDS][0]
                    mds_nodes.append({'host': existing_service.storagerouter.ip,
                                      'port': existing_service.ports[0],
                                      'db_directory': sdp.path,
                                      'scratch_directory': sdp.path})

        # Generate the correct section in the Storage Driver's configuration
        storagedriver_config = StorageDriverConfiguration(
            'storagedriver', vpool.guid, storagedriver.storagedriver_id)
        storagedriver_config.load()
        storagedriver_config.configure_metadata_server(mds_nodes=mds_nodes)
        storagedriver_config.save(client, reload_config=reload_config)

        return mds_service

    @staticmethod
    def remove_mds_service(mds_service,
                           vpool,
                           reconfigure,
                           allow_offline=False):
        """
        Removes an MDS service
        :param mds_service: The MDS service to remove
        :type mds_service: MDSService

        :param vpool: The vPool for which the MDS service will be removed
        :type vpool: VPool

        :param reconfigure: Indicates whether reconfiguration is required
        :type reconfigure: bool

        :param allow_offline: Indicates whether it is OK for the node whose MDS services are being cleaned up to be offline
        :type allow_offline: bool
        """
        if len(mds_service.vdisks_guids) > 0 and allow_offline is False:
            raise RuntimeError(
                'Cannot remove MDSService that is still serving disks')

        mdsservice_type = ServiceTypeList.get_by_name(
            ServiceType.SERVICE_TYPES.MD_SERVER)

        # Clean up model
        directories_to_clean = []
        for sd_partition in mds_service.storagedriver_partitions:
            directories_to_clean.append(sd_partition.path)
            sd_partition.delete()

        if allow_offline is True:  # Certain vdisks might still be attached to this offline MDS service --> Delete relations
            for junction in mds_service.vdisks:
                junction.delete()

        mds_service.delete()
        mds_service.service.delete()

        storagerouter = mds_service.service.storagerouter
        try:
            client = SSHClient(storagerouter)
            if reconfigure is True:
                # Generate new mds_nodes section
                mds_nodes = []
                for service in mdsservice_type.services:
                    if service.storagerouter_guid == storagerouter.guid:
                        mds_service = service.mds_service
                        if mds_service.vpool_guid == vpool.guid:
                            sdp = [sd_partition for sd_partition in mds_service.storagedriver_partitions
                                   if sd_partition.role == DiskPartition.ROLES.DB and sd_partition.sub_role == StorageDriverPartition.SUBROLE.MDS][0]
                            mds_nodes.append({'host': service.storagerouter.ip,
                                              'port': service.ports[0],
                                              'db_directory': sdp.path,
                                              'scratch_directory': sdp.path})

                # Generate the correct section in the Storage Driver's configuration
                storagedriver = [
                    sd for sd in storagerouter.storagedrivers
                    if sd.vpool_guid == vpool.guid
                ][0]
                storagedriver_config = StorageDriverConfiguration(
                    'storagedriver', vpool.guid,
                    storagedriver.storagedriver_id)
                storagedriver_config.load()
                storagedriver_config.configure_metadata_server(
                    mds_nodes=mds_nodes)
                storagedriver_config.save(client, reload_config=reconfigure)

            tries = 5
            while tries > 0:
                try:
                    root_client = SSHClient(storagerouter, username='******')
                    root_client.dir_delete(directories=directories_to_clean,
                                           follow_symlinks=True)
                    for dir_name in directories_to_clean:
                        MDSServiceController._logger.debug(
                            'Recursively removed {0}'.format(dir_name))
                    break
                except Exception:
                    MDSServiceController._logger.debug(
                        'Waiting for the MDS service to go down...')
                    time.sleep(5)
                    tries -= 1
                    if tries == 0:
                        raise
        except UnableToConnectException:
            if allow_offline is True:
                MDSServiceController._logger.info(
                    'Allowed offline node during mds service removal')
            else:
                raise

    @staticmethod
    def sync_vdisk_to_reality(vdisk):
        """
        Syncs a vdisk to reality (except hypervisor)
        :param vdisk: vDisk to synchronize
        :type vdisk: VDisk

        :return: None
        """
        vdisk.reload_client('storagedriver')
        vdisk.invalidate_dynamics(['info'])
        config = vdisk.info['metadata_backend_config']
        config_dict = {}
        for item in config:
            if item['ip'] not in config_dict:
                config_dict[item['ip']] = []
            config_dict[item['ip']].append(item['port'])
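        # Illustration (assumed IPs/ports, not from the original code): 'metadata_backend_config' is an
        # ordered list such as
        #     [{'ip': '10.100.1.1', 'port': 26300},   # master
        #      {'ip': '10.100.1.2', 'port': 26301}]   # slave(s)
        # which the loop above flattens into config_dict = {'10.100.1.1': [26300], '10.100.1.2': [26301]}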
        mds_dict = {}
        for junction in vdisk.mds_services:
            service = junction.mds_service.service
            storagerouter = service.storagerouter
            if config[0]['ip'] == storagerouter.ip and config[0]['port'] == service.ports[0]:
                junction.is_master = True
                junction.save()
                if storagerouter.ip not in mds_dict:
                    mds_dict[storagerouter.ip] = []
                mds_dict[storagerouter.ip].append(service.ports[0])
            elif storagerouter.ip in config_dict and service.ports[0] in config_dict[storagerouter.ip]:
                junction.is_master = False
                junction.save()
                if storagerouter.ip not in mds_dict:
                    mds_dict[storagerouter.ip] = []
                mds_dict[storagerouter.ip].append(service.ports[0])
            else:
                junction.delete()
        for ip, ports in config_dict.iteritems():
            for port in ports:
                if ip not in mds_dict or port not in mds_dict[ip]:
                    service = ServiceList.get_by_ip_ports(ip, [port])
                    if service is not None:
                        mds_service_vdisk = MDSServiceVDisk()
                        mds_service_vdisk.vdisk = vdisk
                        mds_service_vdisk.mds_service = service.mds_service
                        mds_service_vdisk.is_master = (config[0]['ip'] == service.storagerouter.ip and
                                                       config[0]['port'] == service.ports[0])
                        mds_service_vdisk.save()

    @staticmethod
    def ensure_safety(vdisk, excluded_storagerouters=None):
        """
        Ensures (or tries to ensure) the safety of a given vdisk (except hypervisor).
        Assumptions:
        * A local overloaded master is better than a non-local non-overloaded master
        * Prefer master/services to be on different hosts, a subsequent slave on the same node doesn't add safety
        * Don't actively overload services (e.g. configure an MDS as slave causing it to get overloaded)
        * Too much safety is not wanted (it adds load to nodes while not being required)
        :param vdisk: vDisk to calculate a new safety for
        :type vdisk: VDisk

        :param excluded_storagerouters: StorageRouters to leave out of the calculation (e.g. when one is down or unavailable)
        :type excluded_storagerouters: list

        :return: None
        """
        def _add_suitable_nodes(_importance, _safety):
            if len(nodes) < _safety:
                for local_load in sorted(all_info_dict[_importance]['loads']):
                    for local_service in all_info_dict[_importance]['loads'][local_load]:
                        if len(nodes) < _safety and local_service.storagerouter.ip not in nodes:
                            try:
                                SSHClient(local_service.storagerouter)
                                new_services.append(local_service)
                                nodes.add(local_service.storagerouter.ip)
                            except UnableToConnectException:
                                # Log the unreachable StorageRouter itself (local_service), not the outer loop variable
                                MDSServiceController._logger.debug(
                                    'MDS safety: vDisk {0}: Skipping storagerouter with IP {1} as it is unreachable'
                                    .format(vdisk.guid, local_service.storagerouter.ip))
            return nodes, new_services

        MDSServiceController._logger.debug(
            'MDS safety: vDisk {0}: Start checkup for virtual disk {1}'.format(
                vdisk.guid, vdisk.name))
        tlogs = Configuration.get('/ovs/framework/storagedriver|mds_tlogs')
        safety = Configuration.get('/ovs/framework/storagedriver|mds_safety')
        max_load = Configuration.get(
            '/ovs/framework/storagedriver|mds_maxload')

        ######################
        # GATHER INFORMATION #
        ######################
        vdisk.reload_client('storagedriver')
        vdisk.reload_client('objectregistry')

        vdisk.invalidate_dynamics(['storagedriver_id', 'storagerouter_guid'])
        if vdisk.storagerouter_guid is None:
            raise SRCObjectNotFoundException(
                'Cannot ensure MDS safety for vDisk {0} with guid {1} because vDisk is not attached to any Storage Router'
                .format(vdisk.name, vdisk.guid))

        if excluded_storagerouters is None:
            excluded_storagerouters = []

        # Sorted was added merely for unittests, because they rely on specific order of services and their ports
        # Default sorting behavior for relations used to be based on order in which relations were added
        # Now sorting is based on guid (DAL speedup changes)
        nodes = set()
        services = sorted([
            mds_service.service for mds_service in vdisk.vpool.mds_services
            if mds_service.service.storagerouter not in excluded_storagerouters
        ],
                          key=lambda k: k.ports)
        service_per_key = {}
        for service in services:
            nodes.add(service.storagerouter.ip)
            service_per_key['{0}:{1}'.format(service.storagerouter.ip,
                                             service.ports[0])] = service

        # Create a pool of StorageRouters being a part of the primary and secondary domains of this Storage Router
        vdisk_storagerouter = StorageRouter(vdisk.storagerouter_guid)
        primary_domains = [
            junction.domain for junction in vdisk_storagerouter.domains
            if junction.backup is False
        ]
        secondary_domains = [
            junction.domain for junction in vdisk_storagerouter.domains
            if junction.backup is True
        ]
        primary_storagerouters = set()
        secondary_storagerouters = set()
        for domain in primary_domains:
            primary_storagerouters.update(
                StorageRouterList.get_primary_storagerouters_for_domain(
                    domain))
        for domain in secondary_domains:
            secondary_storagerouters.update(
                StorageRouterList.get_primary_storagerouters_for_domain(
                    domain))

        # In case no domains have been configured
        if len(primary_storagerouters) == 0:
            primary_storagerouters = set(
                StorageRouterList.get_storagerouters())

        if vdisk_storagerouter not in primary_storagerouters or vdisk_storagerouter in secondary_storagerouters:
            raise ValueError(
                'StorageRouter {0} for vDisk {1} should be part of the primary domains and NOT be part of the secondary domains'
                .format(vdisk_storagerouter.name, vdisk.name))

        # Remove all storagerouters from secondary which are present in primary
        secondary_storagerouters = secondary_storagerouters.difference(
            primary_storagerouters)

        ###################################
        # VERIFY RECONFIGURATION REQUIRED #
        ###################################
        vdisk.invalidate_dynamics(['info'])
        configs = vdisk.info[
            'metadata_backend_config']  # Ordered MASTER, SLAVE (secondary domain of master)
        master_service = None
        reconfigure_reasons = []
        if len(configs) > 0:
            config = configs.pop(0)
            config_key = '{0}:{1}'.format(config['ip'], config['port'])
            master_service = service_per_key.get(config_key)
            if master_service is None:
                reconfigure_reasons.append(
                    'Master ({0}:{1}) cannot be used anymore'.format(
                        config['ip'], config['port']))
        slave_services = []
        for config in configs:
            config_key = '{0}:{1}'.format(config['ip'], config['port'])
            if config_key in service_per_key:
                slave_services.append(service_per_key[config_key])
            else:
                reconfigure_reasons.append(
                    'Slave ({0}:{1}) cannot be used anymore'.format(
                        config['ip'], config['port']))

        # If MDS already in use, take current load, else take next load
        all_info_dict = {
            'primary': {
                'used': [],
                'loads': {},
                'available': []
            },
            'secondary': {
                'used': [],
                'loads': {},
                'available': []
            }
        }
        services_load = {}
        for service in services:
            importance = None
            if service.storagerouter in primary_storagerouters:
                importance = 'primary'
            elif service.storagerouter in secondary_storagerouters:
                importance = 'secondary'

            loads = MDSServiceController.get_mds_load(service.mds_service)
            if service == master_service or service in slave_services:  # Service is still in use
                load = loads[0]
                if importance is not None:
                    all_info_dict[importance]['used'].append(service)
                else:
                    reconfigure_reasons.append(
                        'Service {0} cannot be used anymore because storagerouter with IP {1} is not part of the domains'
                        .format(service.name, service.storagerouter.ip))
            else:  # Service is not in use, but available
                load = loads[1]
            services_load[service] = load

            if importance is not None:
                all_info_dict[importance]['available'].append(service)
                if load <= max_load:
                    if load not in all_info_dict[importance]['loads']:
                        all_info_dict[importance]['loads'][load] = []
                    all_info_dict[importance]['loads'][load].append(service)

        service_nodes = []
        if master_service is not None:
            service_nodes.append(master_service.storagerouter.ip)
        for service in slave_services:
            ip = service.storagerouter.ip
            if ip in service_nodes:
                reconfigure_reasons.append(
                    'Multiple MDS services on the same node')
            else:
                service_nodes.append(ip)

        if len(service_nodes) > safety:
            reconfigure_reasons.append('Too much safety')
        if len(service_nodes) < safety and len(service_nodes) < len(nodes):
            reconfigure_reasons.append('Not enough safety')
        if master_service is not None and services_load[
                master_service] > max_load:
            reconfigure_reasons.append('Master overloaded')
        if master_service is not None and master_service.storagerouter_guid != vdisk.storagerouter_guid:
            reconfigure_reasons.append('Master is not local')
        if any(service for service in slave_services
               if services_load[service] > max_load):
            reconfigure_reasons.append('One or more slaves overloaded')

        # Check reconfigure required based upon domains
        recommended_primary = math.ceil(
            safety / 2.0) if len(secondary_storagerouters) > 0 else safety
        recommended_secondary = safety - recommended_primary
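        # Worked example (illustrative numbers): with mds_safety = 3 and at least one secondary StorageRouter,
        # recommended_primary = ceil(3 / 2.0) = 2 and recommended_secondary = 1.
        # Without secondary StorageRouters all 3 services are expected to live in the primary domain.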

        if master_service is not None and master_service not in all_info_dict[
                'primary']['used']:
            # Master service not present in primary domain
            reconfigure_reasons.append('Master service not in primary domain')

        primary_services_used = len(all_info_dict['primary']['used'])
        primary_services_available = len(all_info_dict['primary']['available'])
        if primary_services_used < recommended_primary and primary_services_used < primary_services_available:
            # More services can be used in primary domain
            reconfigure_reasons.append(
                'Not enough services in use in primary domain')
        if primary_services_used > recommended_primary:
            # Too many services in primary domain
            reconfigure_reasons.append(
                'Too many services in use in primary domain')

        # More services can be used in secondary domain
        secondary_services_used = len(all_info_dict['secondary']['used'])
        secondary_services_available = len(
            all_info_dict['secondary']['available'])
        if secondary_services_used < recommended_secondary and secondary_services_used < secondary_services_available:
            reconfigure_reasons.append(
                'Not enough services in use in secondary domain')
        if secondary_services_used > recommended_secondary:
            # Too many services in secondary domain
            reconfigure_reasons.append(
                'Too many services in use in secondary domain')

        # If secondary domain present, check order in which the slave services are configured
        secondary = False
        for slave_service in slave_services:
            if secondary is True and slave_service in all_info_dict['primary'][
                    'used']:
                reconfigure_reasons.append(
                    'A slave in secondary domain has priority over a slave in primary domain'
                )
                break
            if slave_service in all_info_dict['secondary']['used']:
                secondary = True

        if not reconfigure_reasons:
            MDSServiceController._logger.debug(
                'MDS safety: vDisk {0}: No reconfiguration required'.format(
                    vdisk.guid))
            MDSServiceController.sync_vdisk_to_reality(vdisk)
            return

        MDSServiceController._logger.debug(
            'MDS safety: vDisk {0}: Reconfiguration required. Reasons:'.format(
                vdisk.guid))
        for reason in reconfigure_reasons:
            MDSServiceController._logger.debug(
                'MDS safety: vDisk {0}:    * {1}'.format(vdisk.guid, reason))

        ############################
        # CREATE NEW CONFIGURATION #
        ############################

        # Check whether the master (if available) is non-local to the vdisk and/or is overloaded
        new_services = []
        master_ok = master_service is not None
        if master_ok is True:
            master_ok = master_service.storagerouter_guid == vdisk.storagerouter_guid and services_load[
                master_service] <= max_load

        previous_master = None
        if master_ok:
            # Add this master to the fresh configuration
            new_services.append(master_service)
        else:
            # Try to find the best non-overloaded LOCAL MDS slave to make master
            candidate_master_service = None
            candidate_master_load = 0
            local_mds = None
            local_mds_load = 0
            for service in all_info_dict['primary']['available']:
                load = services_load[service]
                if load <= max_load and service.storagerouter_guid == vdisk.storagerouter_guid:
                    if local_mds is None or local_mds_load > load:
                        # This service is a non-overloaded local MDS
                        local_mds = service
                        local_mds_load = load
                    if service in slave_services:
                        if candidate_master_service is None or candidate_master_load > load:
                            # This service is a non-overloaded local slave
                            candidate_master_service = service
                            candidate_master_load = load
            if candidate_master_service is not None:
                # A non-overloaded local slave was found.
                client = MetadataServerClient.load(candidate_master_service)
                try:
                    amount_of_tlogs = client.catch_up(str(vdisk.volume_id),
                                                      True)
                except RuntimeError as ex:
                    if 'Namespace does not exist' in ex.message:
                        client.create_namespace(str(vdisk.volume_id))
                        amount_of_tlogs = client.catch_up(
                            str(vdisk.volume_id), True)
                    else:
                        raise
                if amount_of_tlogs < tlogs:
                    # Almost there. Catching up right now, and continue as soon as it's up-to-date
                    start = time.time()
                    client.catch_up(str(vdisk.volume_id), False)
                    MDSServiceController._logger.debug(
                        'MDS safety: vDisk {0}: Catchup took {1}s'.format(
                            vdisk.guid, round(time.time() - start, 2)))
                    # It's up to date, so add it as a new master
                    new_services.append(candidate_master_service)
                    if master_service is not None:
                        # The current master (if available) is now candidate to become one of the slaves
                        slave_services.append(master_service)
                        previous_master = master_service
                else:
                    # It's not up to date, keep the previous master (if available) and give the local slave some more time to catch up
                    if master_service is not None:
                        new_services.append(master_service)
                    new_services.append(candidate_master_service)
                if candidate_master_service in slave_services:
                    slave_services.remove(candidate_master_service)
            else:
                # There's no non-overloaded local slave found. Keep the current master (if available) and add a local MDS (if available) as slave
                if master_service is not None:
                    new_services.append(master_service)
                if local_mds is not None:
                    new_services.append(local_mds)
                    if local_mds in slave_services:
                        slave_services.remove(local_mds)

        # At this point, there might (or might not) be a (new) master, and a (catching up) slave. The rest of the non-local
        # MDS nodes must now be added to the configuration until the safety is reached. There's always one extra
        # slave recycled to make sure there's always an (almost) up-to-date slave ready for failover
        nodes = set(service.storagerouter.ip for service in new_services)

        # Recycle slave for faster failover
        secondary_node_count = 0
        service_to_recycle = None
        if len(nodes) < safety:
            if recommended_primary > 1:  # If primary is 1, we only have master in primary
                # Try to recycle slave which is in primary domain
                for load in sorted(all_info_dict['primary']['loads']):
                    for service in all_info_dict['primary']['loads'][load]:
                        if service_to_recycle is None and service in slave_services and service.storagerouter.ip not in nodes:
                            try:
                                SSHClient(service.storagerouter)
                                service_to_recycle = service
                            except UnableToConnectException:
                                MDSServiceController._logger.debug(
                                    'MDS safety: vDisk {0}: Skipping storagerouter with IP {1} as it is unreachable'
                                    .format(vdisk.guid,
                                            service.storagerouter.ip))
            # Try to recycle slave which is in secondary domain if none found in primary
            if service_to_recycle is None and len(
                    secondary_storagerouters) > 0:
                for load in sorted(all_info_dict['secondary']['loads']):
                    for service in all_info_dict['secondary']['loads'][load]:
                        if service_to_recycle is None and service in slave_services and service.storagerouter.ip not in nodes:
                            try:
                                SSHClient(service.storagerouter)
                                service_to_recycle = service
                                secondary_node_count = 1  # We do not want to configure the secondary slave BEFORE the primary slaves
                            except UnableToConnectException:
                                MDSServiceController._logger.debug(
                                    'MDS safety: vDisk {0}: Skipping storagerouter with IP {1} as it is unreachable'
                                    .format(vdisk.guid,
                                            service.storagerouter.ip))
        if service_to_recycle is not None:
            slave_services.remove(service_to_recycle)
            if secondary_node_count == 0:  # Add the recycled service now because it's in the primary domain
                new_services.append(service_to_recycle)
                nodes.add(service_to_recycle.storagerouter.ip)

        # Add extra (new) slaves until primary safety reached
        nodes, new_services = _add_suitable_nodes(_importance='primary',
                                                  _safety=recommended_primary)

        # Add recycled secondary slave after primary slaves have been added
        if secondary_node_count == 1:
            new_services.append(service_to_recycle)
            nodes.add(service_to_recycle.storagerouter.ip)

        # Add extra (new) slaves until secondary safety reached
        if len(secondary_storagerouters) > 0:
            nodes, new_services = _add_suitable_nodes(_importance='secondary',
                                                      _safety=safety)
            # Add extra slaves from primary domain in case no suitable nodes found in secondary domain
            if len(nodes) < safety:
                nodes, new_services = _add_suitable_nodes(
                    _importance='primary', _safety=safety)

        # Build the new configuration and update the vdisk
        configs_no_ex_master = []
        configs_all = []
        for service in new_services:
            client = MetadataServerClient.load(service)
            client.create_namespace(str(vdisk.volume_id))
            # noinspection PyArgumentList
            config = MDSNodeConfig(address=str(service.storagerouter.ip),
                                   port=service.ports[0])
            if previous_master != service:
                configs_no_ex_master.append(config)
            configs_all.append(config)
        try:
            if len(configs_no_ex_master) != len(configs_all):
                vdisk.storagedriver_client.update_metadata_backend_config(
                    volume_id=str(vdisk.volume_id),
                    metadata_backend_config=MDSMetaDataBackendConfig(
                        configs_no_ex_master))
            vdisk.storagedriver_client.update_metadata_backend_config(
                volume_id=str(vdisk.volume_id),
                metadata_backend_config=MDSMetaDataBackendConfig(configs_all))
        except Exception:
            MDSServiceController._logger.exception(
                'MDS safety: vDisk {0}: Failed to update the metadata backend configuration'
                .format(vdisk.guid))
            raise Exception(
                'MDS configuration for volume {0} with guid {1} could not be changed'
                .format(vdisk.name, vdisk.guid))

        for service in new_services[1:]:
            client = MetadataServerClient.load(service)
            client.set_role(str(vdisk.volume_id),
                            MetadataServerClient.MDS_ROLE.SLAVE)

        MDSServiceController.sync_vdisk_to_reality(vdisk)
        MDSServiceController._logger.debug(
            'MDS safety: vDisk {0}: Completed'.format(vdisk.guid))

    @staticmethod
    def get_preferred_mds(storagerouter, vpool):
        """
        Gets the MDS on this StorageRouter/VPool pair which is preferred to achieve optimal balancing
        :param storagerouter: Storagerouter to retrieve the best MDS service for
        :type storagerouter: StorageRouter

        :param vpool: vPool to retrieve the best MDS service for
        :type vpool: VPool

        :return: Preferred MDS service (least loaded), current load on that MDS service
        :rtype: tuple
        """
        mds_service = (None, float('inf'))
        for current_mds_service in vpool.mds_services:
            if current_mds_service.service.storagerouter_guid == storagerouter.guid:
                load = MDSServiceController.get_mds_load(
                    current_mds_service)[0]
                if load < mds_service[1]:  # mds_service starts as (None, float('inf')), so the first candidate always wins
                    mds_service = (current_mds_service, load)
        return mds_service
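
    # Illustration (assumed values): get_preferred_mds() returns a (MDSService, load) tuple, e.g. a service
    # currently at 40% of its capacity yields (<that MDSService>, 40.0). If the vPool has no MDS service on
    # the given StorageRouter, (None, float('inf')) is returned.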

    @staticmethod
    def get_mds_load(mds_service):
        """
        Gets a 'load' for an MDS service based on its capacity and the number of assigned vDisks
        :param mds_service: MDS service to get the current load for
        :type mds_service: MDSService

        :return: Current load and the load it would have if one more vDisk were assigned (both as percentages)
        :rtype: tuple
        """
        service_capacity = float(mds_service.capacity)
        if service_capacity < 0:
            return 50, 50
        if service_capacity == 0:
            return float('inf'), float('inf')
        usage = len(mds_service.vdisks_guids)
        return round(usage / service_capacity * 100.0, 5), round(
            (usage + 1) / service_capacity * 100.0, 5)
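
    # Worked example (illustrative numbers): an MDS service with capacity 100 serving 60 vDisks yields
    # get_mds_load() == (60.0, 61.0), i.e. 60% load right now and 61% if one more vDisk were assigned.
    # Special cases handled above: capacity < 0 yields (50, 50) and capacity == 0 yields (inf, inf).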

    @staticmethod
    def get_mds_storagedriver_config_set(vpool, check_online=False):
        """
        Builds a configuration for all StorageRouters from a given VPool with the following goals:
        * The primary MDS is the local one
        * All slaves are on different hosts
        * At most `mds_safety` nodes are returned
        The configuration returned is the default configuration used by the volumedriver. In normal use-cases
        only the first entry is used, because at volume creation time the volumedriver only needs to create one master MDS.
        During ensure_safety, we actually create/set the MDS slaves for each volume

        :param vpool: vPool to get storagedriver configuration for
        :type vpool: VPool

        :param check_online: Check whether the storage routers are actually responsive
        :type check_online: bool

        :return: MDS configuration for a vPool
        :rtype: dict
        """
        mds_per_storagerouter = {}
        mds_per_load = {}
        for storagedriver in vpool.storagedrivers:
            storagerouter = storagedriver.storagerouter
            if check_online is True:
                try:
                    SSHClient(storagerouter)
                except UnableToConnectException:
                    continue
            mds_service, load = MDSServiceController.get_preferred_mds(
                storagerouter, vpool)
            if mds_service is None:
                raise RuntimeError('Could not find an MDS service')
            mds_per_storagerouter[storagerouter] = {
                'host': storagerouter.ip,
                'port': mds_service.service.ports[0]
            }
            if load not in mds_per_load:
                mds_per_load[load] = []
            mds_per_load[load].append(storagerouter)

        safety = Configuration.get('/ovs/framework/storagedriver|mds_safety')
        config_set = {}
        for storagerouter, ip_info in mds_per_storagerouter.iteritems():
            config_set[storagerouter.guid] = [ip_info]
            for importance in ['primary', 'secondary']:
                domains = [
                    junction.domain for junction in storagerouter.domains
                    if junction.backup is (importance == 'secondary')
                ]
                possible_storagerouters = set()
                for domain in domains:
                    possible_storagerouters.update(
                        StorageRouterList.
                        get_primary_storagerouters_for_domain(domain))

                for load in sorted(mds_per_load):
                    if len(config_set[storagerouter.guid]) >= safety:
                        break
                    other_storagerouters = mds_per_load[load]
                    random.shuffle(other_storagerouters)
                    for other_storagerouter in other_storagerouters:
                        if len(config_set[storagerouter.guid]) >= safety:
                            break
                        if other_storagerouter != storagerouter and other_storagerouter in possible_storagerouters:
                            config_set[storagerouter.guid].append(
                                mds_per_storagerouter[other_storagerouter])
        return config_set
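
    # Illustration (assumed guids/IPs/ports): the returned config_set maps every StorageRouter guid to an
    # ordered list of MDS endpoints with the local MDS first, e.g.
    #     {'sr1-guid': [{'host': '10.100.1.1', 'port': 26300},
    #                   {'host': '10.100.1.2', 'port': 26300}],
    #      'sr2-guid': [{'host': '10.100.1.2', 'port': 26300},
    #                   {'host': '10.100.1.1', 'port': 26300}]}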

    @staticmethod
    @celery.task(name='ovs.mds.mds_checkup',
                 schedule=Schedule(minute='30', hour='0,4,8,12,16,20'))
    @ensure_single(task_name='ovs.mds.mds_checkup', mode='CHAINED')
    def mds_checkup():
        """
        Validates the current MDS setup/configuration and takes actions where required
        """
        MDSServiceController._logger.info('MDS checkup - Started')
        mds_dict = {}
        for vpool in VPoolList.get_vpools():
            MDSServiceController._logger.info('MDS checkup - vPool {0}'.format(
                vpool.name))
            mds_dict[vpool] = {}
            for mds_service in vpool.mds_services:
                storagerouter = mds_service.service.storagerouter
                if storagerouter not in mds_dict[vpool]:
                    mds_dict[vpool][storagerouter] = {
                        'client': None,
                        'services': []
                    }
                    try:
                        mds_dict[vpool][storagerouter]['client'] = SSHClient(
                            storagerouter, username='******')
                        MDSServiceController._logger.info(
                            'MDS checkup - vPool {0} - Storage Router {1} - ONLINE'
                            .format(vpool.name, storagerouter.name))
                    except UnableToConnectException:
                        MDSServiceController._logger.info(
                            'MDS checkup - vPool {0} - Storage Router {1} - OFFLINE'
                            .format(vpool.name, storagerouter.name))
                mds_dict[vpool][storagerouter]['services'].append(mds_service)

        failures = []
        max_load = Configuration.get(
            '/ovs/framework/storagedriver|mds_maxload')
        for vpool, storagerouter_info in mds_dict.iteritems():
            # 1. First, make sure there's at least one MDS on every StorageRouter that's not overloaded
            # If not, create an extra MDS for that StorageRouter
            for storagerouter in storagerouter_info:
                client = mds_dict[vpool][storagerouter]['client']
                mds_services = mds_dict[vpool][storagerouter]['services']
                has_room = False
                for mds_service in mds_services[:]:
                    if mds_service.capacity == 0 and len(
                            mds_service.vdisks_guids) == 0:
                        MDSServiceController._logger.info(
                            'MDS checkup - Removing mds_service {0} for vPool {1}'
                            .format(mds_service.number, vpool.name))
                        MDSServiceController.remove_mds_service(
                            mds_service,
                            vpool,
                            reconfigure=True,
                            allow_offline=client is None)
                        mds_services.remove(mds_service)
                for mds_service in mds_services:
                    _, load = MDSServiceController.get_mds_load(mds_service)
                    if load < max_load:
                        has_room = True
                        break
                MDSServiceController._logger.info(
                    'MDS checkup - vPool {0} - Storage Router {1} - Capacity available: {2}'
                    .format(vpool.name, storagerouter.name, has_room))
                if has_room is False and client is not None:
                    mds_service = MDSServiceController.prepare_mds_service(
                        storagerouter=storagerouter,
                        vpool=vpool,
                        fresh_only=False,
                        reload_config=True)
                    if mds_service is None:
                        raise RuntimeError('Could not add MDS node')
                    mds_services.append(mds_service)
            mds_config_set = MDSServiceController.get_mds_storagedriver_config_set(
                vpool, True)
            for storagerouter in storagerouter_info:
                client = mds_dict[vpool][storagerouter]['client']
                if client is None:
                    MDSServiceController._logger.info(
                        'MDS checkup - vPool {0} - Storage Router {1} - Marked as offline, not setting default MDS configuration'
                        .format(vpool.name, storagerouter.name))
                    continue
                storagedriver = [
                    sd for sd in storagerouter.storagedrivers
                    if sd.vpool_guid == vpool.guid
                ][0]
                storagedriver_config = StorageDriverConfiguration(
                    'storagedriver', vpool.guid,
                    storagedriver.storagedriver_id)
                storagedriver_config.load()
                if storagedriver_config.is_new is False:
                    MDSServiceController._logger.info(
                        'MDS checkup - vPool {0} - Storage Router {1} - Storing default MDS configuration: {2}'
                        .format(vpool.name, storagerouter.name,
                                mds_config_set[storagerouter.guid]))
                    storagedriver_config.configure_filesystem(
                        fs_metadata_backend_mds_nodes=mds_config_set[
                            storagerouter.guid])
                    storagedriver_config.save(client)
            # 2. Per VPool, execute a safety check, making sure the master/slave configuration is optimal.
            MDSServiceController._logger.info(
                'MDS checkup - vPool {0} - Ensuring safety for all virtual disks'
                .format(vpool.name))
            for vdisk in vpool.vdisks:
                try:
                    MDSServiceController.ensure_safety(vdisk)
                except Exception:
                    message = 'Ensure safety for vDisk {0} with guid {1} failed'.format(
                        vdisk.name, vdisk.guid)
                    MDSServiceController._logger.exception(message)
                    failures.append(message)
        if len(failures) > 0:
            raise Exception('\n - ' + '\n - '.join(failures))
        MDSServiceController._logger.info('MDS checkup - Finished')

    @staticmethod
    def print_current_mds_layout():
        """
        Prints the current MDS layout
        """
        output = [
            '', 'Open vStorage - MDS debug information',
            '=====================================',
            'timestamp: {0}'.format(time.time()), ''
        ]
        for storagerouter in StorageRouterList.get_storagerouters():
            output.append('+ {0} ({1})'.format(storagerouter.name,
                                               storagerouter.ip))
            vpools = set(sd.vpool for sd in storagerouter.storagedrivers)
            for vpool in vpools:
                output.append('  + {0}'.format(vpool.name))
                for mds_service in vpool.mds_services:
                    if mds_service.service.storagerouter_guid == storagerouter.guid:
                        masters, slaves = 0, 0
                        for junction in mds_service.vdisks:
                            if junction.is_master:
                                masters += 1
                            else:
                                slaves += 1
                        capacity = mds_service.capacity
                        if capacity == -1:
                            capacity = 'infinite'
                        load, _ = MDSServiceController.get_mds_load(
                            mds_service)
                        if load == float('inf'):
                            load = 'infinite'
                        else:
                            load = '{0}%'.format(round(load, 2))
                        output.append(
                            '    + {0} - port {1} - {2} master(s), {3} slave(s) - capacity: {4}, load: {5}'
                            .format(mds_service.number,
                                    mds_service.service.ports[0], masters,
                                    slaves, capacity, load))
        print '\n'.join(output)
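
    # Illustrative output of print_current_mds_layout() (all names, ports and numbers are assumptions):
    #
    #     Open vStorage - MDS debug information
    #     =====================================
    #     timestamp: 1510000000.0
    #
    #     + storagerouter-1 (10.100.1.1)
    #       + vpool01
    #         + 1 - port 26300 - 12 master(s), 3 slave(s) - capacity: 100, load: 15.0%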
Example No. 4
class MonitoringController(object):
    """
    A controller that can execute various quality/monitoring checks
    """
    _logger = LogHandler.get('lib', name='ovs-monitoring')

    @staticmethod
    def test_ssh_connectivity():
        """
        Validates whether all nodes can SSH into each other
        """
        MonitoringController._logger.info('Starting SSH connectivity test...')
        ips = [sr.ip for sr in StorageRouterList.get_storagerouters()]
        for ip in ips:
            for primary_username in ['root', 'ovs']:
                try:
                    with remote(ip, [SSHClient],
                                username=primary_username) as rem:
                        for local_ip in ips:
                            for username in ['root', 'ovs']:
                                message = '* Connection from {0}@{1} to {2}@{3}... {{0}}'.format(
                                    primary_username, ip, username, local_ip)
                                try:
                                    c = rem.SSHClient(local_ip, username=username)
                                    assert c.run(['whoami']).strip() == username
                                    message = message.format('OK')
                                    logger = MonitoringController._logger.info
                                except Exception as ex:
                                    message = message.format(ex.message)
                                    logger = MonitoringController._logger.error
                                logger(message)
                except Exception as ex:
                    MonitoringController._logger.error(
                        '* Could not connect to {0}@{1}: {2}'.format(
                            primary_username, ip, ex.message))
        MonitoringController._logger.info('Finished')

    @staticmethod
    @ovs_task(name='ovs.monitoring.verify_vdisk_cache_quota',
              schedule=Schedule(minute='15', hour='*'),
              ensure_single_info={'mode': 'DEFAULT'})
    def verify_vdisk_cache_quota():
        """
        Validates whether the caching quota is reaching its limit or has already surpassed it
        Each vDisk can consume a part of the total fragment caching capacity
        """
        MonitoringController._logger.info(
            'Starting vDisk caching quota verification...')
        alba_guid_size_map = {}
        for storagedriver in StorageDriverList.get_storagedrivers():
            storagedriver.invalidate_dynamics(
                ['vpool_backend_info', 'vdisks_guids'])

            for cache_type in [['cache_quota_fc', 'backend_info', 'connection_info', 'fragment'],
                               ['cache_quota_bc', 'block_cache_backend_info', 'block_cache_connection_info', 'block']]:
                cache_quota = storagedriver.vpool_backend_info[cache_type[0]]
                backend_info = storagedriver.vpool_backend_info[cache_type[1]]
                connection_info = storagedriver.vpool_backend_info[
                    cache_type[2]]
                if backend_info is None or connection_info is None:
                    continue

                alba_backend_name = backend_info['name']
                alba_backend_host = connection_info['host']
                alba_backend_guid = backend_info['alba_backend_guid']
                if alba_backend_guid not in alba_guid_size_map:
                    ovs_client = OVSClient(
                        ip=alba_backend_host,
                        port=connection_info['port'],
                        credentials=(connection_info['client_id'],
                                     connection_info['client_secret']),
                        version=2,
                        cache_store=VolatileFactory.get_client())
                    try:
                        alba_guid_size_map[alba_backend_guid] = {
                            'name': alba_backend_name,
                            'backend_ip': alba_backend_host,
                            'total_size': ovs_client.get('/alba/backends/{0}/'.format(alba_backend_guid),
                                                         params={'contents': 'usages'})['usages']['size'],
                            'requested_size': 0
                        }
                    except Exception:
                        MonitoringController._logger.exception(
                            'Failed to retrieve ALBA Backend info for {0} on host {1}'
                            .format(alba_backend_name, alba_backend_host))
                        continue

                for vdisk_guid in storagedriver.vdisks_guids:
                    vdisk = VDisk(vdisk_guid)
                    vdisk_cq = vdisk.cache_quota.get(
                        cache_type[3]
                    ) if vdisk.cache_quota is not None else None
                    if vdisk_cq is None:
                        alba_guid_size_map[alba_backend_guid][
                            'requested_size'] += cache_quota if cache_quota is not None else 0
                    else:
                        alba_guid_size_map[alba_backend_guid][
                            'requested_size'] += vdisk_cq

        local_ips = [sr.ip for sr in StorageRouterList.get_storagerouters()]
        for alba_backend_info in alba_guid_size_map.itervalues():
            name = alba_backend_info['name']
            backend_ip = alba_backend_info['backend_ip']

            location = 'local'
            remote_msg = ''
            if backend_ip not in local_ips:
                location = 'remote'
                remote_msg = ' (on remote IP {0})'.format(backend_ip)

            # Use float division so the percentage is not truncated to 0 under Python 2
            percentage = alba_backend_info['requested_size'] / float(alba_backend_info['total_size']) * 100
            if percentage > 100:
                MonitoringController._logger.error(
                    'OVS_WARNING: Over-allocation for vDisk caching quota on {0} ALBA Backend {1}{2}. Unexpected behavior might occur'
                    .format(location, name, remote_msg))
            elif percentage > 70:
                MonitoringController._logger.warning(
                    'OVS_WARNING: vDisk caching quota on {0} ALBA Backend {1} is at {2:.1f}%{3}'
                    .format(location, name, percentage, remote_msg))
        MonitoringController._logger.info(
            'Finished vDisk cache quota verification')
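
    # Worked example (illustrative sizes): with a 1024 GiB fragment cache on an ALBA Backend and vDisk cache
    # quotas adding up to 800 GiB, the requested percentage is 800 / 1024.0 * 100 ~= 78.1%, which logs an
    # OVS_WARNING (threshold 70%). Above 100% an over-allocation error is logged instead.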
Example No. 5
class MDSServiceController(MDSShared):
    """
    Contains all BLL related to MDSServices
    """
    _logger = Logger('lib')
    _log_level = LOG_LEVEL_MAPPING[_logger.getEffectiveLevel()]

    # noinspection PyCallByClass,PyTypeChecker
    storagerouterclient.Logger.setupLogging(
        Logger.load_path('storagerouterclient'), _log_level)
    # noinspection PyArgumentList
    storagerouterclient.Logger.enableLogging()

    @staticmethod
    def remove_mds_service(mds_service, reconfigure, allow_offline=False):
        """
        Removes an MDS service
        :param mds_service: The MDS service to remove
        :type mds_service: ovs.dal.hybrids.j_mdsservice.MDSService
        :param reconfigure: Indicates whether reconfiguration is required
        :type reconfigure: bool
        :param allow_offline: Indicates whether it's OK that the node for which mds services are cleaned is offline
        :type allow_offline: bool
        :raises RuntimeError: When vDisks are still present on the MDSService to be removed
                              or when no StorageDriver is linked to the MDSService to be removed
        :raises UnableToConnectException: When StorageRouter on which the MDSService resides is unreachable and allow_offline flag is False
        :return: None
        :rtype: NoneType
        """
        if len(mds_service.vdisks_guids) > 0 and allow_offline is False:
            raise RuntimeError(
                'Cannot remove MDSService that is still serving disks')

        if len(mds_service.storagedriver_partitions) == 0 or mds_service.storagedriver_partitions[0].storagedriver is None:
            raise RuntimeError(
                'Failed to retrieve the linked StorageDriver to this MDS Service {0}'
                .format(mds_service.service.name))

        vpool = mds_service.vpool
        root_client = None
        storagerouter = mds_service.service.storagerouter
        storagedriver = mds_service.storagedriver_partitions[0].storagedriver
        MDSServiceController._logger.info(
            'StorageRouter {0} - vPool {1}: Removing MDS junction service for port {2}'
            .format(storagerouter.name, vpool.name,
                    mds_service.service.ports[0]))
        try:
            root_client = SSHClient(endpoint=storagerouter, username='******')
            MDSServiceController._logger.debug(
                'StorageRouter {0} - vPool {1}: Established SSH connection'.
                format(storagerouter.name, vpool.name))
        except UnableToConnectException:
            if allow_offline is True:
                MDSServiceController._logger.warning(
                    'StorageRouter {0} - vPool {1}: Allowed offline node during MDS service removal'
                    .format(storagerouter.name, vpool.name))
            else:
                MDSServiceController._logger.exception(
                    'StorageRouter {0} - vPool {1}: Failed to connect to StorageRouter'
                    .format(storagerouter.name, vpool.name))
                raise

        # Reconfigure StorageDriver
        if reconfigure is True and root_client is not None:
            mds_nodes = []
            for sd_partition in storagedriver.partitions:
                if sd_partition.role == DiskPartition.ROLES.DB and sd_partition.sub_role == StorageDriverPartition.SUBROLE.MDS and sd_partition.mds_service != mds_service:
                    service = sd_partition.mds_service.service
                    mds_nodes.append({'host': service.storagerouter.ip,
                                      'port': service.ports[0],
                                      'db_directory': '{0}/db'.format(sd_partition.path),
                                      'scratch_directory': '{0}/scratch'.format(sd_partition.path)})

            # Generate the correct section in the StorageDriver's configuration
            MDSServiceController._logger.info(
                'StorageRouter {0} - vPool {1}: Configuring StorageDriver with MDS nodes: {2}'
                .format(storagerouter.name, vpool.name, mds_nodes))
            storagedriver_config = StorageDriverConfiguration(
                vpool.guid, storagedriver.storagedriver_id)
            storagedriver_config.configure_metadata_server(mds_nodes=mds_nodes)
            storagedriver_config.save(root_client)

        # Clean up model
        MDSServiceController._logger.info(
            'StorageRouter {0} - vPool {1}: Cleaning model'.format(
                storagerouter.name, vpool.name))
        directories_to_clean = []
        for sd_partition in mds_service.storagedriver_partitions:
            directories_to_clean.append(sd_partition.path)
            sd_partition.delete()

        if allow_offline is True:  # Certain vDisks might still be attached to this offline MDS service --> Delete relations
            for junction in mds_service.vdisks:
                junction.delete()

        mds_service.delete()
        mds_service.service.delete()

        # Clean up file system
        if root_client is not None:
            MDSServiceController._logger.info(
                'StorageRouter {0} - vPool {1}: Deleting directories from file system: {2}'
                .format(storagerouter.name, vpool.name, directories_to_clean))
            tries = 5
            while tries > 0:
                try:
                    root_client.dir_delete(directories=directories_to_clean,
                                           follow_symlinks=True)
                    for dir_name in directories_to_clean:
                        MDSServiceController._logger.debug(
                            'StorageRouter {0} - vPool {1}: Recursively removed directory: {2}'
                            .format(storagerouter.name, vpool.name, dir_name))
                    break
                except Exception:
                    MDSServiceController._logger.warning(
                        'StorageRouter {0} - vPool {1}: Waiting for the MDS service to go down...'
                        .format(storagerouter.name, vpool.name))
                    time.sleep(5)
                    tries -= 1
                    if tries == 0:
                        MDSServiceController._logger.exception(
                            'StorageRouter {0} - vPool {1}: Deleting directories failed'
                            .format(storagerouter.name, vpool.name))
                        raise

    @staticmethod
    @ovs_task(name='ovs.mds.mds_checkup',
              schedule=Schedule(minute='30', hour='0,4,8,12,16,20'),
              ensure_single_info={'mode': 'CHAINED'})
    def mds_checkup():
        """
        Validates the current MDS setup/configuration and takes actions where required
        Actions:
            * Verify which StorageRouters are available
            * Build a mapping between each vPool and its StorageRouters
            * For each vPool make sure every StorageRouter has at least 1 MDS service with capacity available
            * For each vPool retrieve the optimal configuration and store it for each StorageDriver
            * For each vPool, run ensure_safety for all of its vDisks
        :raises RuntimeError: When ensure safety fails for any vDisk
        :return: None
        :rtype: NoneType
        """
        MDSServiceController._logger.info('Started')

        # Verify StorageRouter availability
        root_client_cache = {}
        storagerouters = StorageRouterList.get_storagerouters()
        storagerouters.sort(key=lambda _sr: ExtensionsToolbox.advanced_sort(
            element=_sr.ip, separator='.'))
        offline_nodes = []
        for storagerouter in storagerouters:
            try:
                root_client = SSHClient(endpoint=storagerouter,
                                        username='******')
                MDSServiceController._logger.debug(
                    'StorageRouter {0} - ONLINE'.format(storagerouter.name))
            except UnableToConnectException:
                root_client = None
                offline_nodes.append(storagerouter)
                MDSServiceController._logger.error(
                    'StorageRouter {0} - OFFLINE'.format(storagerouter.name))
            root_client_cache[storagerouter] = root_client

        # Create mapping per vPool and its StorageRouters
        mds_dict = collections.OrderedDict()
        for vpool in sorted(VPoolList.get_vpools(), key=lambda k: k.name):
            MDSServiceController._logger.info('vPool {0}'.format(vpool.name))
            mds_dict[vpool] = {}

            # Loop all StorageDrivers and add StorageDriver to mapping
            for storagedriver in vpool.storagedrivers:
                storagerouter = storagedriver.storagerouter
                if storagerouter not in mds_dict[vpool]:
                    mds_dict[vpool][storagerouter] = {
                        'client': root_client_cache.get(storagerouter),
                        'services': [],
                        'storagedriver': storagedriver
                    }

            # Loop all MDS Services and append services to appropriate vPool / StorageRouter combo
            mds_services = vpool.mds_services
            mds_services.sort(
                key=lambda _mds_service: ExtensionsToolbox.advanced_sort(
                    element=_mds_service.service.storagerouter.ip,
                    separator='.'))
            for mds_service in mds_services:
                service = mds_service.service
                storagerouter = service.storagerouter
                if storagerouter not in mds_dict[vpool]:
                    mds_dict[vpool][storagerouter] = {
                        'client': root_client_cache.get(storagerouter),
                        'services': [],
                        'storagedriver': None
                    }
                MDSServiceController._logger.debug(
                    'vPool {0} - StorageRouter {1} - Service on port {2}'.
                    format(vpool.name, storagerouter.name, service.ports[0]))
                mds_dict[vpool][storagerouter]['services'].append(mds_service)

        failures = []
        for vpool, storagerouter_info in mds_dict.iteritems():
            # Make sure there's at least 1 MDS on every StorageRouter that's not overloaded
            # Remove all MDS Services which have been manually marked for removal (by setting its capacity to 0)
            max_load = Configuration.get(
                '/ovs/vpools/{0}/mds_config|mds_maxload'.format(vpool.guid))
            for storagerouter in sorted(storagerouter_info,
                                        key=lambda k: k.ip):
                total_load = 0.0
                root_client = mds_dict[vpool][storagerouter]['client']
                mds_services = mds_dict[vpool][storagerouter]['services']

                for mds_service in sorted(mds_services, key=lambda k: k.number):
                    port = mds_service.service.ports[0]
                    number = mds_service.number
                    # Manual intervention required here in order for the MDS to be cleaned up
                    # @TODO: Remove this and make a dynamic calculation to check which MDSes to remove
                    if mds_service.capacity == 0 and len(mds_service.vdisks_guids) == 0:
                        MDSServiceController._logger.warning(
                            'vPool {0} - StorageRouter {1} - MDS Service {2} on port {3}: Removing'
                            .format(vpool.name, storagerouter.name, number,
                                    port))
                        try:
                            MDSServiceController.remove_mds_service(
                                mds_service=mds_service,
                                reconfigure=True,
                                allow_offline=root_client is None)
                        except Exception:
                            MDSServiceController._logger.exception(
                                'vPool {0} - StorageRouter {1} - MDS Service {2} on port {3}: Failed to remove'
                                .format(vpool.name, storagerouter.name, number,
                                        port))
                        mds_services.remove(mds_service)
                    else:
                        _, next_load = MDSServiceController.get_mds_load(
                            mds_service=mds_service)
                        if next_load == float('inf'):
                            total_load = sys.maxint * -1  # Cast to lowest possible value if any MDS service capacity is set to infinity
                        else:
                            total_load += next_load

                        if next_load < max_load:
                            MDSServiceController._logger.debug(
                                'vPool {0} - StorageRouter {1} - MDS Service {2} on port {3}: Capacity available - Load at {4}%'
                                .format(vpool.name, storagerouter.name, number,
                                        port, next_load))
                        else:
                            MDSServiceController._logger.debug(
                                'vPool {0} - StorageRouter {1} - MDS Service {2} on port {3}: No capacity available - Load at {4}%'
                                .format(vpool.name, storagerouter.name, number,
                                        port, next_load))

                if total_load >= max_load * len(mds_services):
                    mds_services_to_add = int(
                        math.ceil((total_load - max_load * len(mds_services)) /
                                  max_load))
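                    # Illustrative arithmetic (hypothetical numbers): 3 services with max_load 75% and total_load 260%
                    # -> mds_services_to_add = ceil((260 - 75 * 3) / 75) = ceil(35 / 75) = 1 additional MDS service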
                    MDSServiceController._logger.info(
                        'vPool {0} - StorageRouter {1} - Average load per service {2:.2f}% - Max load per service {3:.2f}% - {4} MDS service{5} will be added'
                        .format(vpool.name, storagerouter.name,
                                total_load / len(mds_services), max_load,
                                mds_services_to_add,
                                '' if mds_services_to_add == 1 else 's'))

                    for _ in range(mds_services_to_add):
                        MDSServiceController._logger.info(
                            'vPool {0} - StorageRouter {1} - Adding new MDS Service'
                            .format(vpool.name, storagerouter.name))
                        try:
                            mds_services.append(
                                MDSServiceController.prepare_mds_service(
                                    storagerouter=storagerouter, vpool=vpool))
                        except Exception:
                            MDSServiceController._logger.exception(
                                'vPool {0} - StorageRouter {1} - Failed to create new MDS Service'
                                .format(vpool.name, storagerouter.name))

            # After potentially having added new MDSes, retrieve the optimal configuration
            mds_config_set = {}
            try:
                mds_config_set = MDSServiceController.get_mds_storagedriver_config_set(
                    vpool=vpool, offline_nodes=offline_nodes)
                MDSServiceController._logger.debug(
                    'vPool {0} - Optimal configuration {1}'.format(
                        vpool.name, mds_config_set))
            except (NotFoundException, RuntimeError):
                MDSServiceController._logger.exception(
                    'vPool {0} - Failed to retrieve the optimal configuration'.
                    format(vpool.name))

            # Apply the optimal MDS configuration per StorageDriver
            for storagerouter in sorted(storagerouter_info,
                                        key=lambda k: k.ip):
                root_client = mds_dict[vpool][storagerouter]['client']
                storagedriver = mds_dict[vpool][storagerouter]['storagedriver']

                if storagedriver is None:
                    MDSServiceController._logger.critical(
                        'vPool {0} - StorageRouter {1} - No matching StorageDriver found'
                        .format(vpool.name, storagerouter.name))
                    continue
                if storagerouter.guid not in mds_config_set:
                    MDSServiceController._logger.critical(
                        'vPool {0} - StorageRouter {1} - Not marked as offline, but could not retrieve an optimal MDS config'
                        .format(vpool.name, storagerouter.name))
                    continue
                if root_client is None:
                    MDSServiceController._logger.debug(
                        'vPool {0} - StorageRouter {1} - Marked as offline, not setting optimal MDS configuration'
                        .format(vpool.name, storagerouter.name))
                    continue

                storagedriver_config = StorageDriverConfiguration(
                    vpool_guid=vpool.guid,
                    storagedriver_id=storagedriver.storagedriver_id)
                if storagedriver_config.config_missing is False:
                    optimal_mds_config = mds_config_set[storagerouter.guid]
                    MDSServiceController._logger.debug(
                        'vPool {0} - StorageRouter {1} - Storing optimal MDS configuration: {2}'
                        .format(vpool.name, storagerouter.name,
                                optimal_mds_config))
                    # The filesystem section in the StorageDriver configuration contains all parameters used for vDisks created directly on the filesystem
                    # So when a vDisk gets created on the filesystem, these MDSes will be assigned to it
                    storagedriver_config.configure_filesystem(
                        fs_metadata_backend_mds_nodes=optimal_mds_config)
                    storagedriver_config.save(root_client)

            # Execute a safety check, making sure the master/slave configuration is optimal.
            MDSServiceController._logger.info(
                'vPool {0} - Ensuring safety for all vDisks'.format(
                    vpool.name))
            for vdisk in vpool.vdisks:
                try:
                    MDSServiceController.ensure_safety(vdisk_guid=vdisk.guid)
                except Exception:
                    message = 'Ensure safety for vDisk {0} with guid {1} failed'.format(
                        vdisk.name, vdisk.guid)
                    MDSServiceController._logger.exception(message)
                    failures.append(message)
        if len(failures) > 0:
            raise RuntimeError('\n - ' + '\n - '.join(failures))
        MDSServiceController._logger.info('Finished')

    # noinspection PyUnresolvedReferences
    @staticmethod
    @ovs_task(name='ovs.mds.ensure_safety',
              ensure_single_info={'mode': 'CHAINED'})
    def ensure_safety(vdisk_guid, excluded_storagerouter_guids=None):
        """
        Ensures (or tries to ensure) the safety of a given vDisk.
        Assumptions:
            * A local overloaded master is better than a non-local non-overloaded master
            * Prefer master/slaves to be on different hosts, a subsequent slave on the same node doesn't add safety
            * Don't actively overload services (e.g. configure an MDS as slave causing it to get overloaded)
            * Too much safety is not wanted (it adds load to nodes while not being required)
            * Order of slaves is:
                * All slaves on StorageRouters in primary Domain of vDisk host
                * All slaves on StorageRouters in secondary Domain of vDisk host
                * Eg: Safety of 2 (1 master + 1 slave)
                    mds config = [local master in primary, slave in secondary]
                * Eg: Safety of 3 (1 master + 2 slaves)
                    mds config = [local master in primary, slave in primary, slave in secondary]
                * Eg: Safety of 4 (1 master + 3 slaves)
                    mds config = [local master in primary, slave in primary, slave in secondary, slave in secondary]
        :param vdisk_guid: vDisk GUID to calculate a new safety for
        :type vdisk_guid: str
        :param excluded_storagerouter_guids: GUIDs of StorageRouters to leave out of calculation (Eg: When 1 is down or unavailable)
        :type excluded_storagerouter_guids: list[str]
        :raises RuntimeError: If host of vDisk is part of the excluded StorageRouters
                              If host of vDisk is not part of the StorageRouters in the primary domain
                              If catchup command fails for a slave
                              If MDS client cannot be created for any of the current or new MDS services
                              If updateMetadataBackendConfig would fail for whatever reason
        :raises SRCObjectNotFoundException: If vDisk does not have a StorageRouter GUID
        :return: None
        :rtype: NoneType
        """
        if excluded_storagerouter_guids is None:
            excluded_storagerouter_guids = []

        safety_ensurer = SafetyEnsurer(vdisk_guid,
                                       excluded_storagerouter_guids)
        safety_ensurer.ensure_safety()
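        # Hypothetical invocation, e.g. when a StorageRouter is known to be unreachable:
        #   MDSServiceController.ensure_safety(vdisk_guid=vdisk.guid, excluded_storagerouter_guids=[offline_sr.guid])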

    @staticmethod
    def get_preferred_mds(storagerouter, vpool):
        """
        Gets the MDS on this StorageRouter/vPool pair which is preferred to achieve optimal balancing
        :param storagerouter: StorageRouter to retrieve the best MDS service for
        :type storagerouter: ovs.dal.hybrids.storagerouter.StorageRouter
        :param vpool: vPool to retrieve the best MDS service for
        :type vpool: ovs.dal.hybrids.vpool.VPool
        :return: Preferred MDS service (least loaded), current load on that MDS service
        :rtype: tuple(ovs.dal.hybrids.j_mdsservice.MDSService, float)
        """
        mds_info = (None, float('inf'))
        for mds_service in vpool.mds_services:
            if mds_service.service.storagerouter_guid == storagerouter.guid:
                load = MDSServiceController.get_mds_load(
                    mds_service=mds_service)[0]
                if mds_info[0] is None or load < mds_info[1]:
                    mds_info = (mds_service, load)
        return mds_info

    @staticmethod
    def get_mds_storagedriver_config_set(vpool, offline_nodes=None):
        """
        Builds a configuration for all StorageRouters from a given vPool with following goals:
            * Primary MDS is the local one
            * All slaves are on different hosts
            * Maximum `mds_safety` nodes are returned
        The returned configuration is the default configuration used by the volumedriver. In normal use-cases only the
        1st entry is used, because at volume creation time the volumedriver only needs to create 1 master MDS.
        During ensure_safety, we actually create/set the MDS slaves for each volume

        :param vpool: vPool to get StorageDriver configuration for
        :type vpool: ovs.dal.hybrids.vpool.VPool
        :param offline_nodes: Nodes which are currently unreachable via the SSHClient functionality
        :type offline_nodes: list
        :raises RuntimeError: When no MDS Service can be found for a specific vPool/StorageRouter combo
        :raises NotFoundException: When configuration management is unavailable
        :return: MDS configuration for a vPool
        :rtype: dict[list]
        """
        if offline_nodes is None:
            offline_nodes = []
        mds_per_storagerouter = {}
        mds_per_load = {}
        for storagedriver in vpool.storagedrivers:
            storagerouter = storagedriver.storagerouter
            if storagerouter in offline_nodes:
                continue
            mds_service, load = MDSServiceController.get_preferred_mds(
                storagerouter, vpool)
            if mds_service is None:
                raise RuntimeError('Could not find an MDS service')
            mds_per_storagerouter[storagerouter] = {
                'host': storagerouter.ip,
                'port': mds_service.service.ports[0]
            }
            if load not in mds_per_load:
                mds_per_load[load] = []
            mds_per_load[load].append(storagerouter)

        safety = Configuration.get(
            '/ovs/vpools/{0}/mds_config|mds_safety'.format(vpool.guid))
        config_set = {}
        for storagerouter, ip_info in mds_per_storagerouter.iteritems():
            config_set[storagerouter.guid] = [ip_info]
            for importance in ['primary', 'secondary']:
                domains = [
                    junction.domain for junction in storagerouter.domains
                    if junction.backup is (importance == 'secondary')
                ]
                possible_storagerouters = set()
                for domain in domains:
                    possible_storagerouters.update(
                        StorageRouterList.
                        get_primary_storagerouters_for_domain(domain))

                for load in sorted(mds_per_load):
                    if len(config_set[storagerouter.guid]) >= safety:
                        break
                    other_storagerouters = mds_per_load[load]
                    random.shuffle(other_storagerouters)
                    for other_storagerouter in other_storagerouters:
                        if len(config_set[storagerouter.guid]) >= safety:
                            break
                        if other_storagerouter != storagerouter and other_storagerouter in possible_storagerouters:
                            config_set[storagerouter.guid].append(
                                mds_per_storagerouter[other_storagerouter])
        return config_set

    @staticmethod
    def monitor_mds_layout():
        """
        Prints the current MDS layout
        :return: None
        :rtype: NoneType
        """
        try:
            while True:
                output = [
                    '', 'Open vStorage - MDS debug information',
                    '=====================================',
                    'timestamp: {0}'.format(datetime.datetime.now()), ''
                ]
                vpools_deployed = False
                for storagerouter in sorted(
                        StorageRouterList.get_storagerouters(),
                        key=lambda k: k.name):
                    vpools = set(sd.vpool
                                 for sd in storagerouter.storagedrivers)
                    if len(vpools) > 0:
                        vpools_deployed = True
                        output.append('+ {0} ({1})'.format(
                            storagerouter.name, storagerouter.ip))
                    for vpool in sorted(vpools, key=lambda k: k.name):
                        output.append('  + {0}'.format(vpool.name))
                        for mds_service in sorted(vpool.mds_services,
                                                  key=lambda k: k.number):
                            if mds_service.service.storagerouter_guid == storagerouter.guid:
                                masters, slaves = 0, 0
                                for junction in mds_service.vdisks:
                                    if junction.is_master:
                                        masters += 1
                                    else:
                                        slaves += 1
                                capacity = mds_service.capacity
                                if capacity == -1:
                                    capacity = 'infinite'
                                load, _ = MDSServiceController.get_mds_load(
                                    mds_service)
                                if load == float('inf'):
                                    load = 'infinite'
                                else:
                                    load = '{0}%'.format(round(load, 2))
                                output.append(
                                    '    + {0} - port {1} - {2} master(s), {3} slave(s) - capacity: {4}, load: {5}'
                                    .format(mds_service.number,
                                            mds_service.service.ports[0],
                                            masters, slaves, capacity, load))
                if vpools_deployed is False:
                    output.append('No vPools deployed')
                print '\x1b[2J\x1b[H' + '\n'.join(output)
                time.sleep(1)
        except KeyboardInterrupt:
            pass

    @staticmethod
    @ovs_task(name='ovs.mds.mds_catchup',
              schedule=Schedule(minute='30', hour='*/2'),
              ensure_single_info={'mode': 'DEFAULT'})
    def mds_catchup():
        """
        Catches up all MDS slaves which are lagging too far behind
        Only one catch-up per StorageDriver is invoked at a time (a standalone sketch of this fan-out pattern follows after this class)
        """

        # Only for caching purposes
        def storagedriver_worker(queue, error_list):
            # type: (Queue.Queue, List[str]) -> None
            while not queue.empty():
                mds_catch_up = queue.get()  # type: MDSCatchUp
                try:
                    mds_catch_up.catch_up(async=False)
                except Exception as ex:
                    MDSServiceController._logger.exception(
                        'Exception while catching up for vDisk {0}'.format(
                            mds_catch_up.vdisk.guid))
                    error_list.append(str(ex))
                finally:
                    queue.task_done()

        storagedriver_queues = {}
        for vdisk in VDiskList.get_vdisks():
            if vdisk.storagedriver_id not in storagedriver_queues:
                storagedriver_queues[vdisk.storagedriver_id] = Queue.Queue()
            # Putting it in the Queue ensures that the reference is still there so the caching is used optimally
            catch_up = MDSCatchUp(vdisk.guid)
            storagedriver_queues[vdisk.storagedriver_id].put(catch_up)

        errors = []
        threads = []
        for storagedriver_id, storagedriver_queue in storagedriver_queues.iteritems():
            thread = Thread(target=storagedriver_worker,
                            args=(
                                storagedriver_queue,
                                errors,
                            ))
            thread.start()
            threads.append(thread)
        for thread in threads:
            thread.join()

        if len(errors) > 0:
            raise RuntimeError(
                'Exception occurred while catching up: \n - {0}'.format(
                    '\n - '.join(errors)))
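# A minimal, self-contained sketch (not part of the OVS code above) of the fan-out pattern
# mds_catchup uses: one work queue per StorageDriver and one worker thread per queue, so
# catch-ups for different StorageDrivers run in parallel while items for the same
# StorageDriver are processed sequentially. All names and work items below are hypothetical.
try:
    import Queue as queue_module  # Python 2
except ImportError:
    import queue as queue_module  # Python 3
from threading import Thread


def drain_queue(work_queue, error_list):
    """Process every item on the queue; collect errors instead of raising."""
    while not work_queue.empty():
        work_item = work_queue.get()  # each item is a callable performing one catch-up
        try:
            work_item()
        except Exception as ex:
            error_list.append(str(ex))
        finally:
            work_queue.task_done()


def run_per_owner(work_items):
    """work_items: dict mapping an owner id (e.g. a storagedriver_id) to a list of callables."""
    queues = {}
    for owner_id, items in work_items.items():
        queues[owner_id] = queue_module.Queue()
        for item in items:
            queues[owner_id].put(item)
    errors = []
    threads = [Thread(target=drain_queue, args=(work_queue, errors)) for work_queue in queues.values()]
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()
    if errors:
        raise RuntimeError('Exception occurred while catching up:\n - {0}'.format('\n - '.join(errors)))


if __name__ == '__main__':
    run_per_owner({'storagedriver_1': [lambda: None], 'storagedriver_2': [lambda: None]})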
Example No. 6
class StatsMonkeyController(StatsMonkey):
    """
    Stats Monkey class which retrieves statistics for the cluster
    Methods:
        * run_all
        * get_stats_mds
        * get_stats_vpools
        * get_stats_storagerouters
    """
    _logger = Logger(name='lib')
    _dynamic_dependencies = {'get_stats_vpools': {VPool: ['statistics']},  # The statistics being retrieved depend on the caching timeouts of these properties
                             'get_stats_storagerouters': {StorageRouter: ['statistics']}}

    def __init__(self):
        """
        Init method. This class is a completely static class, so cannot be instantiated
        """
        raise RuntimeError('StatsMonkeyController is a static class')

    @staticmethod
    @ovs_task(name='ovs.stats_monkey.run_all', schedule=Schedule(minute='*', hour='*'), ensure_single_info={"mode": "DEFAULT"})
    def run_all():
        """
        Run all the get stats methods from StatsMonkeyController
        Prerequisites when adding content:
            * New methods which need to be picked up by this method need to start with 'get_stats_'
            * New methods need to collect the information and return a bool (whether errors occurred) and a list of stats. The 'run_all_get_stat_methods' method will then send the stats to the configured instance (InfluxDB / Redis); a standalone sketch of this dispatch convention follows after this class
            * The frequency each method needs to be executed can be configured via the configuration management by setting the function name as key and the interval in seconds as value
            *    Eg: {'get_stats_mds': 20}  --> Every 20 seconds, the MDS statistics will be checked upon
        """
        StatsMonkeyController.run_all_get_stat_methods()

    @classmethod
    def get_stats_mds(cls):
        """
        Retrieve how many vDisks each MDS service is serving, whether as master or slave
        """
        if cls._config is None:
            cls.validate_and_retrieve_config()

        stats = []
        environment = cls._config['environment']
        service_type = ServiceTypeList.get_by_name('MetadataServer')
        if service_type is None:
            raise RuntimeError('MetadataServer service not found in the model')

        for service in service_type.services:
            slaves = 0
            masters = 0
            mds_service = service.mds_service
            for junction in mds_service.vdisks:
                if junction.is_master is True:
                    masters += 1
                else:
                    slaves += 1
            stats.append({'tags': {'vpool_name': mds_service.vpool.name,
                                   'mds_number': mds_service.number,
                                   'environment': environment,
                                   'storagerouter_name': service.storagerouter.name},
                          'fields': {'load': MDSServiceController.get_mds_load(mds_service)[0],
                                     'capacity': mds_service.capacity if mds_service.capacity != -1 else 'infinite',
                                     'masters': masters,
                                     'slaves': slaves},
                          'measurement': 'mds'})
        return False, stats

    @classmethod
    def get_stats_storagerouters(cls):
        """
        Retrieve the number of vDisks and some read/write statistics for all StorageRouters
        """
        if cls._config is None:
            cls.validate_and_retrieve_config()

        stats = []
        errors = False
        environment = cls._config['environment']
        for storagerouter in StorageRouterList.get_storagerouters():
            if len(storagerouter.storagedrivers) == 0:
                cls._logger.debug('StorageRouter {0} does not have any StorageDrivers linked to it, skipping'.format(storagerouter.name))
                continue
            try:
                statistics = storagerouter.statistics
                stats.append({'tags': {'environment': environment,
                                       'storagerouter_name': storagerouter.name},
                              'fields': {'read_byte': statistics['data_read'],
                                         'write_byte': statistics['data_written'],
                                         'operations': statistics['4k_operations'],
                                         'amount_vdisks': len(storagerouter.vdisks_guids),
                                         'read_operations': statistics['4k_read_operations'],
                                         'write_operations': statistics['4k_write_operations']},
                              'measurement': 'storagerouter'})
            except Exception:
                errors = True
                cls._logger.exception('Retrieving statistics for StorageRouter {0} failed'.format(storagerouter.name))
        return errors, stats

    @classmethod
    def get_stats_vpools(cls):
        """
        Retrieve statistics for each vPool
        """
        if cls._config is None:
            cls.validate_and_retrieve_config()

        stats = []
        errors = False
        environment = cls._config['environment']
        for vpool in VPoolList.get_vpools():
            try:
                stats.append({'tags': {'vpool_name': vpool.name,
                                       'environment': environment},
                              'fields': cls._convert_to_float_values(cls._pop_realtime_info(vpool.statistics)),
                              'measurement': 'vpool'})
            except Exception:
                errors = True
                cls._logger.exception('Retrieving statistics for vPool {0} failed'.format(vpool.name))
        return errors, stats
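# A minimal, self-contained sketch (assumptions, not the actual StatsMonkey implementation) of the
# dispatch convention described in run_all: every method whose name starts with 'get_stats_' is picked
# up, returns (errors_occurred, stats) and can be throttled per method via a configured interval in
# seconds. The interval dict, default interval and sample data below are hypothetical.
import time


class StatsCollectorSketch(object):
    _intervals = {'get_stats_mds': 20}  # e.g. check the MDS statistics every 20 seconds
    _last_run = {}

    @classmethod
    def get_stats_mds(cls):
        # Dummy collector: no errors, one sample data point
        return False, [{'measurement': 'mds', 'tags': {}, 'fields': {'masters': 0, 'slaves': 0}}]

    @classmethod
    def run_all_get_stat_methods(cls):
        all_stats = []
        for name in dir(cls):
            if not name.startswith('get_stats_'):
                continue
            interval = cls._intervals.get(name, 60)  # hypothetical default of 60 seconds
            now = time.time()
            if now - cls._last_run.get(name, 0) < interval:
                continue  # this collector is not due yet
            errors, stats = getattr(cls, name)()
            cls._last_run[name] = now
            if errors is False:
                all_stats.extend(stats)  # the real code would push these to InfluxDB / Redis
        return all_stats


if __name__ == '__main__':
    print(StatsCollectorSketch.run_all_get_stat_methods())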
Example No. 7
class GenericController(object):
    """
    This controller contains all generic task code. These tasks can be
    executed at certain intervals and should be self-contained
    """
    _logger = Logger('lib')

    @staticmethod
    @ovs_task(name='ovs.generic.snapshot_all_vdisks',
              schedule=Schedule(minute='0', hour='*'),
              ensure_single_info={
                  'mode': 'DEFAULT',
                  'extra_task_names': ['ovs.generic.delete_snapshots']
              })
    def snapshot_all_vdisks():
        """
        Snapshots all vDisks
        """
        GenericController._logger.info('[SSA] started')
        success = []
        fail = []
        for vdisk in VDiskList.get_vdisks():
            if vdisk.is_vtemplate is True:
                continue
            try:
                metadata = {
                    'label': '',
                    'is_consistent': False,
                    'timestamp': str(int(time.time())),
                    'is_automatic': True,
                    'is_sticky': False
                }
                VDiskController.create_snapshot(vdisk_guid=vdisk.guid,
                                                metadata=metadata)
                success.append(vdisk.guid)
            except Exception:
                GenericController._logger.exception(
                    'Error taking snapshot for vDisk {0}'.format(vdisk.guid))
                fail.append(vdisk.guid)
        GenericController._logger.info(
            '[SSA] Snapshot has been taken for {0} vDisks, {1} failed.'.format(
                len(success), len(fail)))
        return success, fail

    @staticmethod
    @ovs_task(name='ovs.generic.delete_snapshots',
              schedule=Schedule(minute='1', hour='2'),
              ensure_single_info={'mode': 'DEFAULT'})
    def delete_snapshots(timestamp=None):
        """
        Delete snapshots & scrubbing policy

        Implemented delete snapshot policy (age | bucket size | # buckets | keep per bucket | total covered):
        < 1d | 1d bucket | 1 | best of bucket   | 1d
        < 1w | 1d bucket | 6 | oldest of bucket | 7d = 1w
        < 1m | 1w bucket | 3 | oldest of bucket | 4w = 1m
        > 1m | delete

        :param timestamp: Timestamp to determine whether snapshots should be kept or not, if none provided, current time will be used
        :type timestamp: float

        :return: None
        """
        GenericController._logger.info('Delete snapshots started')

        day = timedelta(1)
        week = day * 7

        def make_timestamp(offset):
            """
            Create an integer based timestamp
            :param offset: Offset in days
            :return: Timestamp
            """
            return int(mktime((base - offset).timetuple()))

        # Calculate bucket structure
        if timestamp is None:
            timestamp = time.time()
        base = datetime.fromtimestamp(timestamp).date() - day
        buckets = []
        # Buckets first 7 days: [0-1[, [1-2[, [2-3[, [3-4[, [4-5[, [5-6[, [6-7[
        for i in xrange(0, 7):
            buckets.append({
                'start': make_timestamp(day * i),
                'end': make_timestamp(day * (i + 1)),
                'type': '1d',
                'snapshots': []
            })
        # Week buckets next 3 weeks: [7-14[, [14-21[, [21-28[
        for i in xrange(1, 4):
            buckets.append({
                'start': make_timestamp(week * i),
                'end': make_timestamp(week * (i + 1)),
                'type': '1w',
                'snapshots': []
            })
        buckets.append({
            'start': make_timestamp(week * 4),
            'end': 0,
            'type': 'rest',
            'snapshots': []
        })
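        # Resulting layout (base = the day before `timestamp`):
        #   - 7 one-day buckets covering [0-1[ .. [6-7[ days ago: the first keeps its best snapshot, the others keep their oldest
        #   - 3 one-week buckets covering [7-14[, [14-21[ and [21-28[ days ago: each keeps its oldest snapshot
        #   - 1 'rest' bucket (end=0) for everything older than 4 weeks: all of its snapshots get deleted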

        # Get a list of all snapshots that are used as parents for clones
        parent_snapshots = set(
            [vd.parentsnapshot for vd in VDiskList.get_with_parent_snaphots()])

        # Place all snapshots in bucket_chains
        bucket_chains = []
        for vdisk in VDiskList.get_vdisks():
            if vdisk.info['object_type'] in ['BASE']:
                bucket_chain = copy.deepcopy(buckets)
                for snapshot in vdisk.snapshots:
                    if snapshot.get('is_sticky') is True:
                        continue
                    if snapshot['guid'] in parent_snapshots:
                        GenericController._logger.info(
                            'Not deleting snapshot {0} because it has clones'.
                            format(snapshot['guid']))
                        continue
                    timestamp = int(snapshot['timestamp'])
                    for bucket in bucket_chain:
                        if bucket['start'] >= timestamp > bucket['end']:
                            bucket['snapshots'].append({'timestamp': timestamp,
                                                        'snapshot_id': snapshot['guid'],
                                                        'vdisk_guid': vdisk.guid,
                                                        'is_consistent': snapshot['is_consistent']})
                bucket_chains.append(bucket_chain)

        # Clean out the snapshot bucket_chains, we delete the snapshots we want to keep
        # And we'll remove all snapshots that remain in the buckets
        for bucket_chain in bucket_chains:
            first = True
            for bucket in bucket_chain:
                if first is True:
                    best = None
                    for snapshot in bucket['snapshots']:
                        if best is None:
                            best = snapshot
                        # Consistent is better than inconsistent
                        elif snapshot['is_consistent'] and not best['is_consistent']:
                            best = snapshot
                        # Newer (larger timestamp) is better than older snapshots
                        elif snapshot['is_consistent'] == best['is_consistent'] and \
                                snapshot['timestamp'] > best['timestamp']:
                            best = snapshot
                    bucket['snapshots'] = [
                        s for s in bucket['snapshots']
                        if s['timestamp'] != best['timestamp']
                    ]
                    first = False
                elif bucket['end'] > 0:
                    oldest = None
                    for snapshot in bucket['snapshots']:
                        if oldest is None:
                            oldest = snapshot
                        # Older (smaller timestamp) is the one we want to keep
                        elif snapshot['timestamp'] < oldest['timestamp']:
                            oldest = snapshot
                    bucket['snapshots'] = [
                        s for s in bucket['snapshots']
                        if s['timestamp'] != oldest['timestamp']
                    ]

        # Delete obsolete snapshots
        for bucket_chain in bucket_chains:
            for bucket in bucket_chain:
                for snapshot in bucket['snapshots']:
                    VDiskController.delete_snapshot(
                        vdisk_guid=snapshot['vdisk_guid'],
                        snapshot_id=snapshot['snapshot_id'])
        GenericController._logger.info('Delete snapshots finished')

    @staticmethod
    @ovs_task(name='ovs.generic.execute_scrub',
              schedule=Schedule(minute='0', hour='3'),
              ensure_single_info={'mode': 'DEDUPED'})
    def execute_scrub(vpool_guids=None,
                      vdisk_guids=None,
                      storagerouter_guid=None,
                      manual=False):
        """
        Divide the scrub work among all StorageRouters with a SCRUB partition
        :param vpool_guids: Guids of the vPools that need to be scrubbed completely
        :type vpool_guids: list
        :param vdisk_guids: Guids of the vDisks that need to be scrubbed
        :type vdisk_guids: list
        :param storagerouter_guid: Guid of the StorageRouter to execute the scrub work on
        :type storagerouter_guid: str
        :param manual: Indicates whether execute_scrub was called manually or as a scheduled task (automatically)
        :type manual: bool
        :return: None
        :rtype: NoneType
        """
        # GenericController.execute_scrub.request.id gets the current celery task id (None if executed directly)
        # Fetch the task_id using hasattr because unit tests do not execute the Celery wrapper (a plain function is called instead of a Celery task)
        if os.environ.get('RUNNING_UNITTESTS') == 'True':
            task_id = 'unittest'
        else:
            task_id = GenericController.execute_scrub.request.id if hasattr(
                GenericController.execute_scrub, 'request') else None
        scrubber = Scrubber(vpool_guids,
                            vdisk_guids,
                            storagerouter_guid,
                            manual=manual,
                            task_id=task_id)
        return scrubber.execute_scrubbing()

    @staticmethod
    @ovs_task(name='ovs.generic.collapse_arakoon',
              schedule=Schedule(minute='10',
                                hour='0,2,4,6,8,10,12,14,16,18,20,22'),
              ensure_single_info={'mode': 'DEFAULT'})
    def collapse_arakoon():
        """
        Collapse Arakoon's Tlogs
        :return: None
        """
        from ovs_extensions.generic.toolbox import ExtensionsToolbox

        GenericController._logger.info('Arakoon collapse started')
        cluster_info = []
        storagerouters = StorageRouterList.get_storagerouters()
        if os.environ.get('RUNNING_UNITTESTS') != 'True':
            cluster_info = [('cacc', storagerouters[0])]

        cluster_names = []
        for service in ServiceList.get_services():
            if service.is_internal is True and service.type.name in (
                    ServiceType.SERVICE_TYPES.ARAKOON,
                    ServiceType.SERVICE_TYPES.NS_MGR,
                    ServiceType.SERVICE_TYPES.ALBA_MGR):
                cluster = ExtensionsToolbox.remove_prefix(
                    service.name, 'arakoon-')
                if cluster in cluster_names and cluster not in [
                        Configuration.ARAKOON_NAME,
                        Configuration.ARAKOON_NAME_UNITTEST
                ]:
                    continue
                cluster_names.append(cluster)
                cluster_info.append((cluster, service.storagerouter))
        workload = {}
        cluster_config_map = {}
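        # Illustrative shape (assumption based on the loops below):
        #   workload[<node_ip>] = {'node_id': <arakoon node name>, 'clusters': [(<cluster_name>, <source_ip or None>), ...]}
        #   cluster_config_map[<cluster_name>] = ArakoonClusterConfig instance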
        for cluster, storagerouter in cluster_info:
            GenericController._logger.debug(
                '  Collecting info for cluster {0}'.format(cluster))
            ip = storagerouter.ip if cluster in [
                Configuration.ARAKOON_NAME, Configuration.ARAKOON_NAME_UNITTEST
            ] else None
            try:
                config = ArakoonClusterConfig(cluster_id=cluster, source_ip=ip)
                cluster_config_map[cluster] = config
            except:
                GenericController._logger.exception(
                    '  Retrieving cluster information on {0} for {1} failed'.
                    format(storagerouter.ip, cluster))
                continue
            for node in config.nodes:
                if node.ip not in workload:
                    workload[node.ip] = {'node_id': node.name, 'clusters': []}
                workload[node.ip]['clusters'].append((cluster, ip))
        for storagerouter in storagerouters:
            try:
                if storagerouter.ip not in workload:
                    continue
                node_workload = workload[storagerouter.ip]
                client = SSHClient(storagerouter)
                for cluster, ip in node_workload['clusters']:
                    try:
                        GenericController._logger.debug(
                            '  Collapsing cluster {0} on {1}'.format(
                                cluster, storagerouter.ip))
                        client.run([
                            'arakoon', '--collapse-local',
                            node_workload['node_id'], '2', '-config',
                            cluster_config_map[cluster].external_config_path
                        ])
                        GenericController._logger.debug(
                            '  Collapsing cluster {0} on {1} completed'.format(
                                cluster, storagerouter.ip))
                    except:
                        GenericController._logger.exception(
                            '  Collapsing cluster {0} on {1} failed'.format(
                                cluster, storagerouter.ip))
            except UnableToConnectException:
                GenericController._logger.error(
                    '  Could not collapse any cluster on {0} (not reachable)'.
                    format(storagerouter.name))
        GenericController._logger.info('Arakoon collapse finished')

    @staticmethod
    @ovs_task(name='ovs.generic.refresh_package_information',
              schedule=Schedule(minute='10', hour='*'),
              ensure_single_info={'mode': 'DEFAULT'})
    def refresh_package_information():
        """
        Retrieve and store the package information of all StorageRouters
        :return: None
        """
        GenericController._logger.info('Updating package information')

        client_map = {}
        prerequisites = []
        package_info_cluster = {}
        all_storagerouters = StorageRouterList.get_storagerouters()
        all_storagerouters.sort(key=lambda sr: ExtensionsToolbox.advanced_sort(
            element=sr.ip, separator='.'))
        for storagerouter in all_storagerouters:
            package_info_cluster[storagerouter.ip] = {}
            try:
                # We make use of these clients in Threads --> cached = False
                client_map[storagerouter] = SSHClient(endpoint=storagerouter,
                                                      username='******',
                                                      cached=False)
            except (NotAuthenticatedException, UnableToConnectException):
                GenericController._logger.warning(
                    'StorageRouter {0} is inaccessible'.format(
                        storagerouter.ip))
                prerequisites.append(['node_down', storagerouter.name])
                package_info_cluster[storagerouter.ip]['errors'] = [
                    'StorageRouter {0} is inaccessible'.format(
                        storagerouter.name)
                ]

        # Retrieve for each StorageRouter in the cluster the installed and candidate versions of related packages
        # This also validates whether all required packages have been installed
        GenericController._logger.debug(
            'Retrieving package information for the cluster')
        threads = []
        for storagerouter, client in client_map.iteritems():
            for fct in Toolbox.fetch_hooks(
                    component='update',
                    sub_component='get_package_update_info_cluster'):
                thread = Thread(target=fct,
                                args=(client, package_info_cluster))
                thread.start()
                threads.append(thread)

        for thread in threads:
            thread.join()

        # Retrieve the related downtime / service restart information
        GenericController._logger.debug(
            'Retrieving update information for the cluster')
        update_info_cluster = {}
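        # Illustrative shape (assumption based on the code below): update_info_cluster[<storagerouter_ip>] =
        #     {'errors': [...], <component>: {'packages': {...}, 'prerequisites': [...]}}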
        for storagerouter, client in client_map.iteritems():
            update_info_cluster[storagerouter.ip] = {'errors': package_info_cluster[storagerouter.ip].get('errors', [])}
            for fct in Toolbox.fetch_hooks(
                    component='update',
                    sub_component='get_update_info_cluster'):
                fct(client, update_info_cluster,
                    package_info_cluster[storagerouter.ip])

        # Retrieve the update information for plugins (eg: ALBA, iSCSI)
        GenericController._logger.debug(
            'Retrieving package and update information for the plugins')
        threads = []
        update_info_plugin = {}
        for fct in Toolbox.fetch_hooks('update', 'get_update_info_plugin'):
            thread = Thread(target=fct, args=(update_info_plugin, ))
            thread.start()
            threads.append(thread)

        for thread in threads:
            thread.join()

        # Add the prerequisites
        if len(prerequisites) > 0:
            for ip, component_info in update_info_cluster.iteritems():
                if PackageFactory.COMP_FWK in component_info:
                    component_info[PackageFactory.COMP_FWK]['prerequisites'].extend(prerequisites)

        # Store information in model and collect errors for OVS cluster
        errors = set()
        for storagerouter in all_storagerouters:
            GenericController._logger.debug(
                'Storing update information for StorageRouter {0}'.format(
                    storagerouter.ip))
            update_info = update_info_cluster.get(storagerouter.ip, {})

            # Remove the errors from the update information
            sr_errors = update_info.pop('errors', [])
            if len(sr_errors) > 0:
                errors.update([
                    '{0}: {1}'.format(storagerouter.ip, error)
                    for error in sr_errors
                ])
                update_info = {}  # If any error occurred, we store no update information for this StorageRouter

            # Remove the components without updates from the update information
            update_info_copy = copy.deepcopy(update_info)
            for component, info in update_info_copy.iteritems():
                if len(info['packages']) == 0:
                    update_info.pop(component)

            # Store the update information
            storagerouter.package_information = update_info
            storagerouter.save()

        # Collect errors for plugins
        for ip, plugin_errors in update_info_plugin.iteritems():
            if len(plugin_errors) > 0:
                errors.update(
                    ['{0}: {1}'.format(ip, error) for error in plugin_errors])

        if len(errors) > 0:
            raise Exception('\n - {0}'.format('\n - '.join(errors)))
        GenericController._logger.info('Finished updating package information')

    @staticmethod
    @ovs_task(name='ovs.generic.run_backend_domain_hooks')
    def run_backend_domain_hooks(backend_guid):
        """
        Run hooks when the Backend Domains have been updated
        :param backend_guid: Guid of the Backend to update
        :type backend_guid: str
        :return: None
        """
        for fct in Toolbox.fetch_hooks('backend', 'domains-update'):
            fct(backend_guid=backend_guid)
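# A minimal, self-contained sketch (not the OVS code itself) of the per-bucket selection rule used in
# delete_snapshots above: for the most recent bucket, a consistent snapshot beats an inconsistent one
# and, consistency being equal, the newest (largest timestamp) wins. The sample data is hypothetical.
def pick_best_snapshot(snapshots):
    """Return the snapshot to keep for the most recent bucket, or None for an empty bucket."""
    best = None
    for snapshot in snapshots:
        if best is None:
            best = snapshot
        elif snapshot['is_consistent'] and not best['is_consistent']:
            best = snapshot  # consistent is better than inconsistent
        elif snapshot['is_consistent'] == best['is_consistent'] and snapshot['timestamp'] > best['timestamp']:
            best = snapshot  # newer is better when consistency is equal
    return best


if __name__ == '__main__':
    example = [{'timestamp': 100, 'is_consistent': False},
               {'timestamp': 90, 'is_consistent': True},
               {'timestamp': 95, 'is_consistent': True}]
    print(pick_best_snapshot(example))  # keeps the newest consistent snapshot (timestamp 95)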
Example No. 8
class StorageDriverController(object):
    """
    Contains all BLL related to Storage Drivers
    """
    _logger = LogHandler.get('lib', name='storagedriver')

    ################
    # CELERY TASKS #
    ################
    @staticmethod
    @ovs_task(name='ovs.storagedriver.mark_offline')
    def mark_offline(storagerouter_guid):
        """
        Marks all StorageDrivers on this StorageRouter offline
        :param storagerouter_guid: Guid of the Storage Router
        :type storagerouter_guid: str
        :return: None
        """
        for storagedriver in StorageRouter(storagerouter_guid).storagedrivers:
            vpool = storagedriver.vpool
            if len(vpool.storagedrivers) > 1:
                storagedriver_client = StorageDriverClient.load(
                    vpool, excluded_storagedrivers=[storagedriver])
                storagedriver_client.mark_node_offline(
                    str(storagedriver.storagedriver_id))

    @staticmethod
    @ovs_task(name='ovs.storagedriver.volumedriver_error')
    @log('VOLUMEDRIVER_TASK')
    def volumedriver_error(code, volume_id):
        """
        Handles error messages/events from the volumedriver
        :param code: Volumedriver error code
        :type code: int
        :param volume_id: Name of the volume throwing the error
        :type volume_id: str
        :return: None
        """
        if code == VolumeDriverEvents_pb2.MDSFailover:
            disk = VDiskList.get_vdisk_by_volume_id(volume_id)
            if disk is not None:
                MDSServiceController.ensure_safety(vdisk_guid=disk.guid)

    @staticmethod
    @ovs_task(name='ovs.storagedriver.cluster_registry_checkup',
              schedule=Schedule(minute='0', hour='0'),
              ensure_single_info={'mode': 'CHAINED'})
    def cluster_registry_checkup():
        """
        Verify whether changes have occurred in the cluster registry for each vPool
        :return: Information whether changes occurred
        :rtype: dict
        """
        changed_vpools = {}
        for vpool in VPoolList.get_vpools():
            changed_vpools[vpool.guid] = {'changes': False, 'success': True}
            try:
                StorageDriverController._logger.info(
                    'Validating cluster registry settings for Vpool {0}'.
                    format(vpool.guid))

                current_configs = vpool.clusterregistry_client.get_node_configs()
                changes = len(current_configs) == 0
                node_configs = []
                for sd in vpool.storagedrivers:
                    sd.invalidate_dynamics(['cluster_node_config'])
                    new_config = sd.cluster_node_config
                    node_configs.append(ClusterNodeConfig(**new_config))
                    if changes is False:
                        current_node_configs = [
                            config for config in current_configs
                            if config.vrouter_id == sd.storagedriver_id
                        ]
                        if len(current_node_configs) == 1:
                            current_node_config = current_node_configs[0]
                            for key in new_config:
                                if getattr(current_node_config,
                                           key) != new_config[key]:
                                    changes = True
                                    break
                changed_vpools[vpool.guid]['changes'] = changes

                if changes is True:
                    StorageDriverController._logger.info(
                        'Cluster registry settings for Vpool {0} need to be updated'
                        .format(vpool.guid))
                    available_storagedrivers = []
                    for sd in vpool.storagedrivers:
                        storagerouter = sd.storagerouter
                        try:
                            SSHClient(storagerouter, username='******')
                        except UnableToConnectException:
                            StorageDriverController._logger.warning(
                                'StorageRouter {0} not available.'.format(
                                    storagerouter.name))
                            continue

                        with remote(storagerouter.ip,
                                    [LocalStorageRouterClient]) as rem:
                            sd_key = '/ovs/vpools/{0}/hosts/{1}/config'.format(
                                vpool.guid, sd.storagedriver_id)
                            if Configuration.exists(sd_key) is True:
                                path = Configuration.get_configuration_path(
                                    sd_key)
                                try:
                                    lsrc = rem.LocalStorageRouterClient(path)
                                    lsrc.server_revision()  # 'Cheap' call to verify whether volumedriver is responsive
                                    available_storagedrivers.append(sd)
                                except Exception as ex:
                                    if 'ClusterNotReachableException' in str(ex):
                                        StorageDriverController._logger.warning(
                                            'StorageDriver {0} on StorageRouter {1} not available.'
                                            .format(sd.guid,
                                                    storagerouter.name))
                                    else:
                                        StorageDriverController._logger.exception(
                                            'Got exception when validating StorageDriver {0} on StorageRouter {1}.'
                                            .format(sd.guid,
                                                    storagerouter.name))

                    StorageDriverController._logger.info(
                        'Updating cluster node configs for vPool {0}'.format(
                            vpool.guid))
                    vpool.clusterregistry_client.set_node_configs(node_configs)
                    for sd in available_storagedrivers:
                        StorageDriverController._logger.info(
                            'Trigger config reload for StorageDriver {0}'.
                            format(sd.guid))
                        vpool.storagedriver_client.update_cluster_node_configs(
                            str(sd.storagedriver_id), req_timeout_secs=10)
                    StorageDriverController._logger.info(
                        'Updating cluster node configs for vPool {0} completed'
                        .format(vpool.guid))
                else:
                    StorageDriverController._logger.info(
                        'Cluster registry settings for vPool {0} are up to date'
                        .format(vpool.guid))
            except Exception as ex:
                StorageDriverController._logger.exception(
                    'Got exception when validating cluster registry settings for vPool {0}.'
                    .format(vpool.name))
                changed_vpools[vpool.guid]['success'] = False
                changed_vpools[vpool.guid]['error'] = ex.message
        return changed_vpools

    @staticmethod
    @ovs_task(name='ovs.storagedriver.scheduled_voldrv_arakoon_checkup',
              schedule=Schedule(minute='15', hour='*'),
              ensure_single_info={'mode': 'DEFAULT',
                                  'extra_task_names': ['ovs.storagedriver.manual_voldrv_arakoon_checkup']})
    def scheduled_voldrv_arakoon_checkup():
        """
        Makes sure the volumedriver arakoon is on all available master nodes
        :return: None
        """
        StorageDriverController._voldrv_arakoon_checkup(False)

    @staticmethod
    @ovs_task(name='ovs.storagedriver.manual_voldrv_arakoon_checkup',
              ensure_single_info={'mode': 'DEFAULT',
                                  'extra_task_names': ['ovs.storagedriver.scheduled_voldrv_arakoon_checkup']})
    def manual_voldrv_arakoon_checkup():
        """
        Creates a new Arakoon Cluster if required and extends cluster if possible on all available master nodes
        :return: True if task completed, None if task was discarded (by decorator)
        :rtype: bool|None
        """
        StorageDriverController._voldrv_arakoon_checkup(True)
        return True

    @staticmethod
    @ovs_task(name='ovs.storagedriver.refresh_configuration')
    def refresh_configuration(storagedriver_guid):
        """
        Refresh the StorageDriver's configuration (Configuration must have been updated manually)
        :param storagedriver_guid: Guid of the StorageDriver
        :type storagedriver_guid: str
        :return: Amount of changes the volumedriver detected
        :rtype: int
        """
        storagedriver = StorageDriver(storagedriver_guid)
        try:
            client = SSHClient(endpoint=storagedriver.storagerouter)
        except UnableToConnectException:
            raise Exception(
                'StorageRouter with IP {0} is not reachable. Cannot refresh the configuration'
                .format(storagedriver.storagerouter.ip))

        storagedriver_config = StorageDriverConfiguration(
            config_type='storagedriver',
            vpool_guid=storagedriver.vpool_guid,
            storagedriver_id=storagedriver.storagedriver_id)
        storagedriver_config.load()
        return len(storagedriver_config.save(client=client, force_reload=True))

    #########
    # HOOKS #
    #########
    @staticmethod
    @add_hooks('nodetype', 'demote')
    def _on_demote(cluster_ip, master_ip, offline_node_ips=None):
        """
        Handles the demote for the StorageDrivers
        :param cluster_ip: IP of the node to demote
        :type cluster_ip: str
        :param master_ip: IP of the master node
        :type master_ip: str
        :param offline_node_ips: IPs of nodes which are offline
        :type offline_node_ips: list
        :return: None
        """
        _ = master_ip
        if offline_node_ips is None:
            offline_node_ips = []
        servicetype = ServiceTypeList.get_by_name(
            ServiceType.SERVICE_TYPES.ARAKOON)
        current_service = None
        remaining_ips = []
        for service in servicetype.services:
            if service.name == 'arakoon-voldrv' and service.is_internal is True:  # Externally managed arakoon cluster services do not have StorageRouters
                if service.storagerouter.ip == cluster_ip:
                    current_service = service
                elif service.storagerouter.ip not in offline_node_ips:
                    remaining_ips.append(service.storagerouter.ip)
        if current_service is not None:
            if len(remaining_ips) == 0:
                raise RuntimeError(
                    'Could not find any remaining arakoon nodes for the voldrv cluster'
                )
            StorageDriverController._logger.debug(
                '* Shrink StorageDriver cluster')
            cluster_name = str(
                Configuration.get('/ovs/framework/arakoon_clusters|voldrv'))
            arakoon_installer = ArakoonInstaller(cluster_name=cluster_name)
            arakoon_installer.load()
            arakoon_installer.shrink_cluster(removal_ip=cluster_ip,
                                             offline_nodes=offline_node_ips)
            arakoon_installer.restart_cluster_after_shrinking()
            current_service.delete()
            StorageDriverController._configure_arakoon_to_volumedriver(
                cluster_name=cluster_name)

    @staticmethod
    @add_hooks('noderemoval', 'remove')
    def _on_remove(cluster_ip, complete_removal):
        """
        Handles the StorageDriver removal part of a node
        :param cluster_ip: IP of the node which is being removed from the cluster
        :type cluster_ip: str
        :param complete_removal: Unused for StorageDriver, used for AlbaController
        :type complete_removal: bool
        :return: None
        """
        _ = complete_removal

        service_manager = ServiceFactory.get_manager()
        service_name = 'watcher-volumedriver'
        try:
            client = SSHClient(endpoint=cluster_ip, username='******')
            if service_manager.has_service(name=service_name, client=client):
                service_manager.stop_service(name=service_name, client=client)
                service_manager.remove_service(name=service_name,
                                               client=client)
        except (UnableToConnectException, NotAuthenticatedException):
            pass

    ####################
    # PUBLIC FUNCTIONS #
    ####################
    @staticmethod
    def add_storagedriverpartition(storagedriver, partition_info):
        """
        Stores a new StorageDriverPartition object with the correct number
        :param storagedriver: StorageDriver to create the partition for
        :type storagedriver: StorageDriver
        :param partition_info: Partition information containing role, size, sub_role, disk partition and MDS service
        :type partition_info: dict
        :return: Newly created storage driver partition
        :rtype: StorageDriverPartition
        """
        role = partition_info['role']
        size = partition_info.get('size')
        sub_role = partition_info.get('sub_role')
        partition = partition_info['partition']
        mds_service = partition_info.get('mds_service')
        highest_number = 0
        for existing_sdp in storagedriver.partitions:
            if existing_sdp.partition_guid == partition.guid and existing_sdp.role == role and existing_sdp.sub_role == sub_role:
                highest_number = max(existing_sdp.number, highest_number)
        sdp = StorageDriverPartition()
        sdp.role = role
        sdp.size = size
        sdp.number = highest_number + 1
        sdp.sub_role = sub_role
        sdp.partition = partition
        sdp.mds_service = mds_service
        sdp.storagedriver = storagedriver
        sdp.save()
        return sdp

    #####################
    # PRIVATE FUNCTIONS #
    #####################
    @staticmethod
    def _voldrv_arakoon_checkup(create_cluster):
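        # Overall flow: collect the current 'voldrv' Arakoon services, create (or claim an unused,
        # externally managed) cluster when none exists yet and create_cluster is True, and finally extend
        # an internally managed cluster to every master StorageRouter with a DB role that does not host it yet.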
        def _add_service(service_storagerouter, arakoon_ports, service_name):
            """ Add a service to the storage router """
            new_service = Service()
            new_service.name = service_name
            new_service.type = service_type
            new_service.ports = arakoon_ports
            new_service.storagerouter = service_storagerouter
            new_service.save()
            return new_service

        current_ips = []
        current_services = []
        service_type = ServiceTypeList.get_by_name(
            ServiceType.SERVICE_TYPES.ARAKOON)
        cluster_name = Configuration.get(
            '/ovs/framework/arakoon_clusters').get('voldrv')
        if cluster_name is not None:
            arakoon_service_name = ArakoonInstaller.get_service_name_for_cluster(
                cluster_name=cluster_name)
            for service in service_type.services:
                if service.name == arakoon_service_name:
                    current_services.append(service)
                    if service.is_internal is True:
                        current_ips.append(service.storagerouter.ip)

        all_sr_ips = [
            storagerouter.ip
            for storagerouter in StorageRouterList.get_slaves()
        ]
        available_storagerouters = {}
        for storagerouter in StorageRouterList.get_masters():
            storagerouter.invalidate_dynamics(['partition_config'])
            if len(storagerouter.partition_config[DiskPartition.ROLES.DB]) > 0:
                available_storagerouters[storagerouter] = DiskPartition(
                    storagerouter.partition_config[DiskPartition.ROLES.DB][0])
            all_sr_ips.append(storagerouter.ip)

        if create_cluster is True and len(current_services) == 0:  # Create new cluster
            metadata = ArakoonInstaller.get_unused_arakoon_metadata_and_claim(
                cluster_type=ServiceType.ARAKOON_CLUSTER_TYPES.SD)
            if metadata is None:  # No externally managed cluster found, so we create one ourselves
                if not available_storagerouters:
                    raise RuntimeError(
                        'Could not find any Storage Router with a DB role')

                storagerouter, partition = available_storagerouters.items()[0]
                arakoon_voldrv_cluster = 'voldrv'
                arakoon_installer = ArakoonInstaller(
                    cluster_name=arakoon_voldrv_cluster)
                arakoon_installer.create_cluster(
                    cluster_type=ServiceType.ARAKOON_CLUSTER_TYPES.SD,
                    ip=storagerouter.ip,
                    base_dir=partition.folder,
                    log_sinks=LogHandler.get_sink_path(
                        'arakoon-server_{0}'.format(arakoon_voldrv_cluster)),
                    crash_log_sinks=LogHandler.get_sink_path(
                        'arakoon-server-crash_{0}'.format(
                            arakoon_voldrv_cluster)))
                arakoon_installer.start_cluster()
                ports = arakoon_installer.ports[storagerouter.ip]
                metadata = arakoon_installer.metadata
                current_ips.append(storagerouter.ip)
            else:
                ports = []
                storagerouter = None

            cluster_name = metadata['cluster_name']
            Configuration.set('/ovs/framework/arakoon_clusters|voldrv',
                              cluster_name)
            StorageDriverController._logger.info(
                'Claiming {0} managed arakoon cluster: {1}'.format(
                    'externally' if storagerouter is None else 'internally',
                    cluster_name))
            StorageDriverController._configure_arakoon_to_volumedriver(
                cluster_name=cluster_name)
            current_services.append(
                _add_service(
                    service_storagerouter=storagerouter,
                    arakoon_ports=ports,
                    service_name=ArakoonInstaller.get_service_name_for_cluster(
                        cluster_name=cluster_name)))

        cluster_name = Configuration.get(
            '/ovs/framework/arakoon_clusters').get('voldrv')
        if cluster_name is None:
            return
        metadata = ArakoonInstaller.get_arakoon_metadata_by_cluster_name(
            cluster_name=cluster_name)
        if 0 < len(current_services) < len(available_storagerouters) and metadata['internal'] is True:
            for storagerouter, partition in available_storagerouters.iteritems():
                if storagerouter.ip in current_ips:
                    continue
                arakoon_installer = ArakoonInstaller(cluster_name=cluster_name)
                arakoon_installer.load()
                arakoon_installer.extend_cluster(
                    new_ip=storagerouter.ip,
                    base_dir=partition.folder,
                    log_sinks=LogHandler.get_sink_path(
                        'arakoon-server_{0}'.format(cluster_name)),
                    crash_log_sinks=LogHandler.get_sink_path(
                        'arakoon-server-crash_{0}'.format(cluster_name)))
                _add_service(
                    service_storagerouter=storagerouter,
                    arakoon_ports=arakoon_installer.ports[storagerouter.ip],
                    service_name=ArakoonInstaller.get_service_name_for_cluster(
                        cluster_name=cluster_name))
                current_ips.append(storagerouter.ip)
                arakoon_installer.restart_cluster_after_extending(
                    new_ip=storagerouter.ip)
            StorageDriverController._configure_arakoon_to_volumedriver(
                cluster_name=cluster_name)

    @staticmethod
    def _configure_arakoon_to_volumedriver(cluster_name):
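        """
        Points the volume registry and distributed lock store of every StorageDriver of every vPool to the
        given Arakoon cluster, based on the node layout in its ArakoonClusterConfig
        :param cluster_name: Name of the Arakoon cluster to configure
        :type cluster_name: str
        """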
        StorageDriverController._logger.info('Update existing vPools')
        config = ArakoonClusterConfig(cluster_id=cluster_name)
        arakoon_nodes = []
        for node in config.nodes:
            arakoon_nodes.append({
                'host': node.ip,
                'port': node.client_port,
                'node_id': node.name
            })
        if Configuration.dir_exists('/ovs/vpools'):
            for vpool_guid in Configuration.list('/ovs/vpools'):
                for storagedriver_id in Configuration.list(
                        '/ovs/vpools/{0}/hosts'.format(vpool_guid)):
                    storagedriver_config = StorageDriverConfiguration(
                        'storagedriver', vpool_guid, storagedriver_id)
                    storagedriver_config.load()
                    storagedriver_config.configure_volume_registry(
                        vregistry_arakoon_cluster_id=cluster_name,
                        vregistry_arakoon_cluster_nodes=arakoon_nodes)
                    storagedriver_config.configure_distributed_lock_store(
                        dls_type='Arakoon',
                        dls_arakoon_cluster_id=cluster_name,
                        dls_arakoon_cluster_nodes=arakoon_nodes)
                    storagedriver_config.save()

    @staticmethod
    def generate_backoff_gap_settings(cache_size):
        """
        Generates decent gap sizes for a given cache size
        :param cache_size: Size of the cache on which the gap sizes should be based
        :type cache_size: int
        :return: Dictionary with keys 'trigger' and 'backoff', containing their sizes in bytes
        :rtype: dict
        """
        if cache_size is None:
            StorageDriverController._logger.warning(
                'Got request to calculate gaps for None as cache size. Returning default (backoff 2 GiB, trigger 1 GiB)')
            return {'backoff': 2 * 1024**3, 'trigger': 1 * 1024**3}
        gap_configuration = {}
        # Below "settings" = [factor of smallest parition size, maximum size in GiB, minimum size in bytes]
        for gap, gap_settings in {
                'backoff': [0.1, 50, 2],
                'trigger': [0.08, 40, 1]
        }.iteritems():
            current_config = int(cache_size * gap_settings[0])
            current_config = min(current_config, gap_settings[1] * 1024**3)
            current_config = max(current_config, gap_settings[2])
            gap_configuration[gap] = current_config
        return gap_configuration
Ejemplo n.º 9
class MigrationController(object):
    """
    This controller contains (part of the) migration code. It runs out-of-band with the updater so we reduce the risk of
    failures during the update
    """
    _logger = LogHandler.get('lib', name='migrations')

    @staticmethod
    @ovs_task(name='ovs.migration.migrate', schedule=Schedule(minute='0', hour='6'), ensure_single_info={'mode': 'DEFAULT'})
    def migrate():
        """
        Executes async migrations. It doesn't matter too much when they are executed, as long as they get eventually
        executed. This code will typically contain:
        * "dangerous" migration code (it needs certain running services)
        * Migration code depending on a cluster-wide state
        * ...
        """
        MigrationController._logger.info('Preparing out of band migrations...')

        from ovs.dal.lists.storagedriverlist import StorageDriverList
        from ovs.dal.lists.storagerouterlist import StorageRouterList
        from ovs.dal.lists.vpoollist import VPoolList
        from ovs.extensions.generic.configuration import Configuration
        from ovs.extensions.generic.sshclient import SSHClient
        from ovs_extensions.generic.toolbox import ExtensionsToolbox
        from ovs_extensions.services.interfaces.systemd import Systemd
        from ovs.extensions.services.servicefactory import ServiceFactory
        from ovs.extensions.storageserver.storagedriver import StorageDriverConfiguration
        from ovs.lib.generic import GenericController

        MigrationController._logger.info('Start out of band migrations...')
        service_manager = ServiceFactory.get_manager()

        sr_client_map = {}
        for storagerouter in StorageRouterList.get_storagerouters():
            sr_client_map[storagerouter.guid] = SSHClient(endpoint=storagerouter,
                                                          username='******')

        #########################################################
        # Addition of 'ExecReload' for AlbaProxy SystemD services
        if ServiceFactory.get_service_type() == 'systemd':
            changed_clients = set()
            for storagedriver in StorageDriverList.get_storagedrivers():
                root_client = sr_client_map[storagedriver.storagerouter_guid]
                for alba_proxy in storagedriver.alba_proxies:
                    service = alba_proxy.service
                    service_name = 'ovs-{0}'.format(service.name)
                    if not service_manager.has_service(name=service_name, client=root_client):
                        continue
                    if 'ExecReload=' in root_client.file_read(filename='/lib/systemd/system/{0}.service'.format(service_name)):
                        continue

                    try:
                        service_manager.regenerate_service(name='ovs-albaproxy', client=root_client, target_name=service_name)
                        changed_clients.add(root_client)
                    except:
                        MigrationController._logger.exception('Error rebuilding service {0}'.format(service_name))
            for root_client in changed_clients:
                root_client.run(['systemctl', 'daemon-reload'])

        ##################################################################
        # Adjustment of open file descriptors for Arakoon services to 8192
        changed_clients = set()
        for storagerouter in StorageRouterList.get_storagerouters():
            root_client = sr_client_map[storagerouter.guid]
            for service_name in service_manager.list_services(client=root_client):
                if not service_name.startswith('ovs-arakoon-'):
                    continue

                if ServiceFactory.get_service_type() == 'systemd':
                    path = '/lib/systemd/system/{0}.service'.format(service_name)
                    check = 'LimitNOFILE=8192'
                else:
                    path = '/etc/init/{0}.conf'.format(service_name)
                    check = 'limit nofile 8192 8192'

                if not root_client.file_exists(path):
                    continue
                if check in root_client.file_read(path):
                    continue

                try:
                    service_manager.regenerate_service(name='ovs-arakoon', client=root_client, target_name=service_name)
                    changed_clients.add(root_client)
                    ExtensionsToolbox.edit_version_file(client=root_client, package_name='arakoon', old_service_name=service_name)
                except:
                    MigrationController._logger.exception('Error rebuilding service {0}'.format(service_name))
        for root_client in changed_clients:
            root_client.run(['systemctl', 'daemon-reload'])

        #############################
        # Migrate to multiple proxies
        for storagedriver in StorageDriverList.get_storagedrivers():
            vpool = storagedriver.vpool
            root_client = sr_client_map[storagedriver.storagerouter_guid]
            for alba_proxy in storagedriver.alba_proxies:
                # Rename alba_proxy service in model
                service = alba_proxy.service
                old_service_name = 'albaproxy_{0}'.format(vpool.name)
                new_service_name = 'albaproxy_{0}_0'.format(vpool.name)
                if old_service_name != service.name:
                    continue
                service.name = new_service_name
                service.save()

                if not service_manager.has_service(name=old_service_name, client=root_client):
                    continue
                old_configuration_key = '/ovs/framework/hosts/{0}/services/{1}'.format(storagedriver.storagerouter.machine_id, old_service_name)
                if not Configuration.exists(key=old_configuration_key):
                    continue

                # Add '-reboot' to alba_proxy services (because of newly created services and removal of old service)
                ExtensionsToolbox.edit_version_file(client=root_client,
                                                    package_name='alba',
                                                    old_service_name=old_service_name,
                                                    new_service_name=new_service_name)

                # Register new service and remove old service
                service_manager.add_service(name='ovs-albaproxy',
                                            client=root_client,
                                            params=Configuration.get(old_configuration_key),
                                            target_name='ovs-{0}'.format(new_service_name))

                # Update scrub proxy config
                proxy_config_key = '/ovs/vpools/{0}/proxies/{1}/config/main'.format(vpool.guid, alba_proxy.guid)
                proxy_config = None if Configuration.exists(key=proxy_config_key) is False else Configuration.get(proxy_config_key)
                if proxy_config is not None:
                    fragment_cache = proxy_config.get('fragment_cache', ['none', {}])
                    if fragment_cache[0] == 'alba' and fragment_cache[1].get('cache_on_write') is True:  # Accelerated ALBA configured
                        fragment_cache_scrub_info = copy.deepcopy(fragment_cache)
                        fragment_cache_scrub_info[1]['cache_on_read'] = False
                        proxy_scrub_config_key = '/ovs/vpools/{0}/proxies/scrub/generic_scrub'.format(vpool.guid)
                        proxy_scrub_config = None if Configuration.exists(key=proxy_scrub_config_key) is False else Configuration.get(proxy_scrub_config_key)
                        if proxy_scrub_config is not None and proxy_scrub_config['fragment_cache'] == ['none']:
                            proxy_scrub_config['fragment_cache'] = fragment_cache_scrub_info
                            Configuration.set(proxy_scrub_config_key,
                                              json.dumps(proxy_scrub_config, indent=4),
                                              raw=True)

            # Update 'backend_connection_manager' section
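            # Target layout produced below (sketch):
            #   {'backend_type': 'MULTI',
            #    '0': {<per-proxy connection settings>, 'alba_connection_use_rora': True, ...},
            #    '1': {...},
            #    'backend_interface_*': <shared settings hoisted to the top level>}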
            changes = False
            storagedriver_config = StorageDriverConfiguration('storagedriver', vpool.guid, storagedriver.storagedriver_id)
            storagedriver_config.load()
            if 'backend_connection_manager' not in storagedriver_config.configuration:
                continue

            current_config = storagedriver_config.configuration['backend_connection_manager']
            if current_config.get('backend_type') != 'MULTI':
                changes = True
                backend_connection_manager = {'backend_type': 'MULTI'}
                for index, proxy in enumerate(sorted(storagedriver.alba_proxies, key=lambda pr: pr.service.ports[0])):
                    backend_connection_manager[str(index)] = copy.deepcopy(current_config)
                    # noinspection PyUnresolvedReferences
                    backend_connection_manager[str(index)]['alba_connection_use_rora'] = True
                    # noinspection PyUnresolvedReferences
                    backend_connection_manager[str(index)]['alba_connection_rora_manifest_cache_capacity'] = 5000
                    # noinspection PyUnresolvedReferences
                    for key, value in backend_connection_manager[str(index)].items():
                        if key.startswith('backend_interface'):
                            backend_connection_manager[key] = value
                            # noinspection PyUnresolvedReferences
                            del backend_connection_manager[str(index)][key]
                for key, value in {'backend_interface_retries_on_error': 5,
                                   'backend_interface_retry_interval_secs': 1,
                                   'backend_interface_retry_backoff_multiplier': 2.0}.iteritems():
                    if key not in backend_connection_manager:
                        backend_connection_manager[key] = value
            else:
                backend_connection_manager = current_config
                for value in backend_connection_manager.values():
                    if isinstance(value, dict):
                        for key, val in value.items():
                            if key.startswith('backend_interface'):
                                backend_connection_manager[key] = val
                                changes = True
                                del value[key]
                for key, value in {'backend_interface_retries_on_error': 5,
                                   'backend_interface_retry_interval_secs': 1,
                                   'backend_interface_retry_backoff_multiplier': 2.0}.iteritems():
                    if key not in backend_connection_manager:
                        changes = True
                        backend_connection_manager[key] = value

            if changes is True:
                storagedriver_config.clear_backend_connection_manager()
                storagedriver_config.configure_backend_connection_manager(**backend_connection_manager)
                storagedriver_config.save(root_client)

                # Add '-reboot' to volumedriver services (because of updated 'backend_connection_manager' section)
                ExtensionsToolbox.edit_version_file(client=root_client,
                                                    package_name='volumedriver',
                                                    old_service_name='volumedriver_{0}'.format(vpool.name))
                if service_manager.ImplementationClass == Systemd:
                    root_client.run(['systemctl', 'daemon-reload'])

        ########################################
        # Update metadata_store_bits information
        for vpool in VPoolList.get_vpools():
            bits = None
            for storagedriver in vpool.storagedrivers:
                key = '/ovs/framework/hosts/{0}/services/volumedriver_{1}'.format(storagedriver.storagerouter.machine_id, vpool.name)
                if Configuration.exists(key=key) and 'METADATASTORE_BITS' not in Configuration.get(key=key):
                    if bits is None:
                        entries = service_manager.extract_from_service_file(name='ovs-volumedriver_{0}'.format(vpool.name),
                                                                            client=sr_client_map[storagedriver.storagerouter_guid],
                                                                            entries=['METADATASTORE_BITS='])
                        if len(entries) == 1:
                            bits = entries[0].split('=')[-1]
                            bits = int(bits) if bits.isdigit() else 5
                    if bits is not None:
                        try:
                            content = Configuration.get(key=key)
                            content['METADATASTORE_BITS'] = bits
                            Configuration.set(key=key, value=content)
                        except:
                            MigrationController._logger.exception('Error updating volumedriver info for vPool {0} on StorageRouter {1}'.format(vpool.name, storagedriver.storagerouter.name))

            if bits is not None:
                vpool.metadata_store_bits = bits
                vpool.save()

        MigrationController._logger.info('Finished out of band migrations')
        GenericController.refresh_package_information()
Ejemplo n.º 10
class GenericController(object):
    """
    This controller contains all generic task code. These tasks can be
    executed at certain intervals and should be self-containing
    """
    _logger = LogHandler.get('lib', name='generic tasks')

    @staticmethod
    @ovs_task(name='ovs.generic.snapshot_all_vdisks',
              schedule=Schedule(minute='0', hour='*'),
              ensure_single_info={
                  'mode': 'DEFAULT',
                  'extra_task_names': ['ovs.generic.delete_snapshots']
              })
    def snapshot_all_vdisks():
        """
        Snapshots all vDisks
        """
        GenericController._logger.info('[SSA] started')
        success = []
        fail = []
        for vdisk in VDiskList.get_vdisks():
            if vdisk.is_vtemplate is True:
                continue
            try:
                metadata = {
                    'label': '',
                    'is_consistent': False,
                    'timestamp': str(int(time.time())),
                    'is_automatic': True,
                    'is_sticky': False
                }
                VDiskController.create_snapshot(vdisk_guid=vdisk.guid,
                                                metadata=metadata)
                success.append(vdisk.guid)
            except Exception:
                GenericController._logger.exception(
                    'Error taking snapshot for vDisk {0}'.format(vdisk.guid))
                fail.append(vdisk.guid)
        GenericController._logger.info(
            '[SSA] Snapshot has been taken for {0} vDisks, {1} failed.'.format(
                len(success), len(fail)))
        return success, fail

    @staticmethod
    @ovs_task(name='ovs.generic.delete_snapshots',
              schedule=Schedule(minute='1', hour='2'),
              ensure_single_info={'mode': 'DEFAULT'})
    def delete_snapshots(timestamp=None):
        """
        Delete snapshots & scrubbing policy

        Implemented delete snapshot policy:
        age  | bucket size | # buckets | snapshot kept     | covered period
        < 1d | 1d bucket   | 1         | best of bucket    | 1d
        < 1w | 1d bucket   | 6         | oldest of bucket  | 7d = 1w
        < 1m | 1w bucket   | 3         | oldest of bucket  | 4w = 1m
        > 1m | delete

        :param timestamp: Timestamp to determine whether snapshots should be kept or not; if None is provided, the current time will be used
        :type timestamp: float

        :return: None
        """
        GenericController._logger.info('Delete snapshots started')

        day = timedelta(1)
        week = day * 7

        def make_timestamp(offset):
            """
            Create an integer based timestamp
            :param offset: Offset in days
            :return: Timestamp
            """
            return int(mktime((base - offset).timetuple()))

        # Calculate bucket structure
        if timestamp is None:
            timestamp = time.time()
        base = datetime.fromtimestamp(timestamp).date() - day
        buckets = []
        # Buckets first 7 days: [0-1[, [1-2[, [2-3[, [3-4[, [4-5[, [5-6[, [6-7[
        for i in xrange(0, 7):
            buckets.append({
                'start': make_timestamp(day * i),
                'end': make_timestamp(day * (i + 1)),
                'type': '1d',
                'snapshots': []
            })
        # Week buckets next 3 weeks: [7-14[, [14-21[, [21-28[
        for i in xrange(1, 4):
            buckets.append({
                'start': make_timestamp(week * i),
                'end': make_timestamp(week * (i + 1)),
                'type': '1w',
                'snapshots': []
            })
        buckets.append({
            'start': make_timestamp(week * 4),
            'end': 0,
            'type': 'rest',
            'snapshots': []
        })

        # Get a list of all snapshots that are used as parents for clones
        parent_snapshots = set(
            [vd.parentsnapshot for vd in VDiskList.get_with_parent_snaphots()])

        # Place all snapshots in bucket_chains
        bucket_chains = []
        for vdisk in VDiskList.get_vdisks():
            if vdisk.info['object_type'] in ['BASE']:
                bucket_chain = copy.deepcopy(buckets)
                for snapshot in vdisk.snapshots:
                    if snapshot.get('is_sticky') is True:
                        continue
                    if snapshot['guid'] in parent_snapshots:
                        GenericController._logger.info(
                            'Not deleting snapshot {0} because it has clones'.
                            format(snapshot['guid']))
                        continue
                    timestamp = int(snapshot['timestamp'])
                    for bucket in bucket_chain:
                        if bucket['start'] >= timestamp > bucket['end']:
                            bucket['snapshots'].append({
                                'timestamp': timestamp,
                                'snapshot_id': snapshot['guid'],
                                'vdisk_guid': vdisk.guid,
                                'is_consistent': snapshot['is_consistent']
                            })
                bucket_chains.append(bucket_chain)

        # Clean out the snapshot bucket_chains, we delete the snapshots we want to keep
        # And we'll remove all snapshots that remain in the buckets
        for bucket_chain in bucket_chains:
            first = True
            for bucket in bucket_chain:
                if first is True:
                    best = None
                    for snapshot in bucket['snapshots']:
                        if best is None:
                            best = snapshot
                        # Consistent is better than inconsistent
                        elif snapshot['is_consistent'] and not best['is_consistent']:
                            best = snapshot
                        # Newer (larger timestamp) is better than older snapshots
                        elif snapshot['is_consistent'] == best['is_consistent'] and \
                                snapshot['timestamp'] > best['timestamp']:
                            best = snapshot
                    bucket['snapshots'] = [
                        s for s in bucket['snapshots']
                        if s['timestamp'] != best['timestamp']
                    ]
                    first = False
                elif bucket['end'] > 0:
                    oldest = None
                    for snapshot in bucket['snapshots']:
                        if oldest is None:
                            oldest = snapshot
                        # Older (smaller timestamp) is the one we want to keep
                        elif snapshot['timestamp'] < oldest['timestamp']:
                            oldest = snapshot
                    bucket['snapshots'] = [
                        s for s in bucket['snapshots']
                        if s['timestamp'] != oldest['timestamp']
                    ]

        # Delete obsolete snapshots
        for bucket_chain in bucket_chains:
            for bucket in bucket_chain:
                for snapshot in bucket['snapshots']:
                    VDiskController.delete_snapshot(
                        vdisk_guid=snapshot['vdisk_guid'],
                        snapshot_id=snapshot['snapshot_id'])
        GenericController._logger.info('Delete snapshots finished')

    @staticmethod
    @ovs_task(name='ovs.generic.execute_scrub',
              schedule=Schedule(minute='0', hour='3'),
              ensure_single_info={'mode': 'DEDUPED'})
    def execute_scrub(vpool_guids=None,
                      vdisk_guids=None,
                      storagerouter_guid=None):
        """
        Divide the scrub work among all StorageRouters with a SCRUB partition
        :param vpool_guids: Guids of the vPools that need to be scrubbed completely
        :type vpool_guids: list
        :param vdisk_guids: Guids of the vDisks that need to be scrubbed
        :type vdisk_guids: list
        :param storagerouter_guid: Guid of the StorageRouter to execute the scrub work on
        :type storagerouter_guid: str
        :return: None
        :rtype: NoneType
        """
        if vpool_guids is not None and not isinstance(vpool_guids, list):
            raise ValueError('vpool_guids should be a list')
        if vdisk_guids is not None and not isinstance(vdisk_guids, list):
            raise ValueError('vdisk_guids should be a list')
        if storagerouter_guid is not None and not isinstance(storagerouter_guid, basestring):
            raise ValueError('storagerouter_guid should be a str')

        GenericController._logger.info('Scrubber - Started')
        scrub_locations = []
        storagerouters = StorageRouterList.get_storagerouters() if storagerouter_guid is None else [StorageRouter(storagerouter_guid)]
        for storage_router in storagerouters:
            scrub_partitions = storage_router.partition_config.get(
                DiskPartition.ROLES.SCRUB, [])
            if len(scrub_partitions) == 0:
                continue

            try:
                SSHClient(endpoint=storage_router, username='******')
                for partition_guid in scrub_partitions:
                    partition = DiskPartition(partition_guid)
                    GenericController._logger.info(
                        'Scrubber - Storage Router {0} has {1} partition at {2}'
                        .format(storage_router.ip, DiskPartition.ROLES.SCRUB,
                                partition.folder))
                    scrub_locations.append({
                        'scrub_path': str(partition.folder),
                        'partition_guid': partition.guid,
                        'storage_router': storage_router
                    })
            except UnableToConnectException:
                GenericController._logger.warning(
                    'Scrubber - Storage Router {0} is not reachable'.format(
                        storage_router.ip))

        if len(scrub_locations) == 0:
            raise ValueError('No scrub locations found, cannot scrub')

        vpool_vdisk_map = {}
        if vpool_guids is None and vdisk_guids is None:
            vpool_vdisk_map = dict((vpool, list(vpool.vdisks))
                                   for vpool in VPoolList.get_vpools())
        else:
            if vpool_guids is not None:
                for vpool_guid in set(vpool_guids):
                    vpool = VPool(vpool_guid)
                    vpool_vdisk_map[vpool] = list(vpool.vdisks)
            if vdisk_guids is not None:
                for vdisk_guid in set(vdisk_guids):
                    vdisk = VDisk(vdisk_guid)
                    if vdisk.vpool not in vpool_vdisk_map:
                        vpool_vdisk_map[vdisk.vpool] = []
                    if vdisk not in vpool_vdisk_map[vdisk.vpool]:
                        vpool_vdisk_map[vdisk.vpool].append(vdisk)

        number_of_vpools = len(vpool_vdisk_map)
        if number_of_vpools >= 6:
            max_stacks_per_vpool = 1
        elif number_of_vpools >= 3:
            max_stacks_per_vpool = 2
        else:
            max_stacks_per_vpool = 5
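        # Heuristic (assumed intent): the more vPools there are, the fewer parallel stacks each one gets,
        # which keeps the total number of scrub stacks across all vPools roughly bounded.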

        threads = []
        counter = 0
        error_messages = []
        for vp, vdisks in vpool_vdisk_map.iteritems():
            # Verify amount of vDisks on vPool
            GenericController._logger.info(
                'Scrubber - vPool {0} - Checking scrub work'.format(vp.name))
            if len(vdisks) == 0:
                GenericController._logger.info(
                    'Scrubber - vPool {0} - No scrub work'.format(vp.name))
                continue

            # Fill queue with all vDisks for current vPool
            vpool_queue = Queue()
            for vd in vdisks:
                if vd.is_vtemplate is True:
                    GenericController._logger.info(
                        'Scrubber - vPool {0} - vDisk {1} {2} - Is a template, not scrubbing'
                        .format(vp.name, vd.guid, vd.name))
                    continue
                vd.invalidate_dynamics('storagedriver_id')
                if not vd.storagedriver_id:
                    GenericController._logger.warning(
                        'Scrubber - vPool {0} - vDisk {1} {2} - No StorageDriver ID found'
                        .format(vp.name, vd.guid, vd.name))
                    continue
                vpool_queue.put(vd.guid)

            stacks_to_spawn = min(max_stacks_per_vpool, len(scrub_locations))
            GenericController._logger.info(
                'Scrubber - vPool {0} - Spawning {1} stack{2}'.format(
                    vp.name, stacks_to_spawn,
                    '' if stacks_to_spawn == 1 else 's'))
            for _ in xrange(stacks_to_spawn):
                scrub_target = scrub_locations[counter % len(scrub_locations)]
                stack = Thread(
                    target=GenericController._deploy_stack_and_scrub,
                    args=(vpool_queue, vp, scrub_target, error_messages))
                stack.start()
                threads.append(stack)
                counter += 1

        for thread in threads:
            thread.join()

        if len(error_messages) > 0:
            raise Exception('Errors occurred while scrubbing:\n  - {0}'.format(
                '\n  - '.join(error_messages)))

    @staticmethod
    def _execute_scrub(queue, vpool, scrub_info, scrub_dir, error_messages):
        def _verify_mds_config(current_vdisk):
            current_vdisk.invalidate_dynamics('info')
            vdisk_configs = current_vdisk.info['metadata_backend_config']
            if len(vdisk_configs) == 0:
                raise RuntimeError('Could not load MDS configuration')
            return vdisk_configs

        storagerouter = scrub_info['storage_router']
        partition_guid = scrub_info['partition_guid']
        volatile_client = VolatileFactory.get_client()
        backend_config_key = 'ovs/vpools/{0}/proxies/scrub/backend_config_{1}'.format(
            vpool.guid, partition_guid)
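        # Per vDisk taken from the queue: make sure the MDS master is local (trigger a handover if not),
        # take a 24h 'being scrubbed' marker in volatile memory, then fetch and apply the scrub work units
        # through a locked StorageDriver client.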
        try:
            # Empty the queue with vDisks to scrub
            with remote(storagerouter.ip, [VDisk]) as rem:
                while True:
                    vdisk = None
                    vdisk_guid = queue.get(False)  # Raises the Empty exception when the queue is empty, which breaks the 'while True' loop
                    volatile_key = 'ovs_scrubbing_vdisk_{0}'.format(vdisk_guid)
                    try:
                        # Check MDS master is local. Trigger MDS handover if necessary
                        vdisk = rem.VDisk(vdisk_guid)
                        GenericController._logger.info(
                            'Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - Started scrubbing at location {3}'
                            .format(vpool.name, storagerouter.name, vdisk.name,
                                    scrub_dir))
                        configs = _verify_mds_config(current_vdisk=vdisk)
                        storagedriver = StorageDriverList.get_by_storagedriver_id(
                            vdisk.storagedriver_id)
                        if configs[0].get(
                                'ip') != storagedriver.storagerouter.ip:
                            GenericController._logger.info(
                                'Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - MDS master is not local, trigger handover'
                                .format(vpool.name, storagerouter.name,
                                        vdisk.name))
                            MDSServiceController.ensure_safety(
                                VDisk(vdisk_guid)
                            )  # Do not use a remote VDisk instance here
                            configs = _verify_mds_config(current_vdisk=vdisk)
                            if configs[0].get(
                                    'ip') != storagedriver.storagerouter.ip:
                                GenericController._logger.warning(
                                    'Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - Skipping because master MDS still not local'
                                    .format(vpool.name, storagerouter.name,
                                            vdisk.name))
                                continue

                        # Check if vDisk is already being scrubbed
                        if volatile_client.add(key=volatile_key,
                                               value=volatile_key,
                                               time=24 * 60 * 60) is False:
                            GenericController._logger.warning(
                                'Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - Skipping because vDisk is already being scrubbed'
                                .format(vpool.name, storagerouter.name,
                                        vdisk.name))
                            continue

                        # Do the actual scrubbing
                        with vdisk.storagedriver_client.make_locked_client(
                                str(vdisk.volume_id)) as locked_client:
                            GenericController._logger.info(
                                'Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - Retrieve and apply scrub work'
                                .format(vpool.name, storagerouter.name,
                                        vdisk.name))
                            work_units = locked_client.get_scrubbing_workunits()
                            for work_unit in work_units:
                                res = locked_client.scrub(
                                    work_unit=work_unit,
                                    scratch_dir=scrub_dir,
                                    log_sinks=[
                                        LogHandler.get_sink_path(
                                            'scrubber_{0}'.format(vpool.name),
                                            allow_override=True,
                                            forced_target_type='file')
                                    ],
                                    backend_config=Configuration.get_configuration_path(backend_config_key))
                                locked_client.apply_scrubbing_result(
                                    scrubbing_work_result=res)
                            if work_units:
                                GenericController._logger.info(
                                    'Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - {3} work units successfully applied'
                                    .format(vpool.name, storagerouter.name,
                                            vdisk.name, len(work_units)))
                            else:
                                GenericController._logger.info(
                                    'Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - No scrubbing required'
                                    .format(vpool.name, storagerouter.name,
                                            vdisk.name))
                    except Exception:
                        if vdisk is None:
                            message = 'Scrubber - vPool {0} - StorageRouter {1} - vDisk with guid {2} could not be found'.format(
                                vpool.name, storagerouter.name, vdisk_guid)
                        else:
                            message = 'Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - Scrubbing failed'.format(
                                vpool.name, storagerouter.name, vdisk.name)
                        error_messages.append(message)
                        GenericController._logger.exception(message)
                    finally:
                        # Remove vDisk from volatile memory
                        volatile_client.delete(volatile_key)

        except Empty:  # Raised when all items have been fetched from the queue
            GenericController._logger.info(
                'Scrubber - vPool {0} - StorageRouter {1} - Queue completely processed'
                .format(vpool.name, storagerouter.name))
        except Exception:
            message = 'Scrubber - vPool {0} - StorageRouter {1} - Scrubbing failed'.format(
                vpool.name, storagerouter.name)
            error_messages.append(message)
            GenericController._logger.exception(message)

    @staticmethod
    def _deploy_stack_and_scrub(queue, vpool, scrub_info, error_messages):
        """
        Executes scrub work for a given vDisk queue and vPool, based on scrub_info
        :param queue: a Queue with vDisk guids that need to be scrubbed (they should all belong to a single vPool)
        :type queue: Queue
        :param vpool: the vPool object of the vDisks
        :type vpool: VPool
        :param scrub_info: A dict containing scrub information:
                           `scrub_path` with the path where to scrub
                           `partition_guid` with the guid of the DiskPartition backing that path
                           `storage_router` with the StorageRouter that needs to do the work
        :type scrub_info: dict
        :param error_messages: A list of error messages to be filled (by reference)
        :type error_messages: list
        :return: None
        :rtype: NoneType
        """
        if len(vpool.storagedrivers) == 0 or not vpool.storagedrivers[0].storagedriver_id:
            error_messages.append(
                'vPool {0} does not have any valid StorageDrivers configured'.
                format(vpool.name))
            return

        service_manager = ServiceFactory.get_manager()
        client = None
        lock_time = 5 * 60
        storagerouter = scrub_info['storage_router']
        partition_guid = scrub_info['partition_guid']
        alba_proxy_service = 'ovs-albaproxy_{0}_{1}_{2}_scrub'.format(
            vpool.name, storagerouter.name, partition_guid)
        scrub_directory = '{0}/scrub_work_{1}_{2}'.format(
            scrub_info['scrub_path'], vpool.name, partition_guid)
        scrub_config_key = 'ovs/vpools/{0}/proxies/scrub/scrub_config_{1}'.format(
            vpool.guid, partition_guid)
        backend_config_key = 'ovs/vpools/{0}/proxies/scrub/backend_config_{1}'.format(
            vpool.guid, partition_guid)

        # Deploy a proxy
        try:
            with file_mutex(name='ovs_albaproxy_scrub', wait=lock_time):
                GenericController._logger.info(
                    'Scrubber - vPool {0} - StorageRouter {1} - Deploying ALBA proxy {2}'
                    .format(vpool.name, storagerouter.name,
                            alba_proxy_service))
                client = SSHClient(storagerouter, 'root')
                client.dir_create(scrub_directory)
                client.dir_chmod(scrub_directory, 0777)  # Celery task executed by 'ovs' user and should be able to write in it
                if service_manager.has_service(
                        name=alba_proxy_service, client=client
                ) is True and service_manager.get_service_status(
                        name=alba_proxy_service, client=client) == 'active':
                    GenericController._logger.info(
                        'Scrubber - vPool {0} - StorageRouter {1} - Re-using existing proxy service {2}'
                        .format(vpool.name, storagerouter.name,
                                alba_proxy_service))
                    scrub_config = Configuration.get(scrub_config_key)
                else:
                    machine_id = System.get_my_machine_id(client)
                    port_range = Configuration.get(
                        '/ovs/framework/hosts/{0}/ports|storagedriver'.format(
                            machine_id))
                    with volatile_mutex('deploy_proxy_for_scrub_{0}'.format(
                            storagerouter.guid),
                                        wait=30):
                        port = System.get_free_ports(selected_range=port_range,
                                                     nr=1,
                                                     client=client)[0]
                    scrub_config = Configuration.get(
                        'ovs/vpools/{0}/proxies/scrub/generic_scrub'.format(
                            vpool.guid))
                    scrub_config['port'] = port
                    scrub_config['transport'] = 'tcp'
                    Configuration.set(scrub_config_key,
                                      json.dumps(scrub_config, indent=4),
                                      raw=True)

                    params = {
                        'VPOOL_NAME':
                        vpool.name,
                        'LOG_SINK':
                        LogHandler.get_sink_path(alba_proxy_service),
                        'CONFIG_PATH':
                        Configuration.get_configuration_path(scrub_config_key)
                    }
                    service_manager.add_service(name='ovs-albaproxy',
                                                params=params,
                                                client=client,
                                                target_name=alba_proxy_service)
                    service_manager.start_service(name=alba_proxy_service,
                                                  client=client)
                    GenericController._logger.info(
                        'Scrubber - vPool {0} - StorageRouter {1} - Deployed ALBA proxy {2}'
                        .format(vpool.name, storagerouter.name,
                                alba_proxy_service))

                backend_config = Configuration.get(
                    'ovs/vpools/{0}/hosts/{1}/config'.format(
                        vpool.guid, vpool.storagedrivers[0].storagedriver_id
                    ))['backend_connection_manager']
                if backend_config.get('backend_type') != 'MULTI':
                    backend_config['alba_connection_host'] = '127.0.0.1'
                    backend_config['alba_connection_port'] = scrub_config[
                        'port']
                else:
                    for value in backend_config.itervalues():
                        if isinstance(value, dict):
                            value['alba_connection_host'] = '127.0.0.1'
                            value['alba_connection_port'] = scrub_config[
                                'port']
                # Copy backend connection manager information in separate key
                Configuration.set(
                    backend_config_key,
                    json.dumps({"backend_connection_manager": backend_config},
                               indent=4),
                    raw=True)
        except Exception:
            message = 'Scrubber - vPool {0} - StorageRouter {1} - An error occurred deploying ALBA proxy {2}'.format(
                vpool.name, storagerouter.name, alba_proxy_service)
            error_messages.append(message)
            GenericController._logger.exception(message)
            if client is not None and service_manager.has_service(
                    name=alba_proxy_service, client=client) is True:
                if service_manager.get_service_status(
                        name=alba_proxy_service, client=client) == 'active':
                    service_manager.stop_service(name=alba_proxy_service,
                                                 client=client)
                service_manager.remove_service(name=alba_proxy_service,
                                               client=client)
            if Configuration.exists(scrub_config_key):
                Configuration.delete(scrub_config_key)

        # Execute the actual scrubbing
        threads = []
        threads_key = '/ovs/framework/hosts/{0}/config|scrub_stack_threads'.format(
            storagerouter.machine_id)
        amount_threads = Configuration.get(
            key=threads_key) if Configuration.exists(key=threads_key) else 2
        if not isinstance(amount_threads, int):
            error_messages.append(
                'Amount of threads to spawn must be an integer for StorageRouter with ID {0}'
                .format(storagerouter.machine_id))
            return

        amount_threads = max(amount_threads,
                             1)  # Make sure amount_threads is at least 1
        amount_threads = min(min(queue.qsize(), amount_threads),
                             20)  # Make sure amount_threads is at most 20
        GenericController._logger.info(
            'Scrubber - vPool {0} - StorageRouter {1} - Spawning {2} threads for proxy service {3}'
            .format(vpool.name, storagerouter.name, amount_threads,
                    alba_proxy_service))
        for index in range(amount_threads):
            thread = Thread(name='execute_scrub_{0}_{1}_{2}'.format(
                vpool.guid, partition_guid, index),
                            target=GenericController._execute_scrub,
                            args=(queue, vpool, scrub_info, scrub_directory,
                                  error_messages))
            thread.start()
            threads.append(thread)
        for thread in threads:
            thread.join()

        # Delete the proxy again
        try:
            with file_mutex(name='ovs_albaproxy_scrub', wait=lock_time):
                GenericController._logger.info(
                    'Scrubber - vPool {0} - StorageRouter {1} - Removing service {2}'
                    .format(vpool.name, storagerouter.name,
                            alba_proxy_service))
                client = SSHClient(storagerouter, 'root')
                client.dir_delete(scrub_directory)
                if service_manager.has_service(alba_proxy_service,
                                               client=client):
                    service_manager.stop_service(alba_proxy_service,
                                                 client=client)
                    service_manager.remove_service(alba_proxy_service,
                                                   client=client)
                if Configuration.exists(scrub_config_key):
                    Configuration.delete(scrub_config_key)
                GenericController._logger.info(
                    'Scrubber - vPool {0} - StorageRouter {1} - Removed service {2}'
                    .format(vpool.name, storagerouter.name,
                            alba_proxy_service))
        except Exception:
            message = 'Scrubber - vPool {0} - StorageRouter {1} - Removing service {2} failed'.format(
                vpool.name, storagerouter.name, alba_proxy_service)
            error_messages.append(message)
            GenericController._logger.exception(message)

    @staticmethod
    @ovs_task(name='ovs.generic.collapse_arakoon',
              schedule=Schedule(minute='10',
                                hour='0,2,4,6,8,10,12,14,16,18,20,22'),
              ensure_single_info={'mode': 'DEFAULT'})
    def collapse_arakoon():
        """
        Collapse Arakoon's Tlogs
        :return: None
        """
        from ovs_extensions.generic.toolbox import ExtensionsToolbox

        GenericController._logger.info('Arakoon collapse started')
        cluster_info = []
        storagerouters = StorageRouterList.get_storagerouters()
        if os.environ.get('RUNNING_UNITTESTS') != 'True':
            cluster_info = [('cacc', storagerouters[0])]

        cluster_names = []
        for service in ServiceList.get_services():
            if service.is_internal is True and service.type.name in (
                    ServiceType.SERVICE_TYPES.ARAKOON,
                    ServiceType.SERVICE_TYPES.NS_MGR,
                    ServiceType.SERVICE_TYPES.ALBA_MGR):
                cluster = ExtensionsToolbox.remove_prefix(
                    service.name, 'arakoon-')
                if cluster in cluster_names and cluster not in ['cacc', 'unittest-cacc']:
                    continue
                cluster_names.append(cluster)
                cluster_info.append((cluster, service.storagerouter))
        workload = {}
        cluster_config_map = {}
        for cluster, storagerouter in cluster_info:
            GenericController._logger.debug(
                '  Collecting info for cluster {0}'.format(cluster))
            ip = storagerouter.ip if cluster in ['cacc', 'unittest-cacc'] else None
            try:
                config = ArakoonClusterConfig(cluster_id=cluster, source_ip=ip)
                cluster_config_map[cluster] = config
            except:
                GenericController._logger.exception(
                    '  Retrieving cluster information on {0} for {1} failed'.
                    format(storagerouter.ip, cluster))
                continue
            for node in config.nodes:
                if node.ip not in workload:
                    workload[node.ip] = {'node_id': node.name, 'clusters': []}
                workload[node.ip]['clusters'].append((cluster, ip))
        for storagerouter in storagerouters:
            try:
                if storagerouter.ip not in workload:
                    continue
                node_workload = workload[storagerouter.ip]
                client = SSHClient(storagerouter)
                for cluster, ip in node_workload['clusters']:
                    try:
                        GenericController._logger.debug(
                            '  Collapsing cluster {0} on {1}'.format(
                                cluster, storagerouter.ip))
                        client.run([
                            'arakoon', '--collapse-local',
                            node_workload['node_id'], '2', '-config',
                            cluster_config_map[cluster].external_config_path
                        ])
                        GenericController._logger.debug(
                            '  Collapsing cluster {0} on {1} completed'.format(
                                cluster, storagerouter.ip))
                    except:
                        GenericController._logger.exception(
                            '  Collapsing cluster {0} on {1} failed'.format(
                                cluster, storagerouter.ip))
            except UnableToConnectException:
                GenericController._logger.error(
                    '  Could not collapse any cluster on {0} (not reachable)'.
                    format(storagerouter.name))
        GenericController._logger.info('Arakoon collapse finished')

    @staticmethod
    @ovs_task(name='ovs.generic.refresh_package_information',
              schedule=Schedule(minute='10', hour='*'),
              ensure_single_info={'mode': 'DEDUPED'})
    def refresh_package_information():
        """
        Retrieve and store the package information of all StorageRouters
        :return: None
        """
        GenericController._logger.info('Updating package information')
        threads = []
        information = {}
        all_storagerouters = StorageRouterList.get_storagerouters()
        for storagerouter in all_storagerouters:
            information[storagerouter.ip] = {}
            for fct in Toolbox.fetch_hooks('update', 'get_package_info_multi'):
                try:
                    # We make use of these clients in Threads --> cached = False
                    client = SSHClient(endpoint=storagerouter,
                                       username='******',
                                       cached=False)
                except UnableToConnectException:
                    information[storagerouter.ip]['errors'] = [
                        'StorageRouter {0} is inaccessible'.format(
                            storagerouter.name)
                    ]
                    break
                thread = Thread(target=fct, args=(client, information))
                thread.start()
                threads.append(thread)

        for fct in Toolbox.fetch_hooks('update', 'get_package_info_single'):
            thread = Thread(target=fct, args=(information, ))
            thread.start()
            threads.append(thread)

        for thread in threads:
            thread.join()

        errors = []
        copy_information = copy.deepcopy(information)
        for ip, info in information.iteritems():
            if len(info.get('errors', [])) > 0:
                errors.extend(
                    ['{0}: {1}'.format(ip, error) for error in info['errors']])
                copy_information.pop(ip)

        for storagerouter in all_storagerouters:
            info = copy_information.get(storagerouter.ip, {})
            if 'errors' in info:
                info.pop('errors')
            storagerouter.package_information = info
            storagerouter.save()

        if len(errors) > 0:
            errors = [str(error) for error in set(errors)]
            raise Exception(' - {0}'.format('\n - '.join(errors)))

    @staticmethod
    @ovs_task(name='ovs.generic.run_backend_domain_hooks')
    def run_backend_domain_hooks(backend_guid):
        """
        Run hooks when the Backend Domains have been updated
        :param backend_guid: Guid of the Backend to update
        :type backend_guid: str
        :return: None
        """
        for fct in Toolbox.fetch_hooks('backend', 'domains-update'):
            fct(backend_guid=backend_guid)
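
A hook picked up by run_backend_domain_hooks only needs to accept the backend_guid keyword argument it is called with; the registration mechanism itself is not shown in this example. A minimal sketch of such a hook (the function name and body are illustrative, not part of the framework):

def _on_backend_domains_updated(backend_guid):
    # Invoked with the guid of the Backend whose Domains were updated; react here,
    # e.g. by re-evaluating anything that depends on the Backend's Domain layout.
    print('Domains updated for Backend {0}'.format(backend_guid))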
Example #11
class AlbaStatsMonkeyController(StatsMonkey):
    """
    Stats Monkey class which retrieves ALBA statistics for the cluster
    Methods:
        * run_all
        * get_stats_nsms
        * get_stats_osds
        * get_stats_vdisks
        * get_stats_proxies
        * get_stats_alba_backends
    """
    _logger = Logger(name='lib')
    _dynamic_dependencies = {
        'get_stats_osds': {
            AlbaBackend: ['osd_statistics']
        },  # The statistics being retrieved depend on the caching timeouts of these properties
        'get_stats_alba_backends': {
            AlbaBackend: ['local_summary']
        }
    }

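    # Translates a vDisk's DTL/failover status into a numeric value for the metrics backend;
    # statuses not present in this map are reported as 3 (see get_stats_vdisks).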
    _FAILOVER_MAP = {
        'ok_sync': 0.0,
        'catchup': 1.0,
        'degraded': 2.0,
        'disabled': 0.0,
        'ok_standalone': 0.0,
        'checkup_required': 1.0
    }

    @classmethod
    def _get_configuration(cls):
        return Configuration

    def __init__(self):
        """
        Init method. This class is a completely static class, so cannot be instantiated
        """
        raise RuntimeError('AlbaStatsMonkeyController is a static class')

    @staticmethod
    @ovs_task(name='alba.stats_monkey.run_all',
              schedule=Schedule(minute='*', hour='*'),
              ensure_single_info={"mode": "DEFAULT"})
    def run_all():
        """
        Run all the get stats methods from AlbaStatsMonkeyController
        Prerequisites when adding content:
            * New methods which need to be picked up by this method need to start with 'get_stats_'
            * New methods need to collect the information and return a bool (whether errors occurred) and a list of stats. The 'run_all_get_stat_methods' method will then send the stats to the configured instance (InfluxDB / Redis)
            * The frequency at which each method is executed can be configured via the configuration management by setting the function name as key and the interval in seconds as value
            *    E.g. {'get_stats_nsms': 20}  --> the NSM statistics will be collected every 20 seconds
        """
        AlbaStatsMonkeyController.run_all_get_stat_methods()

    @classmethod
    def get_stats_nsms(cls):
        """
        Retrieve the amount of NSMs deployed and their statistics
        """
        if cls._config is None:
            cls.validate_and_retrieve_config()

        stats = []
        errors = False
        environment = cls._config['environment']
        for alba_backend in AlbaBackendList.get_albabackends():
            for nsm in alba_backend.nsm_clusters:
                stats.append({
                    'tags': {
                        'nsm_number': nsm.number,
                        'environment': environment,
                        'backend_name': alba_backend.name,
                        'abm_service_name': alba_backend.abm_cluster.name
                    },
                    'fields': {
                        'load': float(AlbaArakoonController.get_load(nsm))
                    },
                    'measurement': 'nsm'
                })

            config_path = Configuration.get_configuration_path(
                alba_backend.abm_cluster.config_location)
            try:
                nsm_host_ids = [
                    nsm_host['id']
                    for nsm_host in AlbaCLI.run(command='list-nsm-hosts',
                                                config=config_path)
                ]
                nsm_hosts_statistics = AlbaCLI.run(
                    command='nsm-hosts-statistics',
                    config=config_path,
                    named_params={'nsm-hosts': ','.join(nsm_host_ids)})
                for nsm_host_id, statistics in nsm_hosts_statistics.iteritems():
                    stats.append({
                        'tags': {
                            'nsm_name': nsm_host_id,
                            'environment': environment,
                            'backend_name': alba_backend.name
                        },
                        'fields':
                        cls._convert_to_float_values(statistics['statistics']),
                        'measurement':
                        'nsm_statistic'
                    })
            except Exception:
                errors = True
                cls._logger.exception(
                    'Retrieving NSM statistics for ALBA Backend {0} failed'.
                    format(alba_backend.name))
        return errors, stats

    @classmethod
    def get_stats_proxies(cls):
        """
        Retrieve statistics for all ALBA proxies
        """
        if cls._config is None:
            cls.validate_and_retrieve_config()

        stats = []
        errors = False
        environment = cls._config['environment']
        vpool_namespace_cache = {}
        for storagedriver in StorageDriverList.get_storagedrivers():
            for alba_proxy_service in storagedriver.alba_proxies:
                ip = storagedriver.storage_ip
                port = alba_proxy_service.service.ports[0]
                try:
                    vpool = storagedriver.vpool
                    if vpool.guid not in vpool_namespace_cache:
                        vpool_namespace_cache[vpool.guid] = vpool.storagedriver_client.list_volumes(req_timeout_secs=5)
                    active_namespaces = vpool_namespace_cache[vpool.guid]
                    for namespace_stats in AlbaCLI.run(
                            command='proxy-statistics',
                            named_params={
                                'host': ip,
                                'port': port
                            })['ns_stats']:
                        namespace = namespace_stats[0]
                        if namespace not in active_namespaces:
                            continue

                        stats.append({
                            'tags': {
                                'server':
                                storagedriver.storagerouter.name,
                                'namespace':
                                namespace,
                                'vpool_name':
                                vpool.name,
                                'environment':
                                environment,
                                'backend_name':
                                vpool.metadata['backend']['backend_info']
                                ['name'],
                                'service_name':
                                alba_proxy_service.service.name
                            },
                            'fields':
                            cls._convert_to_float_values(namespace_stats[1]),
                            'measurement':
                            'proxyperformance_namespace'
                        })
                except Exception:
                    errors = True
                    cls._logger.exception(
                        "Failed to retrieve proxy statistics for proxy service running at {0}:{1}"
                        .format(ip, port))
        return errors, stats

    @classmethod
    def get_stats_vdisks(cls):
        """
        Retrieve statistics about all vDisks on the system.
        Check the safety, storage amount on the Backend, fail-over status and others
        """
        if cls._config is None:
            cls.validate_and_retrieve_config()

        stats = []
        errors = False
        environment = cls._config['environment']
        alba_backend_info = {}
        for alba_backend in AlbaBackendList.get_albabackends():
            config_path = Configuration.get_configuration_path(
                alba_backend.abm_cluster.config_location)
            disk_safety = {}
            namespace_usage = {}

            # Retrieve namespace, preset and disk safety information
            try:
                preset_info = AlbaCLI.run(
                    command='list-presets', config=config_path
                )  # Not using alba_backend.presets, because it takes a whole lot longer to retrieve
                all_namespace_info = AlbaCLI.run(command='show-namespaces',
                                                 config=config_path,
                                                 extra_params=['--max=-1'])[1]
                all_disk_safety_info = AlbaCLI.run(command='get-disk-safety',
                                                   config=config_path)
            except Exception:
                errors = True
                cls._logger.exception(
                    'Retrieving information for ALBA Backend {0} failed'.
                    format(alba_backend.name))
                continue

            alba_backend_info[alba_backend.guid] = {
                'disk_safety': disk_safety,
                'namespace_usage': namespace_usage
            }

            # Parse namespace information
            for namespace_info in all_namespace_info:
                namespace_usage[namespace_info['name']] = float(
                    namespace_info['statistics']['storage'])

            # Parse preset information
            policies = []
            preset_name = None
            for preset in preset_info:
                if preset['in_use'] is not True:
                    continue
                preset_name = preset['name']
                policies.extend(preset['policies'])
            if preset_name is None:
                continue

            # Parse disk safety information
            total_objects = 0
            max_lost_disks = 0
            max_disk_safety = 0
            bucket_overview = {}
            disk_lost_overview = {}
            disk_safety_overview = {}
            for disk_safety_info in all_disk_safety_info:
                safety = disk_safety_info['safety']
                volume_id = disk_safety_info['namespace']
                disk_safety[volume_id] = float(
                    safety) if safety is not None else safety

                for bucket_safety in disk_safety_info['bucket_safety']:
                    bucket = bucket_safety['bucket']
                    objects = bucket_safety['count']
                    remaining_safety = bucket_safety['remaining_safety']

                    if bucket[1] > max_lost_disks:
                        max_lost_disks = bucket[1]
                    if remaining_safety > max_disk_safety:
                        max_disk_safety = remaining_safety

                    for policy in policies:
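                        # Both policy and bucket follow ALBA's (k, m, c, x) layout: data fragments,
                        # parity fragments, minimum fragments to write and (presumably) the maximum
                        # number of fragments per storage node.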
                        k = policy[0] == bucket[0]
                        m = policy[1] == bucket[1]
                        c = policy[2] <= bucket[2]
                        x = policy[3] >= bucket[3]
                        if k and m and c and x:
                            if preset_name not in bucket_overview:
                                bucket_overview[preset_name] = {
                                    'policy': str(policy),
                                    'presets': {}
                                }

                    bucket[2] -= bucket_safety['applicable_dead_osds']
                    if str(bucket) not in bucket_overview[preset_name]['presets']:
                        bucket_overview[preset_name]['presets'][str(bucket)] = {'objects': 0, 'disk_safety': 0}

                    disk_lost = bucket[0] + bucket[1] - bucket[2]  # Data fragments + parity fragments - amount of fragments to write + dead osds
                    if disk_lost not in disk_lost_overview:
                        disk_lost_overview[disk_lost] = 0
                    if remaining_safety not in disk_safety_overview:
                        disk_safety_overview[remaining_safety] = 0

                    total_objects += objects
                    disk_lost_overview[disk_lost] += objects
                    disk_safety_overview[remaining_safety] += objects
                    bucket_overview[preset_name]['presets'][str(bucket)]['objects'] += objects
                    bucket_overview[preset_name]['presets'][str(bucket)]['disk_safety'] = remaining_safety

            # Create statistics regarding disk safety
            for disk_lost_number in xrange(max_lost_disks + 1):
                stats.append({
                    'tags': {
                        'disk_lost': disk_lost_number,
                        'environment': environment,
                        'backend_name': alba_backend.name
                    },
                    'fields': {
                        'objects': disk_lost_overview.get(disk_lost_number, 0),
                        'total_objects': total_objects
                    },
                    'measurement': 'disk_lost'
                })

            for disk_safety_number in xrange(max_disk_safety + 1):
                stats.append({
                    'tags': {
                        'disk_safety': disk_safety_number,
                        'environment': environment,
                        'backend_name': alba_backend.name
                    },
                    'fields': {
                        'objects':
                        disk_safety_overview.get(disk_safety_number, 0),
                        'total_objects': total_objects
                    },
                    'measurement': 'disk_safety'
                })

            for preset_name, result in bucket_overview.iteritems():
                for bucket_count, bucket_result in result['presets'].iteritems():
                    stats.append({
                        'tags': {
                            'bucket': bucket_count,
                            'policy': result['policy'],
                            'preset_name': preset_name,
                            'environment': environment,
                            'disk_safety': bucket_result['disk_safety'],
                            'backend_name': alba_backend.name
                        },
                        'fields': {
                            'objects': bucket_result['objects'],
                            'total_objects': total_objects
                        },
                        'measurement': 'bucket'
                    })

        # Integrate namespace and disk safety information in vPool stats
        for vpool in VPoolList.get_vpools():
            alba_backend_guid = vpool.metadata['backend']['backend_info'][
                'alba_backend_guid']
            for vdisk in vpool.vdisks:
                try:
                    metrics = cls._convert_to_float_values(
                        cls._pop_realtime_info(vdisk.statistics))
                    metrics['failover_mode'] = vdisk.dtl_status
                    metrics['frontend_size'] = float(vdisk.size)
                    metrics['failover_mode_status'] = cls._FAILOVER_MAP.get(
                        vdisk.dtl_status, 3)
                    if alba_backend_guid in alba_backend_info:
                        metrics['disk_safety'] = alba_backend_info[
                            alba_backend_guid]['disk_safety'].get(
                                vdisk.volume_id)
                        metrics['backend_stored'] = alba_backend_info[
                            alba_backend_guid]['namespace_usage'].get(
                                vdisk.volume_id)

                    stats.append({
                        'tags': {
                            'disk_name':
                            vdisk.name,
                            'volume_id':
                            vdisk.volume_id,
                            'vpool_name':
                            vdisk.vpool.name,
                            'environment':
                            environment,
                            'storagerouter_name':
                            StorageRouter(vdisk.storagerouter_guid).name
                        },
                        'fields': metrics,
                        'measurement': 'vdisk'
                    })
                except Exception:
                    errors = True
                    cls._logger.exception(
                        'Retrieving statistics for vDisk {0} with guid {1} failed'
                        .format(vdisk.name, vdisk.guid))
        return errors, stats

    @classmethod
    def get_stats_alba_backends(cls):
        """
        Retrieve statistics about all ALBA Backends and their maintenance work
        """
        if cls._config is None:
            cls.validate_and_retrieve_config()

        stats = []
        errors = False
        environment = cls._config['environment']
        for alba_backend in AlbaBackendList.get_albabackends():
            try:
                local_summary = alba_backend.local_summary
                sizes = local_summary['sizes']
                devices = local_summary['devices']
                stats.append({
                    'tags': {
                        'environment': environment,
                        'backend_name': alba_backend.name
                    },
                    'fields': {
                        'red':
                        int(devices['red']),
                        'free':
                        float(sizes['size'] - sizes['used']),
                        'used':
                        float(sizes['used']),
                        'green':
                        int(devices['green']),
                        'orange':
                        int(devices['orange']),
                        'maintenance_work':
                        int(
                            AlbaCLI.run(
                                command='list-work',
                                config=Configuration.get_configuration_path(
                                    alba_backend.abm_cluster.config_location))
                            ['count'])
                    },
                    'measurement': 'backend'
                })
            except Exception:
                errors = True
                cls._logger.exception(
                    'Retrieving statistics for ALBA Backend {0} failed'.format(
                        alba_backend.name))
        return errors, stats

    @classmethod
    def get_stats_osds(cls):
        """
        Retrieve the OSD statistics for all ALBA Backends
        """
        def _get_stats_osds_for_alba_backend(alba_backend, statistics,
                                             errored_calls):
            try:
                for osd_id, result in alba_backend.osd_statistics.iteritems():
                    # Remove the 'version' key as it is a non-numeric value
                    result.pop('version', None)
                    statistics.append({
                        'tags': {
                            'guid': alba_backend.guid,
                            'long_id': osd_id,
                            'environment': environment,
                            'backend_name': alba_backend.name
                        },
                        'fields':
                        cls._convert_to_float_values(result),
                        'measurement':
                        'asd'
                    })
            except Exception:
                errored_calls.append(alba_backend.name)
                cls._logger.exception(
                    'Retrieving OSD statistics failed for ALBA Backend {0}'.
                    format(alba_backend.name))

        if cls._config is None:
            cls.validate_and_retrieve_config()

        stats = []
        errors = []
        threads = []
        environment = cls._config['environment']
        for ab in AlbaBackendList.get_albabackends():
            thread = Thread(name=ab.name,
                            target=_get_stats_osds_for_alba_backend,
                            args=(ab, stats, errors))
            thread.start()
            threads.append(thread)

        for thr in threads:
            thr.join(timeout=20)

        if len(errors) > 0:
            raise Exception(
                'Retrieving OSD statistics failed for ALBA Backends:\n * {0}'.
                format('\n * '.join(errors)))
        return False, stats
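
All collectors above follow the contract described in the run_all() docstring: a 'get_stats_' classmethod returning (errors, stats), where every stats entry carries 'tags', 'fields' and 'measurement' keys. A minimal sketch of that return shape as a plain standalone function (the 'example' measurement name is illustrative, not an existing collector):

def get_stats_example(environment):
    # One entry per data point: tags identify the source, fields hold the numeric values.
    stats = [{'tags': {'environment': environment},
              'fields': {'value': 1.0},
              'measurement': 'example'}]
    return False, stats  # (errors, stats): False means collection succeeded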
Example #12
class AlbaMigrationController(object):
    """
    This controller contains (part of the) migration code. It runs out-of-band with the updater to reduce the risk of failures during the update
    """
    _logger = Logger(name='update', forced_target_type='file')

    @staticmethod
    @ovs_task(name='alba.migration.migrate',
              schedule=Schedule(minute='15', hour='6'),
              ensure_single_info={'mode': 'DEFAULT'})
    def migrate():
        """
        Executes async migrations. It doesn't matter too much when they are executed, as long as they eventually get
        executed. This code will typically contain:
        * "dangerous" migration code (it needs certain running services)
        * Migration code depending on a cluster-wide state
        * ...
        """
        AlbaMigrationController._logger.info(
            'Preparing out of band migrations...')

        from ovs.dal.hybrids.diskpartition import DiskPartition
        from ovs.dal.lists.albabackendlist import AlbaBackendList
        from ovs.dal.lists.albanodelist import AlbaNodeList
        from ovs.dal.lists.albaosdlist import AlbaOSDList
        from ovs.dal.lists.storagerouterlist import StorageRouterList
        from ovs.extensions.generic.configuration import Configuration
        from ovs.extensions.generic.sshclient import SSHClient, UnableToConnectException
        from ovs.extensions.migration.migration.albamigrator import ExtensionMigrator
        from ovs.extensions.packages.albapackagefactory import PackageFactory
        from ovs.extensions.services.albaservicefactory import ServiceFactory
        from ovs.extensions.plugins.albacli import AlbaCLI, AlbaError
        from ovs.lib.alba import AlbaController
        from ovs.lib.disk import DiskController

        AlbaMigrationController._logger.info('Start out of band migrations...')

        #############################################
        # Introduction of IP:port combination on OSDs
        osd_info_map = {}
        alba_backends = AlbaBackendList.get_albabackends()
        for alba_backend in alba_backends:
            AlbaMigrationController._logger.info(
                'Verifying ALBA Backend {0}'.format(alba_backend.name))
            if alba_backend.abm_cluster is None:
                AlbaMigrationController._logger.warning(
                    'ALBA Backend {0} does not have an ABM cluster registered'.
                    format(alba_backend.name))
                continue

            AlbaMigrationController._logger.debug(
                'Retrieving configuration path for ALBA Backend {0}'.format(
                    alba_backend.name))
            try:
                config = Configuration.get_configuration_path(
                    alba_backend.abm_cluster.config_location)
            except:
                AlbaMigrationController._logger.exception(
                    'Failed to retrieve the configuration path for ALBA Backend {0}'
                    .format(alba_backend.name))
                continue

            AlbaMigrationController._logger.info(
                'Retrieving OSD information for ALBA Backend {0}'.format(
                    alba_backend.name))
            try:
                osd_info = AlbaCLI.run(command='list-all-osds', config=config)
            except (AlbaError, RuntimeError):
                AlbaMigrationController._logger.exception(
                    'Failed to retrieve OSD information for ALBA Backend {0}'.
                    format(alba_backend.name))
                continue

            for osd_entry in osd_info:
                if osd_entry.get('long_id'):
                    osd_info_map[osd_entry['long_id']] = {
                        'ips': osd_entry.get('ips', []),
                        'port': osd_entry.get('port')
                    }

        for osd in AlbaOSDList.get_albaosds():
            if osd.osd_id not in osd_info_map:
                AlbaMigrationController._logger.warning(
                    'OSD with ID {0} is modelled but could not be found through ALBA'
                    .format(osd.osd_id))
                continue

            ips = osd_info_map[osd.osd_id]['ips']
            port = osd_info_map[osd.osd_id]['port']
            changes = False
            if osd.ips is None:
                changes = True
                osd.ips = ips
            if osd.port is None:
                changes = True
                osd.port = port
            if changes is True:
                AlbaMigrationController._logger.info(
                    'Updating OSD with ID {0} with IPS {1} and port {2}'.
                    format(osd.osd_id, ips, port))
                osd.save()

        ###################################################
        # Read preference for GLOBAL ALBA Backends (1.10.3)  (https://github.com/openvstorage/framework-alba-plugin/issues/452)
        if Configuration.get(key='/ovs/framework/migration|read_preference',
                             default=False) is False:
            try:
                name_backend_map = dict((alba_backend.name, alba_backend)
                                        for alba_backend in alba_backends)
                for alba_node in AlbaNodeList.get_albanodes():
                    AlbaMigrationController._logger.info(
                        'Processing maintenance services running on ALBA Node {0} with ID {1}'
                        .format(alba_node.ip, alba_node.node_id))
                    alba_node.invalidate_dynamics('maintenance_services')
                    for alba_backend_name, services in alba_node.maintenance_services.iteritems():
                        if alba_backend_name not in name_backend_map:
                            AlbaMigrationController._logger.error(
                                'ALBA Node {0} has services for an ALBA Backend {1} which is not modelled'
                                .format(alba_node.ip, alba_backend_name))
                            continue

                        alba_backend = name_backend_map[alba_backend_name]
                        AlbaMigrationController._logger.info(
                            'Processing {0} ALBA Backend {1} with GUID {2}'.
                            format(alba_backend.scaling, alba_backend.name,
                                   alba_backend.guid))
                        if alba_backend.scaling == alba_backend.SCALINGS.LOCAL:
                            read_preferences = [alba_node.node_id]
                        else:
                            read_preferences = AlbaController.get_read_preferences_for_global_backend(
                                alba_backend=alba_backend,
                                alba_node_id=alba_node.node_id,
                                read_preferences=[])

                        for service_name, _ in services:
                            AlbaMigrationController._logger.info(
                                'Processing service {0}'.format(service_name))
                            old_config_key = '/ovs/alba/backends/{0}/maintenance/config'.format(
                                alba_backend.guid)
                            new_config_key = '/ovs/alba/backends/{0}/maintenance/{1}/config'.format(
                                alba_backend.guid, service_name)
                            if Configuration.exists(key=old_config_key):
                                new_config = Configuration.get(key=old_config_key)
                                new_config['read_preference'] = read_preferences
                                Configuration.set(key=new_config_key,
                                                  value=new_config)
                for alba_backend in alba_backends:
                    Configuration.delete(
                        key='/ovs/alba/backends/{0}/maintenance/config'.format(
                            alba_backend.guid))
                AlbaController.checkup_maintenance_agents.delay()

                Configuration.set(
                    key='/ovs/framework/migration|read_preference', value=True)
            except Exception:
                AlbaMigrationController._logger.exception(
                    'Updating read preferences for ALBA Backends failed')

        #######################################################
        # Storing actual package name in version files (1.11.0) (https://github.com/openvstorage/framework/issues/1876)
        changed_clients = set()
        storagerouters = StorageRouterList.get_storagerouters()
        if Configuration.get(key='/ovs/framework/migration|actual_package_name_in_version_file_alba',
                             default=False) is False:
            try:
                service_manager = ServiceFactory.get_manager()
                alba_pkg_name, alba_version_cmd = PackageFactory.get_package_and_version_cmd_for(
                    component=PackageFactory.COMP_ALBA)
                for storagerouter in storagerouters:
                    try:
                        root_client = SSHClient(
                            endpoint=storagerouter.ip, username='******'
                        )  # Use '.ip' instead of StorageRouter object because this code is executed during post-update at which point the heartbeat has not been updated for some time
                    except UnableToConnectException:
                        AlbaMigrationController._logger.exception(
                            'Updating actual package name for version files failed on StorageRouter {0}'
                            .format(storagerouter.ip))
                        continue

                    for file_name in root_client.file_list(
                            directory=ServiceFactory.RUN_FILE_DIR):
                        if not file_name.endswith('.version'):
                            continue
                        file_path = '{0}/{1}'.format(
                            ServiceFactory.RUN_FILE_DIR, file_name)
                        contents = root_client.file_read(filename=file_path)
                        if alba_pkg_name == PackageFactory.PKG_ALBA_EE and '{0}='.format(
                                PackageFactory.PKG_ALBA) in contents:
                            # Rewrite the version file in the RUN_FILE_DIR
                            contents = contents.replace(
                                PackageFactory.PKG_ALBA,
                                PackageFactory.PKG_ALBA_EE)
                            root_client.file_write(filename=file_path,
                                                   contents=contents)

                            # Regenerate the service and update the EXTRA_VERSION_CMD in the configuration management
                            service_name = file_name.split('.')[0]
                            service_config_key = ServiceFactory.SERVICE_CONFIG_KEY.format(
                                storagerouter.machine_id, service_name)
                            if Configuration.exists(key=service_config_key):
                                service_config = Configuration.get(
                                    key=service_config_key)
                                if 'EXTRA_VERSION_CMD' in service_config:
                                    service_config[
                                        'EXTRA_VERSION_CMD'] = '{0}=`{1}`'.format(
                                            alba_pkg_name, alba_version_cmd)
                                    Configuration.set(key=service_config_key,
                                                      value=service_config)
                                    service_manager.regenerate_service(
                                        name='ovs-arakoon',
                                        client=root_client,
                                        target_name='ovs-{0}'.format(
                                            service_name)
                                    )  # Leave out .version
                                    changed_clients.add(root_client)
                Configuration.set(key='/ovs/framework/migration|actual_package_name_in_version_file_alba',
                                  value=True)
            except Exception:
                AlbaMigrationController._logger.exception(
                    'Updating actual package name for version files failed')

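        # The services regenerated above changed their unit files on disk, so systemd on
        # every affected node needs to reload its configuration before the changes apply.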
        for root_client in changed_clients:
            try:
                root_client.run(['systemctl', 'daemon-reload'])
            except Exception:
                AlbaMigrationController._logger.exception(
                    'Executing command "systemctl daemon-reload" failed')

        ####################################
        # Fix for migration version (1.11.0)
        # Previous code could potentially store a higher version number in the config management than the actual version number
        if Configuration.get(
                key='/ovs/framework/migration|alba_migration_version_fix',
                default=False) is False:
            try:
                for storagerouter in storagerouters:
                    config_key = '/ovs/framework/hosts/{0}/versions'.format(
                        storagerouter.machine_id)
                    if Configuration.exists(key=config_key):
                        versions = Configuration.get(key=config_key)
                        if versions.get(PackageFactory.COMP_MIGRATION_ALBA, 0) > ExtensionMigrator.THIS_VERSION:
                            versions[PackageFactory.COMP_MIGRATION_ALBA] = ExtensionMigrator.THIS_VERSION
                            Configuration.set(key=config_key, value=versions)
                Configuration.set(
                    key='/ovs/framework/migration|alba_migration_version_fix',
                    value=True)
            except Exception:
                AlbaMigrationController._logger.exception(
                    'Updating migration version failed')

        ####################################
        # Enable auto-cleanup
        migration_auto_cleanup_key = '/ovs/framework/migration|alba_auto_cleanup'
        if Configuration.get(key=migration_auto_cleanup_key,
                             default=False) is False:
            try:
                for storagerouter in StorageRouterList.get_storagerouters():
                    storagerouter.invalidate_dynamics(
                        'features')  # New feature was added
                errors = []
                for alba_backend in AlbaBackendList.get_albabackends():
                    try:
                        AlbaController.set_auto_cleanup(alba_backend.guid)
                    except Exception as ex:
                        AlbaMigrationController._logger.exception(
                            'Failed to set the auto-cleanup for ALBA Backend {0}'
                            .format(alba_backend.name))
                        errors.append(ex)
                if len(errors) == 0:
                    Configuration.set(key=migration_auto_cleanup_key,
                                      value=True)
            except Exception:
                AlbaMigrationController._logger.exception(
                    'Updating auto cleanup failed')

        ####################################
        # Change cache eviction
        migration_random_eviction_key = '/ovs/framework/migration|alba_cache_eviction_random'
        if Configuration.get(key=migration_random_eviction_key,
                             default=False) is False:
            try:
                errors = []
                for alba_backend in AlbaBackendList.get_albabackends():
                    try:
                        AlbaController.set_cache_eviction(alba_backend.guid)
                    except Exception as ex:
                        AlbaMigrationController._logger.exception(
                            'Failed to set the cache eviction for ALBA Backend {0}'
                            .format(alba_backend.name))
                        errors.append(ex)
                if len(errors) == 0:
                    Configuration.set(key=migration_random_eviction_key,
                                      value=True)
            except Exception:
                AlbaMigrationController._logger.exception(
                    'Updating cache eviction failed')

        ###################################################
        # Sync all disks and apply the backend role. Backend role was removed with the AD (since 1.10)
        albanode_backend_role_sync_key = '/ovs/framework/migration|albanode_backend_role_sync'
        if not Configuration.get(key=albanode_backend_role_sync_key,
                                 default=False):
            try:
                errors = []
                for alba_node in AlbaNodeList.get_albanodes():
                    try:
                        if not alba_node.storagerouter:
                            continue
                        stack = alba_node.client.get_stack()  # type: dict
                        for slot_id, slot_information in stack.iteritems():
                            osds = slot_information.get('osds', {})  # type: dict
                            slot_aliases = slot_information.get('aliases', [])  # type: list
                            if not osds:  # No osds means no partition was made
                                continue
                            # Sync to add all potential partitions that will need a backend role
                            DiskController.sync_with_reality(
                                storagerouter_guid=alba_node.storagerouter_guid
                            )
                            for disk in alba_node.storagerouter.disks:
                                if set(disk.aliases).intersection(
                                        set(slot_aliases)):
                                    partition = disk.partitions[0]
                                    if DiskPartition.ROLES.BACKEND not in partition.roles:
                                        partition.roles.append(
                                            DiskPartition.ROLES.BACKEND)
                                        partition.save()
                    except Exception as ex:
                        AlbaMigrationController._logger.exception(
                            'Syncing for storagerouter/albanode {0} failed'.
                            format(alba_node.storagerouter.ip))
                        errors.append(ex)
                if not errors:
                    Configuration.set(key=albanode_backend_role_sync_key,
                                      value=True)
            except Exception:
                AlbaMigrationController._logger.exception(
                    'Syncing up the disks for backend roles failed')

        AlbaMigrationController._logger.info('Finished out of band migrations')

    @staticmethod
    @ovs_task(name='alba.migration.migrate_sdm',
              schedule=Schedule(minute='30', hour='6'),
              ensure_single_info={'mode': 'DEFAULT'})
    def migrate_sdm():
        """
        Executes async migrations for the ALBA SDM node. It doesn't matter too much when they are executed, as long as they eventually get executed.
        This code will typically contain:
        * "dangerous" migration code (it needs certain running services)
        * Migration code depending on a cluster-wide state
        * ...
        """
        from ovs.dal.lists.albanodelist import AlbaNodeList

        AlbaMigrationController._logger.info(
            'Preparing out of band migrations for SDM...')
        for alba_node in AlbaNodeList.get_albanodes():
            try:
                AlbaMigrationController._logger.info(
                    'Executing post-update migration code for ALBA Node {0}'.
                    format(alba_node.node_id))
                alba_node.client.update_execute_migration_code()
            except Exception:
                AlbaMigrationController._logger.exception(
                    'Executing post-update migration code for ALBA Node {0} failed'
                    .format(alba_node.node_id))
        AlbaMigrationController._logger.info(
            'Finished out of band migrations for SDM')


class AlbaArakoonController(object):

    _logger = Logger('lib')

    @staticmethod
    def abms_reachable(alba_backend):
        # type: (AlbaBackend) -> None
        """
        Check if all ABMs are reachable for a backend
        Only checks internally managed services
        :param alba_backend: AlbaBackend object
        :type alba_backend: AlbaBackend
        :return: None
        :rtype: NoneType
        :raises: RuntimeError: When an ABM could not be reached
        """
        if alba_backend.abm_cluster is not None:
            for abm_service in alba_backend.abm_cluster.abm_services:
                if abm_service.service.is_internal is True:
                    service = abm_service.service
                    try:
                        SSHClient(endpoint=service.storagerouter,
                                  username='******')
                    except UnableToConnectException:
                        raise RuntimeError(
                            'Node {0} with IP {1} is not reachable'.format(
                                service.storagerouter.name,
                                service.storagerouter.ip))

    @staticmethod
    def nsms_reachable(alba_backend):
        # type: (AlbaBackend) -> None
        """
        Check if all NSMs are reachable for a backend
        Only checks internally managed services
        :param alba_backend: AlbaBackend object
        :type alba_backend: AlbaBackend
        :return: None
        :rtype: NoneType
        :raises: RuntimeError: When an NSM could not be reached
        """
        if alba_backend.abm_cluster is not None:
            for nsm_cluster in alba_backend.nsm_clusters:
                for nsm_service in nsm_cluster.nsm_services:
                    service = nsm_service.service
                    if service.is_internal is True:
                        try:
                            SSHClient(endpoint=service.storagerouter,
                                      username='******')
                        except UnableToConnectException:
                            raise RuntimeError(
                                'Node {0} with IP {1} is not reachable'.format(
                                    service.storagerouter.name,
                                    service.storagerouter.ip))

    @classmethod
    def remove_alba_arakoon_clusters(cls,
                                     alba_backend_guid,
                                     validate_clusters_reachable=True):
        # type: (basestring, bool) -> None
        """
        Removes all backend related Arakoon clusters
        :param alba_backend_guid: Guid of the ALBA Backend
        :type alba_backend_guid: str
        :param validate_clusters_reachable: Validate if all clusters are reachable
        :type validate_clusters_reachable: bool
        :return: None
        :rtype: NoneType
        """
        alba_backend = AlbaBackend(alba_backend_guid)
        if validate_clusters_reachable:
            AlbaArakoonController.abms_reachable(alba_backend)
            AlbaArakoonController.nsms_reachable(alba_backend)

        if alba_backend.abm_cluster is not None:
            AlbaArakoonController._logger.debug(
                'Removing clusters for ALBA Backend {0}'.format(
                    alba_backend.name))
            arakoon_clusters = list(Configuration.list('/ovs/arakoon'))
            # Remove the ABM cluster
            ABMInstaller.remove_abm_cluster(alba_backend.abm_cluster,
                                            arakoon_clusters)
            # Remove NSM Arakoon clusters and services
            for nsm_cluster in alba_backend.nsm_clusters:
                NSMInstaller.remove_nsm_cluster(nsm_cluster, arakoon_clusters)

    @classmethod
    def get_available_arakoon_storagerouters(cls, ssh_clients=None):
        # type: (Optional[Dict[StorageRouter, SSHClient]]) -> Dict[StorageRouter, DiskPartition]
        """
        Retrieves all StorageRouters which are suitable to deploy Arakoon clusters on
        :param ssh_clients: Pre-established SSHClients to re-use, mapped by StorageRouter
        :type ssh_clients: Dict[StorageRouter, SSHClient]
        :return: All suitable StorageRouters, mapped to their DB partition
        :rtype: Dict[StorageRouter, DiskPartition]
        """
        ssh_clients = ssh_clients or {}
        available_storagerouters = {}
        masters = StorageRouterList.get_masters()
        for storagerouter in masters:
            try:
                partition = AlbaArakoonInstaller.get_db_partition(
                    storagerouter)
                try:
                    if ssh_clients:
                        client = ssh_clients.get(storagerouter)
                    else:
                        client = SSHClient(storagerouter)
                    if client:
                        available_storagerouters[storagerouter] = partition
                except UnableToConnectException:
                    cls._logger.warning(
                        'Storage Router with IP {0} is not reachable'.format(
                            storagerouter.ip))
            except ValueError:
                pass  # Ignore StorageRouters without a DB partition
        return available_storagerouters

    @classmethod
    def _alba_arakoon_checkup(cls,
                              alba_backend_guid=None,
                              abm_cluster=None,
                              nsm_clusters=None):
        # type: (Optional[str], Optional[str], Optional[List[str]]) -> None
        """
        Creates a new Arakoon cluster if required and extends cluster if possible on all available master nodes
        :param alba_backend_guid: Guid of the ALBA Backend
        :type alba_backend_guid: str
        :param nsm_clusters: NSM clusters for this ALBA Backend
        The code will claim the Arakoon clusters for this backend when provided
        :type nsm_clusters: list[str]
        :param abm_cluster: ABM cluster for this ALBA Backend
        The code will claim the Arakoon cluster for this backend when provided
        :type abm_cluster: str|None
        :return: None
        :rtype: NoneType
        """
        slaves = StorageRouterList.get_slaves()
        masters = StorageRouterList.get_masters()
        clients = {}
        for storagerouter in masters + slaves:
            try:
                clients[storagerouter] = SSHClient(storagerouter)
            except UnableToConnectException:
                cls._logger.warning(
                    'Storage Router with IP {0} is not reachable'.format(
                        storagerouter.ip))
        available_storagerouters = cls.get_available_arakoon_storagerouters(
            clients)

        # Instantiate the installers here: this can raise an error, which should happen before any changes are made
        abm_installer = ABMInstaller(ssh_clients=clients)
        nsm_installer = NSMInstaller(version_str=abm_installer.version_str,
                                     ssh_clients=clients)

        # Cluster creation
        if alba_backend_guid is not None:
            alba_backend = AlbaBackend(alba_backend_guid)
            # @todo revisit. This might enforce the ABM name for externals (might be unintended)
            abm_cluster_name = '{0}-abm'.format(alba_backend.name)

            # ABM Arakoon cluster creation
            if alba_backend.abm_cluster is None:
                # Fallback to installing the cluster on an available storagerouter
                storagerouter, partition = available_storagerouters.items()[0]
                abm_installer.deploy_abm_cluster(
                    alba_backend,
                    abm_cluster_name,
                    requested_abm_cluster_name=abm_cluster,
                    storagerouter=storagerouter)

            # NSM Arakoon cluster creation
            if len(alba_backend.nsm_clusters) == 0 and nsm_clusters is not None:
                storagerouter, partition = available_storagerouters.items()[0]
                nsm_installer.deploy_nsm_cluster(alba_backend,
                                                 storagerouter=storagerouter,
                                                 nsm_clusters=nsm_clusters)

        # ABM Cluster extension
        for alba_backend in AlbaBackendList.get_albabackends():
            if alba_backend.abm_cluster is None:
                AlbaArakoonController._logger.warning(
                    'ALBA Backend {0} does not have an ABM cluster registered'.
                    format(alba_backend.name))
                continue
            cls.ensure_abm_cluster_safety(alba_backend.abm_cluster,
                                          available_storagerouters,
                                          abm_installer=abm_installer)

    @staticmethod
    @ovs_task(name='alba.scheduled_alba_arakoon_checkup',
              schedule=Schedule(minute='30', hour='*'),
              ensure_single_info={
                  'mode': 'DEFAULT',
                  'extra_task_names': ['alba.manual_alba_arakoon_checkup']
              })
    def scheduled_alba_arakoon_checkup():
        # type: () -> None
        """
        Makes sure the ABM Arakoon is on all available master nodes
        :return: None
        """
        AlbaArakoonController._alba_arakoon_checkup()

    @staticmethod
    @ovs_task(name='alba.manual_alba_arakoon_checkup',
              ensure_single_info={
                  'mode': 'DEFAULT',
                  'extra_task_names': ['alba.scheduled_alba_arakoon_checkup']
              })
    def manual_alba_arakoon_checkup(alba_backend_guid,
                                    nsm_clusters,
                                    abm_cluster=None):
        # type: (str, List[str], Optional[str]) -> Union[bool, None]
        """
        Creates a new Arakoon cluster if required and extends cluster if possible on all available master nodes
        :param alba_backend_guid: Guid of the ALBA Backend
        :type alba_backend_guid: str
        :param nsm_clusters: NSM clusters for this ALBA Backend
        The code will claim the Arakoon clusters for this backend when provided
        :type nsm_clusters: list[str]
        :param abm_cluster: ABM cluster for this ALBA Backend
        The code will claim the Arakoon cluster for this backend when provided
        :type abm_cluster: str|None
        :return: True if task completed, None if task was discarded (by decorator)
        :rtype: bool|None
        """
        if (abm_cluster is not None
                and len(nsm_clusters) == 0) or (len(nsm_clusters) > 0
                                                and abm_cluster is None):
            raise ValueError(
                'Both ABM cluster and NSM clusters must be provided')
        if abm_cluster is not None:
            # Check if the requested clusters are available
            for cluster_name in [abm_cluster] + nsm_clusters:
                try:
                    metadata = ArakoonInstaller.get_arakoon_metadata_by_cluster_name(
                        cluster_name=cluster_name)
                    if metadata['in_use'] is True:
                        raise ValueError(
                            'Cluster {0} has already been claimed'.format(
                                cluster_name))
                except NotFoundException:
                    raise ValueError(
                        'Could not find an Arakoon cluster with name: {0}'.
                        format(cluster_name))
        AlbaArakoonController._alba_arakoon_checkup(
            alba_backend_guid=alba_backend_guid,
            abm_cluster=abm_cluster,
            nsm_clusters=nsm_clusters)
        return True

    @staticmethod
    def get_load(nsm_cluster):
        """
        Calculates the load of an NSM cluster, returning a float percentage
        :param nsm_cluster: NSM cluster to retrieve the load for
        :type nsm_cluster: ovs.dal.hybrids.albansmcluster.NSMCluster
        :return: Load of the NSM service
        :rtype: float
        """
        service_capacity = float(nsm_cluster.capacity)
        if service_capacity < 0:
            # A negative capacity is treated as a fixed 50% load
            return 50.0
        if service_capacity == 0:
            # A capacity of 0 always counts as overloaded
            return float('inf')

        config = Configuration.get_configuration_path(
            key=nsm_cluster.alba_backend.abm_cluster.config_location)
        hosts_data = AlbaCLI.run(command='list-nsm-hosts', config=config)
        try:
            host = [
                host for host in hosts_data if host['id'] == nsm_cluster.name
            ][0]
        except IndexError:
            raise ValueError(
                'No host data could be retrieved from Alba for NSM cluster {0}'
                .format(nsm_cluster.name))
        usage = host['namespaces_count']
        return round(usage / service_capacity * 100.0, 5)
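
    # A minimal worked example of the load formula above (illustrative numbers, not from a real setup):
    # with nsm_cluster.capacity = 50 and a 'namespaces_count' of 30 reported by the ALBA 'list-nsm-hosts'
    # call, the load is round(30 / 50.0 * 100.0, 5) = 60.0 (%). A negative capacity yields a fixed 50.0
    # and a capacity of 0 yields float('inf'), so such a cluster always counts as overloaded.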

    @classmethod
    def get_nsm_loads(cls, alba_backend):
        # type: (AlbaBackend) -> Dict[int, float]
        """
        Get the load of every NSM cluster of an ALBA Backend
        :param alba_backend: ALBA Backend to calculate the NSM loads for
        :return: An overview of how much load is on each NSM cluster, keyed by NSM number
        :rtype: Dict[int, float]
        """
        nsm_loads = collections.OrderedDict()
        sorted_nsm_clusters = sorted(alba_backend.nsm_clusters,
                                     key=lambda k: k.number)
        for nsm_cluster in sorted_nsm_clusters:
            nsm_loads[nsm_cluster.number] = cls.get_load(nsm_cluster)
        return nsm_loads

    @classmethod
    def get_nsms_per_storagerouter(cls, alba_backend):
        # type: (AlbaBackend) -> Dict[StorageRouter, int]
        """
        Get the amount of NSM services for every StorageRouter
        :param alba_backend: ALBA Backend to list the NSM services for
        :return: An overview of how many NSM services are on each StorageRouter
        :rtype: Dict[StorageRouter, int]
        """
        internal = AlbaArakoonInstaller.is_internally_managed(alba_backend)
        nsm_storagerouters = {}
        sorted_nsm_clusters = sorted(alba_backend.nsm_clusters,
                                     key=lambda k: k.number)
        for nsm_cluster in sorted_nsm_clusters:
            if internal:
                for nsm_service in nsm_cluster.nsm_services:
                    if nsm_service.service.storagerouter not in nsm_storagerouters:
                        nsm_storagerouters[
                            nsm_service.service.storagerouter] = 0
                    nsm_storagerouters[nsm_service.service.storagerouter] += 1

        # Include ABM hosts as potential candidates to extend to
        if internal:
            for abm_service in alba_backend.abm_cluster.abm_services:
                if abm_service.service.storagerouter not in nsm_storagerouters:
                    nsm_storagerouters[abm_service.service.storagerouter] = 0

        return nsm_storagerouters

    @classmethod
    def ensure_abm_cluster_safety(cls,
                                  abm_cluster,
                                  available_storagerouters,
                                  abm_installer=None):
        # type: (ABMCluster, Dict[StorageRouter, DiskPartition],  Optional[ABMInstaller]) -> None
        """
        Ensure that the ABM cluster is safe and sound
        :param abm_cluster: ABM Cluster object
        :type abm_cluster: ABMCluster
        :param available_storagerouters: All available storagerouters mapped with their DB partition
        :type available_storagerouters:  Dict[StorageRouter, DiskPartition]
        :param abm_installer: The ABMInstaller to use. Defaults to creating a new one
        :type abm_installer: ABMInstaller
        :return: None
        :rtype: NoneType
        """
        abm_installer = abm_installer or ABMInstaller()

        metadata = ArakoonInstaller.get_arakoon_metadata_by_cluster_name(
            cluster_name=abm_cluster.name)
        if 0 < len(abm_cluster.abm_services) < len(
                available_storagerouters) and metadata['internal'] is True:
            current_abm_ips = [
                abm_service.service.storagerouter.ip
                for abm_service in abm_cluster.abm_services
            ]
            for storagerouter, partition in available_storagerouters.iteritems(
            ):
                if storagerouter.ip in current_abm_ips:
                    continue
                abm_installer.extend_abm_cluster(storagerouter, abm_cluster)
                current_abm_ips.append(storagerouter.ip)

    @classmethod
    def ensure_nsm_cluster_safety(cls,
                                  nsm_cluster,
                                  nsms_per_storagerouter=None,
                                  nsm_installer=None):
        # type: (NSMCluster, Optional[Dict[StorageRouter, int]], Optional[NSMInstaller]) -> None
        """
        Ensure that the given NSM cluster is safe and sound by extending it up to the configured safety
        :param nsm_cluster: NSM Cluster to extend
        :type nsm_cluster: NSMCluster
        :param nsms_per_storagerouter: Amount of NSMs mapped by StorageRouter
        :type nsms_per_storagerouter: Dict[StorageRouter, int]
        :param nsm_installer: NSMInstaller instance to use. Defaults to creating a new one
        :type nsm_installer: NSMInstaller
        :return: None
        :rtype: NoneType
        """
        nsm_installer = nsm_installer or NSMInstaller()
        alba_backend = nsm_cluster.alba_backend
        nsms_per_storagerouter = nsms_per_storagerouter if nsms_per_storagerouter is not None else cls.get_nsms_per_storagerouter(
            alba_backend)

        safety = Configuration.get(
            '/ovs/framework/plugins/alba/config|nsm.safety')
        AlbaArakoonController._logger.debug(
            'NSM safety is configured at: {0}'.format(safety))

        # Check amount of nodes
        if len(nsm_cluster.nsm_services) < safety:
            cls._logger.info('ALBA Backend {0} - Extending if possible'.format(
                alba_backend.name))
            current_sr_hosts = [
                nsm_service.service.storagerouter
                for nsm_service in nsm_cluster.nsm_services
            ]
            available_sr_hosts = [
                storagerouter for storagerouter in nsms_per_storagerouter
                if storagerouter not in current_sr_hosts
            ]
            while len(available_sr_hosts) > 0 and len(
                    current_sr_hosts) < safety:
                candidate_sr = None
                candidate_load = None
                for storagerouter in available_sr_hosts:
                    # Determine the least NSM-loaded Storagerouter to extend to
                    storagerouter_nsm_load = nsms_per_storagerouter[
                        storagerouter]
                    if candidate_load is None or storagerouter_nsm_load < candidate_load:
                        candidate_sr = storagerouter
                        candidate_load = storagerouter_nsm_load
                if candidate_sr is None or candidate_load is None:
                    raise RuntimeError(
                        'Could not determine a candidate StorageRouter')
                current_sr_hosts.append(candidate_sr)
                available_sr_hosts.remove(candidate_sr)
                # Extend the cluster (configuration, services, ...)
                nsm_installer.extend_nsm_cluster(candidate_sr, nsm_cluster)
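
        # Illustrative run of the extension loop above (hypothetical data): with safety = 3,
        # nsms_per_storagerouter = {srA: 2, srB: 0, srC: 1} and the cluster currently only on srA,
        # srB (lowest load) is extended to first, then srC, after which the safety of 3 hosts is reached.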

    @classmethod
    def ensure_nsm_clusters_load(cls,
                                 alba_backend,
                                 nsms_per_storagerouter=None,
                                 min_internal_nsms=1,
                                 external_nsm_cluster_names=None,
                                 version_str=None,
                                 ssh_clients=None):
        # type: (AlbaBackend, Optional[Dict[StorageRouter, int]], Optional[int], Optional[List[str]], Optional[str], Optional[Dict[StorageRouter, SSHClient]]) -> None
        """
        Ensure that all NSM clusters are not overloaded
        :param alba_backend: Alba Backend to ensure NSM Cluster load for
        :type alba_backend: AlbaBackend
        :param nsms_per_storagerouter: Amount of NSMs mapped by StorageRouter
        :type nsms_per_storagerouter: Dict[StorageRouter, int]
        :param min_internal_nsms: Minimum amount of NSM hosts that need to be provided
        :type min_internal_nsms: int
        :param external_nsm_cluster_names: Information about the additional clusters to claim (only for externally managed Arakoon clusters)
        :type external_nsm_cluster_names: list
        :param version_str: Alba version string
        :type version_str: str
        :param ssh_clients: SSHClients to use
        :type ssh_clients: Dict[Storagerouter, SSHClient]
        :return: None
        :rtype: NoneType
        """
        if ssh_clients is None:
            ssh_clients = {}
        if external_nsm_cluster_names is None:
            external_nsm_cluster_names = []

        nsms_per_storagerouter = nsms_per_storagerouter if nsms_per_storagerouter is not None else cls.get_nsms_per_storagerouter(
            alba_backend)
        version_str = version_str or AlbaArakoonInstaller.get_alba_version_string(
        )
        nsm_loads = cls.get_nsm_loads(alba_backend)
        internal = AlbaArakoonInstaller.is_internally_managed(alba_backend)
        abm_cluster_name = alba_backend.abm_cluster.name

        safety = Configuration.get(
            '/ovs/framework/plugins/alba/config|nsm.safety')
        maxload = Configuration.get(
            '/ovs/framework/plugins/alba/config|nsm.maxload')

        overloaded = min(nsm_loads.values()) >= maxload
        if not overloaded:
            # At least 1 NSM is not overloaded yet
            AlbaArakoonController._logger.debug(
                'ALBA Backend {0} - NSM load OK'.format(alba_backend.name))
            if internal:
                # Load is OK; only deploy additional NSMs if the configured minimum is not reached yet
                nsms_to_add = max(0, min_internal_nsms - len(nsm_loads))
            else:
                nsms_to_add = len(external_nsm_cluster_names)
            if nsms_to_add == 0:
                return
        else:
            AlbaArakoonController._logger.warning(
                'ALBA Backend {0} - NSM load is NOT OK'.format(
                    alba_backend.name))
            if internal:
                # When load is not OK, deploy at least 1 additional NSM
                nsms_to_add = max(1, min_internal_nsms - len(nsm_loads))
            else:
                # For externally managed clusters we only claim the specified clusters, if none provided, we just log it
                nsms_to_add = len(external_nsm_cluster_names)
                if nsms_to_add == 0:
                    cls._logger.critical(
                        'ALBA Backend {0} - All NSM clusters are overloaded'.
                        format(alba_backend.name))
                    return

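        # Worked example of the decision above (hypothetical values): with maxload = 75 and
        # nsm_loads = {1: 80.0, 2: 60.0}, min(nsm_loads.values()) = 60.0 < 75, so the backend is not
        # overloaded; for an internal setup with min_internal_nsms = 1 and 2 existing clusters,
        # nsms_to_add = max(0, 1 - 2) = 0 and the method returns early. With nsm_loads = {1: 80.0, 2: 90.0}
        # the backend is overloaded and at least one extra NSM cluster is deployed (internal) or claimed
        # (external, when cluster names were passed in).
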
        # Deploy new (internal) or claim existing (external) NSM clusters
        cls._logger.debug(
            'ALBA Backend {0} - Currently {1} NSM cluster{2}'.format(
                alba_backend.name, len(nsm_loads),
                '' if len(nsm_loads) == 1 else 's'))
        AlbaArakoonController._logger.debug(
            'ALBA Backend {0} - Trying to add {1} NSM cluster{2}'.format(
                alba_backend.name, nsms_to_add,
                '' if nsms_to_add == 1 else 's'))
        base_number = max(nsm_loads.keys()) + 1
        for index, number in enumerate(
                xrange(base_number, base_number + nsms_to_add)):
            if not internal:
                # External clusters
                master_client = None
                if not ssh_clients:
                    for storagerouter in StorageRouterList.get_masters():
                        try:
                            master_client = SSHClient(storagerouter)
                        except UnableToConnectException:
                            cls._logger.warning(
                                'StorageRouter {0} with IP {1} is not reachable'
                                .format(storagerouter.name, storagerouter.ip))
                else:
                    for storagerouter, ssh_client in ssh_clients.iteritems():
                        if storagerouter.node_type == 'MASTER':
                            master_client = ssh_client
                if not master_client:
                    raise ValueError('Could not find an online master node')
                # @todo this might raise an indexerror?
                nsm_cluster_name = external_nsm_cluster_names[index]
                cls._logger.debug(
                    'ALBA Backend {0} - Claiming NSM cluster {1}'.format(
                        alba_backend.name, nsm_cluster_name))
                metadata = ArakoonInstaller.get_unused_arakoon_metadata_and_claim(
                    cluster_type=ServiceType.ARAKOON_CLUSTER_TYPES.NSM,
                    cluster_name=nsm_cluster_name)
                if metadata is None:
                    cls._logger.critical(
                        'ALBA Backend {0} - NSM cluster with name {1} could not be found'
                        .format(alba_backend.name, nsm_cluster_name))
                    continue

                cls._logger.debug(
                    'ALBA Backend {0} - Modeling services'.format(
                        alba_backend.name))
                AlbaArakoonInstaller.model_arakoon_service(
                    alba_backend=alba_backend,
                    cluster_name=nsm_cluster_name,
                    number=number)
                cls._logger.debug('ALBA Backend {0} - Registering NSM'.format(
                    alba_backend.name))
                NSMInstaller.register_nsm(abm_name=abm_cluster_name,
                                          nsm_name=nsm_cluster_name,
                                          ip=master_client.ip)
                AlbaArakoonController._logger.debug(
                    'ALBA Backend {0} - Extended cluster'.format(
                        alba_backend.name))
            else:
                # Internal clusters
                nsm_cluster_name = '{0}-nsm_{1}'.format(
                    alba_backend.name, number)
                cls._logger.debug(
                    'ALBA Backend {0} - Adding NSM cluster {1}'.format(
                        alba_backend.name, nsm_cluster_name))

                # One of the NSM nodes is overloaded. This means the complete NSM is considered overloaded
                # Figure out which StorageRouters are the least occupied
                loads = sorted(nsms_per_storagerouter.values())[:safety]
                storagerouters = []
                for storagerouter, load in nsms_per_storagerouter.iteritems():
                    if load in loads:
                        storagerouters.append(storagerouter)
                    if len(storagerouters) == safety:
                        break
                # Creating a new NSM cluster
                for sub_index, storagerouter in enumerate(storagerouters):
                    nsms_per_storagerouter[storagerouter] += 1
                    partition = AlbaArakoonInstaller.get_db_partition(
                        storagerouter)
                    arakoon_installer = ArakoonInstaller(
                        cluster_name=nsm_cluster_name)
                    # @todo Use deploy and extend code. (Disable register nsm in those parts)
                    if sub_index == 0:
                        arakoon_installer.create_cluster(
                            cluster_type=ServiceType.ARAKOON_CLUSTER_TYPES.NSM,
                            ip=storagerouter.ip,
                            base_dir=partition.folder,
                            plugins={NSM_PLUGIN: version_str})
                    else:
                        cls._logger.debug(
                            'ALBA Backend {0} - Extending NSM cluster {1}'.
                            format(alba_backend.name, nsm_cluster_name))
                        arakoon_installer.load()
                        arakoon_installer.extend_cluster(
                            new_ip=storagerouter.ip,
                            base_dir=partition.folder,
                            plugins={NSM_PLUGIN: version_str})
                    cls._logger.debug(
                        'ALBA Backend {0} - Linking plugins'.format(
                            alba_backend.name))
                    ssh_client = ssh_clients.get(storagerouter) or SSHClient(
                        storagerouter)
                    AlbaArakoonInstaller.link_plugins(
                        client=ssh_client,
                        data_dir=partition.folder,
                        plugins=[NSM_PLUGIN],
                        cluster_name=nsm_cluster_name)
                    cls._logger.debug(
                        'ALBA Backend {0} - Modeling services'.format(
                            alba_backend.name))
                    AlbaArakoonInstaller.model_arakoon_service(
                        alba_backend=alba_backend,
                        cluster_name=nsm_cluster_name,
                        ports=arakoon_installer.ports[storagerouter.ip],
                        storagerouter=storagerouter,
                        number=number)
                    if sub_index == 0:
                        cls._logger.debug(
                            'ALBA Backend {0} - Starting cluster'.format(
                                alba_backend.name))
                        arakoon_installer.start_cluster()
                    else:
                        AlbaArakoonController._logger.debug(
                            'ALBA Backend {0} - Restarting cluster'.format(
                                alba_backend.name))
                        arakoon_installer.restart_cluster_after_extending(
                            new_ip=storagerouter.ip)
                cls._logger.debug('ALBA Backend {0} - Registering NSM'.format(
                    alba_backend.name))
                NSMInstaller.register_nsm(abm_name=abm_cluster_name,
                                          nsm_name=nsm_cluster_name,
                                          ip=storagerouters[0].ip)
                cls._logger.debug(
                    'ALBA Backend {0} - Added NSM cluster {1}'.format(
                        alba_backend.name, nsm_cluster_name))

    @staticmethod
    @ovs_task(name='alba.nsm_checkup',
              schedule=Schedule(minute='45', hour='*'),
              ensure_single_info={'mode': 'CHAINED'})
    def nsm_checkup(alba_backend_guid=None,
                    min_internal_nsms=1,
                    external_nsm_cluster_names=None):
        # type: (Optional[str], Optional[int], Optional[List[str]]) -> None
        """
        Validates the current NSM setup/configuration and takes actions where required.
        Assumptions:
        * A 2 node NSM is considered safer than a 1 node NSM.
        * When adding an NSM, the nodes with the least amount of NSM participation are preferred

        :param alba_backend_guid: Run for a specific ALBA Backend
        :type alba_backend_guid: str
        :param min_internal_nsms: Minimum amount of NSM hosts that need to be provided
        :type min_internal_nsms: int
        :param external_nsm_cluster_names: Information about the additional clusters to claim (only for externally managed Arakoon clusters)
        :type external_nsm_cluster_names: list
        :return: None
        :rtype: NoneType
        """
        ###############
        # Validations #
        ###############
        if external_nsm_cluster_names is None:
            external_nsm_cluster_names = []
        AlbaArakoonController._logger.info('NSM checkup started')
        if min_internal_nsms < 1:
            raise ValueError(
                'Minimum amount of NSM clusters must be 1 or more')

        if not isinstance(external_nsm_cluster_names, list):
            raise ValueError(
                "'external_nsm_cluster_names' must be of type 'list'")

        if len(external_nsm_cluster_names) > 0:
            if alba_backend_guid is None:
                raise ValueError(
                    'Additional NSMs can only be configured for a specific ALBA Backend'
                )
            if min_internal_nsms > 1:
                raise ValueError(
                    "'min_internal_nsms' and 'external_nsm_cluster_names' are mutually exclusive"
                )

            external_nsm_cluster_names = list(set(
                external_nsm_cluster_names))  # Remove duplicate cluster names
            for cluster_name in external_nsm_cluster_names:
                try:
                    ArakoonInstaller.get_arakoon_metadata_by_cluster_name(
                        cluster_name=cluster_name)
                except NotFoundException:
                    raise ValueError(
                        'Arakoon cluster with name {0} does not exist'.format(
                            cluster_name))

        if alba_backend_guid is None:
            alba_backends = [
                alba_backend
                for alba_backend in AlbaBackendList.get_albabackends()
                if alba_backend.backend.status == 'RUNNING'
            ]
        else:
            alba_backends = [AlbaBackend(alba_backend_guid)]

        masters = StorageRouterList.get_masters()
        storagerouters = set()
        for alba_backend in alba_backends:
            if alba_backend.abm_cluster is None:
                raise ValueError(
                    'No ABM cluster found for ALBA Backend {0}'.format(
                        alba_backend.name))
            if len(alba_backend.abm_cluster.abm_services) == 0:
                raise ValueError(
                    'ALBA Backend {0} does not have any registered ABM services'
                    .format(alba_backend.name))
            if len(alba_backend.nsm_clusters) + len(
                    external_nsm_cluster_names) > MAX_NSM_AMOUNT:
                raise ValueError(
                    'The maximum of {0} NSM Arakoon clusters will be exceeded. Amount of clusters that can be deployed for this ALBA Backend: {1}'
                    .format(MAX_NSM_AMOUNT,
                            MAX_NSM_AMOUNT - len(alba_backend.nsm_clusters)))
            # Validate enough externally managed Arakoon clusters are available
            if alba_backend.abm_cluster.abm_services[
                    0].service.is_internal is False:
                unused_cluster_names = set([
                    cluster_info['cluster_name'] for cluster_info in
                    ArakoonInstaller.get_unused_arakoon_clusters(
                        cluster_type=ServiceType.ARAKOON_CLUSTER_TYPES.NSM)
                ])
                if set(external_nsm_cluster_names).difference(
                        unused_cluster_names):
                    raise ValueError(
                        'Some of the provided cluster_names have already been claimed before'
                    )
                storagerouters.update(
                    set(masters)
                )  # For externally managed we need an available master node
            else:
                for abm_service in alba_backend.abm_cluster.abm_services:  # For internally managed we need all StorageRouters online
                    storagerouters.add(abm_service.service.storagerouter)
                for nsm_cluster in alba_backend.nsm_clusters:  # For internally managed we need all StorageRouters online
                    for nsm_service in nsm_cluster.nsm_services:
                        storagerouters.add(nsm_service.service.storagerouter)

        ssh_clients = {}
        for storagerouter in storagerouters:
            try:
                ssh_clients[storagerouter] = SSHClient(endpoint=storagerouter)
            except UnableToConnectException:
                raise RuntimeError(
                    'StorageRouter {0} with IP {1} is not reachable'.format(
                        storagerouter.name, storagerouter.ip))

        version_str = AlbaArakoonInstaller.get_alba_version_string()
        nsm_installer = NSMInstaller(version_str=version_str,
                                     ssh_clients=ssh_clients)

        ##################
        # Check Clusters #
        ##################
        safety = Configuration.get(
            '/ovs/framework/plugins/alba/config|nsm.safety')
        maxload = Configuration.get(
            '/ovs/framework/plugins/alba/config|nsm.maxload')

        AlbaArakoonController._logger.debug(
            'NSM safety is configured at: {0}'.format(safety))
        AlbaArakoonController._logger.debug(
            'NSM max load is configured at: {0}'.format(maxload))

        master_client = None
        failed_backends = []
        for alba_backend in alba_backends:
            try:
                # Gather information
                AlbaArakoonController._logger.info(
                    'ALBA Backend {0} - Ensuring NSM safety'.format(
                        alba_backend.name))

                internal = AlbaArakoonInstaller.is_internally_managed(
                    alba_backend)
                nsm_loads = AlbaArakoonController.get_nsm_loads(alba_backend)
                nsm_storagerouters = AlbaArakoonController.get_nsms_per_storagerouter(
                    alba_backend)
                sorted_nsm_clusters = sorted(alba_backend.nsm_clusters,
                                             key=lambda k: k.number)

                if not internal and len(external_nsm_cluster_names) > 0:
                    for sr, cl in ssh_clients.iteritems():
                        if sr.node_type == 'MASTER':
                            master_client = cl
                            break
                    if master_client is None:
                        # Internal is False and we specified the NSM clusters to claim, but no MASTER nodes online
                        raise ValueError(
                            'Could not find an online master node')

                AlbaArakoonController._logger.debug(
                    'ALBA Backend {0} - Arakoon clusters are {1} managed'.
                    format(alba_backend.name,
                           'internally' if internal is True else 'externally'))
                for nsm_number, nsm_load in nsm_loads.iteritems():
                    AlbaArakoonController._logger.debug(
                        'ALBA Backend {0} - NSM Cluster {1} - Load {2}'.format(
                            alba_backend.name, nsm_number, nsm_load))
                for sr, count in nsm_storagerouters.iteritems():
                    AlbaArakoonController._logger.debug(
                        'ALBA Backend {0} - StorageRouter {1} - NSM Services {2}'
                        .format(alba_backend.name, sr.name, count))

                if internal:
                    # Extend existing NSM clusters if safety not met
                    for nsm_cluster in sorted_nsm_clusters:
                        AlbaArakoonController._logger.debug(
                            'ALBA Backend {0} - Processing NSM {1} - Expected safety {2} - Current safety {3}'
                            .format(alba_backend.name, nsm_cluster.number,
                                    safety, len(nsm_cluster.nsm_services)))
                        AlbaArakoonController.ensure_nsm_cluster_safety(
                            nsm_cluster,
                            nsm_storagerouters,
                            nsm_installer=nsm_installer)
                AlbaArakoonController.ensure_nsm_clusters_load(
                    alba_backend,
                    nsms_per_storagerouter=nsm_storagerouters,
                    ssh_clients=ssh_clients,
                    version_str=version_str,
                    min_internal_nsms=min_internal_nsms,
                    external_nsm_cluster_names=external_nsm_cluster_names)
            except Exception:
                AlbaArakoonController._logger.exception(
                    'NSM Checkup failed for Backend {0}'.format(
                        alba_backend.name))
                failed_backends.append(alba_backend.name)

        # Surface the accumulated failures once all backends have been processed
        if len(failed_backends) > 0:
            raise RuntimeError('NSM Checkup failed for ALBA Backends: {0}'.format(', '.join(failed_backends)))

    @classmethod
    def ensure_s3_transaction_safety(cls,
                                     s3_cluster,
                                     available_storagerouters,
                                     s3_installer=None):
        # type: (S3TransactionCluster, Dict[StorageRouter, DiskPartition], Optional[S3TransactionInstaller]) -> None
        """
        Ensure that the S3 transaction cluster is safe and sound
        :param s3_cluster: S3 Transaction Cluster object
        :type s3_cluster: S3TransactionCluster
        :param available_storagerouters: All available storagerouters mapped with their DB partition
        :type available_storagerouters: Dict[StorageRouter, DiskPartition]
        :param s3_installer: The S3TransactionInstaller to use. Defaults to creating a new one
        :type s3_installer: S3TransactionInstaller
        :return: None
        :rtype: NoneType
        """
        s3_transaction_installer = s3_installer or S3TransactionInstaller()

        metadata = ArakoonInstaller.get_arakoon_metadata_by_cluster_name(
            cluster_name=s3_cluster.name)
        if 0 < len(s3_cluster.s3_transaction_services) < len(
                available_storagerouters) and metadata['internal'] is True:
            current_service_ips = [
                j_service.service.storagerouter.ip
                for j_service in s3_cluster.s3_transaction_services
            ]
            for storagerouter, partition in available_storagerouters.iteritems(
            ):
                if storagerouter.ip in current_service_ips:
                    continue
                s3_transaction_installer.extend_s3_cluster(
                    storagerouter, s3_cluster)
                current_service_ips.append(storagerouter.ip)

    @classmethod
    def configure_s3_transaction_cluster(cls):
        """
        Completely deploys an S3 transaction cluster if none exists yet and ensures its safety
        :return: None
        :rtype: NoneType
        """
        s3_transaction_installer = S3TransactionInstaller()
        available_storagerouters = cls.get_available_arakoon_storagerouters()
        if not available_storagerouters:
            raise ValueError(
                'No available storagerouters. Cannot deploy the S3 Transaction Cluster'
            )
        if len(S3TransactionClusterList.get_s3_transaction_clusters()) == 0:
            # Deployment required
            storagerouter, partition = available_storagerouters.items()[0]
            s3_transaction_installer.deploy_s3_cluster(storagerouter)
        s3_transaction_cluster = S3TransactionClusterList.get_s3_transaction_clusters(
        )[0]
        cls.ensure_s3_transaction_safety(s3_transaction_cluster,
                                         available_storagerouters)
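
The sketch below is a minimal, hypothetical usage example of the controller above. It assumes a running OpenvStorage/ALBA environment; the import path, GUID and cluster names are placeholders, and calling the task-decorated entry points directly (instead of through the scheduler) is only done for illustration.

# Hypothetical usage sketch for AlbaArakoonController; import path, GUID and cluster names are placeholders
from ovs.lib.albaarakoon import AlbaArakoonController  # assumed module location

backend_guid = '00000000-0000-0000-0000-000000000000'  # placeholder ALBA Backend guid

# Equivalent of the hourly scheduled task: extend the ABM cluster to all reachable master nodes
AlbaArakoonController.scheduled_alba_arakoon_checkup()

# Claim externally managed Arakoon clusters for one backend (both ABM and NSM names must be provided,
# must exist as Arakoon clusters and must not have been claimed before)
AlbaArakoonController.manual_alba_arakoon_checkup(alba_backend_guid=backend_guid,
                                                  nsm_clusters=['mybackend-nsm_0'],
                                                  abm_cluster='mybackend-abm')

# Validate and, where needed, extend the NSM layout for that backend only
AlbaArakoonController.nsm_checkup(alba_backend_guid=backend_guid)
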
Example #14
class ScheduledTaskController(object):
    """
    This controller contains all scheduled task code. These tasks can be
    executed at certain intervals and should be self-containing
    """
    _logger = LogHandler.get('lib', name='scheduled tasks')

    @staticmethod
    @celery.task(name='ovs.scheduled.snapshot_all_vdisks',
                 schedule=Schedule(minute='0', hour='*'))
    @ensure_single(task_name='ovs.scheduled.snapshot_all_vdisks',
                   extra_task_names=['ovs.scheduled.delete_snapshots'])
    def snapshot_all_vdisks():
        """
        Snapshots all vDisks
        """
        ScheduledTaskController._logger.info('[SSA] started')
        success = []
        fail = []
        for vdisk in VDiskList.get_vdisks():
            try:
                metadata = {
                    'label': '',
                    'is_consistent': False,
                    'timestamp': str(int(time.time())),
                    'is_automatic': True,
                    'is_sticky': False
                }
                VDiskController.create_snapshot(vdisk_guid=vdisk.guid,
                                                metadata=metadata)
                success.append(vdisk.guid)
            except Exception:
                ScheduledTaskController._logger.exception(
                    'Error taking snapshot for vDisk {0}'.format(vdisk.guid))
                fail.append(vdisk.guid)
        ScheduledTaskController._logger.info(
            '[SSA] Snapshot has been taken for {0} vDisks, {1} failed.'.format(
                len(success), len(fail)))

    @staticmethod
    @celery.task(name='ovs.scheduled.delete_snapshots',
                 schedule=Schedule(minute='1', hour='2'))
    @ensure_single(task_name='ovs.scheduled.delete_snapshots')
    def delete_snapshots(timestamp=None):
        """
        Delete snapshots & scrubbing policy

        Implemented delete snapshot policy:
        age  | bucket size | # buckets | kept per bucket  | covers
        < 1d | 1d bucket   | 1         | best of bucket   | 1d
        < 1w | 1d bucket   | 6         | oldest of bucket | 7d = 1w
        < 1m | 1w bucket   | 3         | oldest of bucket | 4w = 1m
        > 1m | delete

        :param timestamp: Timestamp to determine whether snapshots should be kept or not, if none provided, current time will be used
        :type timestamp: float

        :return: None
        """
        ScheduledTaskController._logger.info('Delete snapshots started')

        day = timedelta(1)
        week = day * 7

        def make_timestamp(offset):
            """
            Create an integer based timestamp
            :param offset: Offset in days
            :return: Timestamp
            """
            return int(mktime((base - offset).timetuple()))

        # Calculate bucket structure
        if timestamp is None:
            timestamp = time.time()
        base = datetime.fromtimestamp(timestamp).date() - day
        buckets = []
        # Buckets first 7 days: [0-1[, [1-2[, [2-3[, [3-4[, [4-5[, [5-6[, [6-7[
        for i in xrange(0, 7):
            buckets.append({
                'start': make_timestamp(day * i),
                'end': make_timestamp(day * (i + 1)),
                'type': '1d',
                'snapshots': []
            })
        # Week buckets next 3 weeks: [7-14[, [14-21[, [21-28[
        for i in xrange(1, 4):
            buckets.append({
                'start': make_timestamp(week * i),
                'end': make_timestamp(week * (i + 1)),
                'type': '1w',
                'snapshots': []
            })
        buckets.append({
            'start': make_timestamp(week * 4),
            'end': 0,
            'type': 'rest',
            'snapshots': []
        })

        # Get a list of all snapshots that are used as parents for clones
        parent_snapshots = set(
            [vd.parentsnapshot for vd in VDiskList.get_with_parent_snaphots()])

        # Place all snapshots in bucket_chains
        bucket_chains = []
        for vdisk in VDiskList.get_vdisks():
            if vdisk.info['object_type'] in ['BASE']:
                bucket_chain = copy.deepcopy(buckets)
                for snapshot in vdisk.snapshots:
                    if snapshot.get('is_sticky') is True:
                        continue
                    if snapshot['guid'] in parent_snapshots:
                        ScheduledTaskController._logger.info(
                            'Not deleting snapshot {0} because it has clones'.
                            format(snapshot['guid']))
                        continue
                    timestamp = int(snapshot['timestamp'])
                    for bucket in bucket_chain:
                        if bucket['start'] >= timestamp > bucket['end']:
                            bucket['snapshots'].append({
                                'timestamp':
                                timestamp,
                                'snapshot_id':
                                snapshot['guid'],
                                'vdisk_guid':
                                vdisk.guid,
                                'is_consistent':
                                snapshot['is_consistent']
                            })
                bucket_chains.append(bucket_chain)

        # Clean out the snapshot bucket_chains, we delete the snapshots we want to keep
        # And we'll remove all snapshots that remain in the buckets
        for bucket_chain in bucket_chains:
            first = True
            for bucket in bucket_chain:
                if first is True:
                    best = None
                    for snapshot in bucket['snapshots']:
                        if best is None:
                            best = snapshot
                        # Consistent is better than inconsistent
                        elif snapshot[
                                'is_consistent'] and not best['is_consistent']:
                            best = snapshot
                        # Newer (larger timestamp) is better than older snapshots
                        elif snapshot['is_consistent'] == best['is_consistent'] and \
                                snapshot['timestamp'] > best['timestamp']:
                            best = snapshot
                    bucket['snapshots'] = [
                        s for s in bucket['snapshots']
                        if s['timestamp'] != best['timestamp']
                    ]
                    first = False
                elif bucket['end'] > 0:
                    oldest = None
                    for snapshot in bucket['snapshots']:
                        if oldest is None:
                            oldest = snapshot
                        # Older (smaller timestamp) is the one we want to keep
                        elif snapshot['timestamp'] < oldest['timestamp']:
                            oldest = snapshot
                    bucket['snapshots'] = [
                        s for s in bucket['snapshots']
                        if s['timestamp'] != oldest['timestamp']
                    ]

        # Delete obsolete snapshots
        for bucket_chain in bucket_chains:
            for bucket in bucket_chain:
                for snapshot in bucket['snapshots']:
                    VDiskController.delete_snapshot(
                        vdisk_guid=snapshot['vdisk_guid'],
                        snapshot_id=snapshot['snapshot_id'])
        ScheduledTaskController._logger.info('Delete snapshots finished')

    @staticmethod
    @celery.task(name='ovs.scheduled.execute_scrub',
                 schedule=Schedule(minute='0', hour='3'))
    @ensure_single(task_name='ovs.scheduled.execute_scrub')
    def execute_scrub():
        """
        Retrieve and execute scrub work
        :return: None
        """
        ScheduledTaskController._logger.info('Scrubber - Started')
        vpools = VPoolList.get_vpools()
        error_messages = []
        scrub_locations = []
        for storage_router in StorageRouterList.get_storagerouters():
            scrub_partitions = storage_router.partition_config.get(
                DiskPartition.ROLES.SCRUB, [])
            if len(scrub_partitions) == 0:
                continue
            if len(scrub_partitions) > 1:
                raise RuntimeError(
                    'Multiple {0} partitions defined for StorageRouter {1}'.
                    format(DiskPartition.ROLES.SCRUB, storage_router.name))

            partition = DiskPartition(scrub_partitions[0])
            ScheduledTaskController._logger.info(
                'Scrubber - Storage Router {0:<15} has {1} partition at {2}'.
                format(storage_router.ip, DiskPartition.ROLES.SCRUB,
                       partition.folder))
            try:
                SSHClient(storage_router, 'root')
                scrub_locations.append({
                    'scrub_path': str(partition.folder),
                    'storage_router': storage_router
                })
            except UnableToConnectException:
                ScheduledTaskController._logger.warning(
                    'Scrubber - Storage Router {0:<15} is not reachable'.
                    format(storage_router.ip))

        number_of_vpools = len(vpools)
        if number_of_vpools >= 6:
            max_threads_per_vpool = 1
        elif number_of_vpools >= 3:
            max_threads_per_vpool = 2
        else:
            max_threads_per_vpool = 5
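
        # Illustrative example (hypothetical numbers): with 4 vPools, max_threads_per_vpool = 2; combined
        # with e.g. 3 reachable scrub locations, each vPool later spawns min(2, 3) = 2 scrub threads, keeping
        # the total amount of scrub threads bounded as the amount of vPools grows.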

        threads = []
        counter = 0
        for vp in vpools:
            # Verify amount of vDisks on vPool
            ScheduledTaskController._logger.info(
                'Scrubber - vPool {0} - Checking scrub work'.format(vp.name))
            if len(vp.vdisks) == 0:
                ScheduledTaskController._logger.info(
                    'Scrubber - vPool {0} - No scrub work'.format(vp.name))
                continue

            # Fill queue with all vDisks for current vPool
            vpool_queue = Queue()
            for vd in vp.vdisks:
                if vd.is_vtemplate is True:
                    continue
                vd.invalidate_dynamics('storagedriver_id')
                if not vd.storagedriver_id:
                    ScheduledTaskController._logger.warning(
                        'Scrubber - vPool {0} - vDisk {1} {2} - No StorageDriver ID found'
                        .format(vp.name, vd.guid, vd.name))
                    continue
                vpool_queue.put(vd.guid)

            # Copy backend connection manager information in separate key
            threads_to_spawn = min(max_threads_per_vpool, len(scrub_locations))
            ScheduledTaskController._logger.info(
                'Scrubber - vPool {0} - Spawning {1} thread{2}'.format(
                    vp.name, threads_to_spawn,
                    '' if threads_to_spawn == 1 else 's'))
            for _ in range(threads_to_spawn):
                scrub_target = scrub_locations[counter % len(scrub_locations)]
                thread = Thread(
                    target=ScheduledTaskController.execute_scrub_work,
                    name='scrub_{0}_{1}'.format(
                        vp.guid, scrub_target['storage_router'].guid),
                    args=(vpool_queue, vp, scrub_target, error_messages))
                thread.start()
                threads.append(thread)
                counter += 1

        for thread in threads:
            thread.join()

        if len(error_messages) > 0:
            raise Exception('Errors occurred while scrubbing:\n  - {0}'.format(
                '\n  - '.join(error_messages)))

    @staticmethod
    def execute_scrub_work(queue, vpool, scrub_info, error_messages):
        """
        Executes scrub work for a given vDisk queue and vPool, based on scrub_info
        :param queue: a Queue with vDisk guids that need to be scrubbed (they should only be member of a single vPool)
        :type queue: Queue
        :param vpool: the vPool object of the vDisks
        :type vpool: VPool
        :param scrub_info: A dict containing scrub information: `scrub_path` with the path where to scrub and `storage_router` with the StorageRouter
                           that needs to do the work
        :type scrub_info: dict
        :param error_messages: A list of error messages to be filled
        :type error_messages: list
        :return: a list of error messages
        :rtype: list
        """
        def _verify_mds_config(current_vdisk):
            current_vdisk.invalidate_dynamics('info')
            vdisk_configs = current_vdisk.info['metadata_backend_config']
            if len(vdisk_configs) == 0:
                raise RuntimeError('Could not load MDS configuration')
            return vdisk_configs

        client = None
        lock_time = 5 * 60
        storagerouter = scrub_info['storage_router']
        scrub_directory = '{0}/scrub_work_{1}_{2}'.format(
            scrub_info['scrub_path'], vpool.name, storagerouter.name)
        scrub_config_key = 'ovs/vpools/{0}/proxies/scrub/scrub_config_{1}'.format(
            vpool.guid, storagerouter.guid)
        backend_config_key = 'ovs/vpools/{0}/proxies/scrub/backend_config_{1}'.format(
            vpool.guid, storagerouter.guid)
        alba_proxy_service = 'ovs-albaproxy_{0}_{1}_scrub'.format(
            vpool.name, storagerouter.name)

        # Deploy a proxy
        try:
            with file_mutex(name='ovs_albaproxy_scrub', wait=lock_time):
                ScheduledTaskController._logger.info(
                    'Scrubber - vPool {0} - StorageRouter {1} - Deploying ALBA proxy {2}'
                    .format(vpool.name, storagerouter.name,
                            alba_proxy_service))
                client = SSHClient(storagerouter, 'root')
                client.dir_create(scrub_directory)
                client.dir_chmod(
                    scrub_directory, 0777
                )  # Celery task executed by 'ovs' user and should be able to write in it
                if ServiceManager.has_service(
                        name=alba_proxy_service, client=client
                ) is True and ServiceManager.get_service_status(
                        name=alba_proxy_service, client=client) is True:
                    ScheduledTaskController._logger.info(
                        'Scrubber - vPool {0} - StorageRouter {1} - Re-using existing proxy service {2}'
                        .format(vpool.name, storagerouter.name,
                                alba_proxy_service))
                    scrub_config = Configuration.get(scrub_config_key)
                else:
                    machine_id = System.get_my_machine_id(client)
                    port_range = Configuration.get(
                        '/ovs/framework/hosts/{0}/ports|storagedriver'.format(
                            machine_id))
                    port = System.get_free_ports(selected_range=port_range,
                                                 nr=1,
                                                 client=client)[0]
                    # Scrub config
                    # {u'albamgr_cfg_url': u'arakoon://config/ovs/vpools/71e2f717-f270-4a41-bbb0-d4c8c084d43e/proxies/64759516-3471-4321-b912-fb424568fc5b/config/abm?ini=%2Fopt%2FOpenvStorage%2Fconfig%2Farakoon_cacc.ini',
                    #  u'fragment_cache': [u'none'],
                    #  u'ips': [u'127.0.0.1'],
                    #  u'log_level': u'info',
                    #  u'manifest_cache_size': 17179869184,
                    #  u'port': 0,
                    #  u'transport': u'tcp'}

                    # Backend config
                    # {u'alba_connection_host': u'10.100.193.155',
                    #  u'alba_connection_port': 26204,
                    #  u'alba_connection_preset': u'preset',
                    #  u'alba_connection_timeout': 15,
                    #  u'alba_connection_transport': u'TCP',
                    #  u'backend_interface_retries_on_error': 5,
                    #  u'backend_interface_retry_backoff_multiplier': 2.0,
                    #  u'backend_interface_retry_interval_secs': 1,
                    #  u'backend_type': u'ALBA'}
                    scrub_config = Configuration.get(
                        'ovs/vpools/{0}/proxies/scrub/generic_scrub'.format(
                            vpool.guid))
                    scrub_config['port'] = port
                    scrub_config['transport'] = 'tcp'
                    Configuration.set(scrub_config_key,
                                      json.dumps(scrub_config, indent=4),
                                      raw=True)

                    params = {
                        'VPOOL_NAME':
                        vpool.name,
                        'LOG_SINK':
                        LogHandler.get_sink_path('alba_proxy'),
                        'CONFIG_PATH':
                        Configuration.get_configuration_path(scrub_config_key)
                    }
                    ServiceManager.add_service(name='ovs-albaproxy',
                                               params=params,
                                               client=client,
                                               target_name=alba_proxy_service)
                    ServiceManager.start_service(name=alba_proxy_service,
                                                 client=client)
                    ScheduledTaskController._logger.info(
                        'Scrubber - vPool {0} - StorageRouter {1} - Deployed ALBA proxy {2}'
                        .format(vpool.name, storagerouter.name,
                                alba_proxy_service))

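                # Point the backend connection config used by the scrubber at the local proxy (127.0.0.1 on the port allocated above) instead of the vPool's regular proxies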
                backend_config = Configuration.get('ovs/vpools/{0}/hosts/{1}/config'.format(vpool.guid, vpool.storagedrivers[0].storagedriver_id))['backend_connection_manager']
                backend_config['alba_connection_host'] = '127.0.0.1'
                backend_config['alba_connection_port'] = scrub_config['port']
                Configuration.set(backend_config_key, json.dumps({"backend_connection_manager": backend_config}, indent=4), raw=True)
        except Exception:
            message = 'Scrubber - vPool {0} - StorageRouter {1} - An error occurred deploying ALBA proxy {2}'.format(vpool.name, storagerouter.name, alba_proxy_service)
            error_messages.append(message)
            ScheduledTaskController._logger.exception(message)
            if client is not None and ServiceManager.has_service(name=alba_proxy_service, client=client) is True:
                if ServiceManager.get_service_status(name=alba_proxy_service, client=client) is True:
                    ServiceManager.stop_service(name=alba_proxy_service, client=client)
                ServiceManager.remove_service(name=alba_proxy_service, client=client)
            if Configuration.exists(scrub_config_key):
                Configuration.delete(scrub_config_key)

        try:
            # Empty the queue with vDisks to scrub
            with remote(storagerouter.ip, [VDisk]) as rem:
                while True:
                    vdisk = None
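                    # Non-blocking get: raises Empty once every vDisk guid has been taken from the queue, which ends this loop via the handler below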
                    vdisk_guid = queue.get(False)
                    try:
                        # Check MDS master is local. Trigger MDS handover if necessary
                        vdisk = rem.VDisk(vdisk_guid)
                        ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - Started scrubbing at location {3}'.format(vpool.name, storagerouter.name, vdisk.name, scrub_directory))
                        configs = _verify_mds_config(current_vdisk=vdisk)
                        storagedriver = StorageDriverList.get_by_storagedriver_id(vdisk.storagedriver_id)
                        if configs[0].get('ip') != storagedriver.storagerouter.ip:
                            ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - MDS master is not local, trigger handover'.format(vpool.name, storagerouter.name, vdisk.name))
                            MDSServiceController.ensure_safety(VDisk(vdisk_guid))  # Do not use a remote VDisk instance here
                            configs = _verify_mds_config(current_vdisk=vdisk)
                            if configs[0].get('ip') != storagedriver.storagerouter.ip:
                                ScheduledTaskController._logger.warning('Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - Skipping because master MDS still not local'.format(vpool.name, storagerouter.name, vdisk.name))
                                continue

                        # Do the actual scrubbing
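                        # Fetch the outstanding scrub work units for this volume, execute each one against the scratch directory through the local ALBA proxy and apply the results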
                        with vdisk.storagedriver_client.make_locked_client(str(vdisk.volume_id)) as locked_client:
                            ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - Retrieve and apply scrub work'.format(vpool.name, storagerouter.name, vdisk.name))
                            work_units = locked_client.get_scrubbing_workunits()
                            for work_unit in work_units:
                                res = locked_client.scrub(work_unit=work_unit,
                                                          scratch_dir=scrub_directory,
                                                          log_sinks=[LogHandler.get_sink_path('scrubber', allow_override=True)],
                                                          backend_config=Configuration.get_configuration_path(backend_config_key))
                                locked_client.apply_scrubbing_result(scrubbing_work_result=res)
                            if work_units:
                                ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - {3} work units successfully applied'.format(vpool.name, storagerouter.name, vdisk.name, len(work_units)))
                            else:
                                ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - No scrubbing required'.format(vpool.name, storagerouter.name, vdisk.name))
                    except Exception:
                        if vdisk is None:
                            message = 'Scrubber - vPool {0} - StorageRouter {1} - vDisk with guid {2} could not be found'.format(vpool.name, storagerouter.name, vdisk_guid)
                        else:
                            message = 'Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - Scrubbing failed'.format(vpool.name, storagerouter.name, vdisk.name)
                        error_messages.append(message)
                        ScheduledTaskController._logger.exception(message)

        except Empty:  # Raised when all items have been fetched from the queue
            ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Queue completely processed'.format(vpool.name, storagerouter.name))
        except Exception:
            message = 'Scrubber - vPool {0} - StorageRouter {1} - Scrubbing failed'.format(vpool.name, storagerouter.name)
            error_messages.append(message)
            ScheduledTaskController._logger.exception(message)

        # Delete the proxy again
        try:
            with file_mutex(name='ovs_albaproxy_scrub', wait=lock_time):
                ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Removing service {2}'.format(vpool.name, storagerouter.name, alba_proxy_service))
                client = SSHClient(storagerouter, 'root')
                client.dir_delete(scrub_directory)
                if ServiceManager.has_service(alba_proxy_service, client=client):
                    ServiceManager.stop_service(alba_proxy_service, client=client)
                    ServiceManager.remove_service(alba_proxy_service, client=client)
                if Configuration.exists(scrub_config_key):
                    Configuration.delete(scrub_config_key)
                ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Removed service {2}'.format(vpool.name, storagerouter.name, alba_proxy_service))
        except Exception:
            message = 'Scrubber - vPool {0} - StorageRouter {1} - Removing service {2} failed'.format(vpool.name, storagerouter.name, alba_proxy_service)
            error_messages.append(message)
            ScheduledTaskController._logger.exception(message)

    @staticmethod
    @celery.task(name='ovs.scheduled.collapse_arakoon',
                 schedule=Schedule(minute='10',
                                   hour='0,2,4,6,8,10,12,14,16,18,20,22'))
    @ensure_single(task_name='ovs.scheduled.collapse_arakoon')
    def collapse_arakoon():
        """
        Collapse Arakoon's Tlogs
        :return: None
        """
        ScheduledTaskController._logger.info('Starting arakoon collapse')
        storagerouters = StorageRouterList.get_storagerouters()
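        # 'cacc' (the configuration management cluster) reads its config from a file on the local filesystem, hence filesystem=True; the other clusters' configs come from the configuration store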
        cluster_info = [('cacc', storagerouters[0], True)]
        cluster_names = []
        for service in ServiceList.get_services():
            if service.is_internal is True and service.type.name in (
                    ServiceType.SERVICE_TYPES.ARAKOON,
                    ServiceType.SERVICE_TYPES.NS_MGR,
                    ServiceType.SERVICE_TYPES.ALBA_MGR):
                cluster = service.name.replace('arakoon-', '')
                if cluster in cluster_names:
                    continue
                cluster_names.append(cluster)
                cluster_info.append((cluster, service.storagerouter, False))
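        # Group the clusters to collapse per node IP so every StorageRouter only needs to be contacted once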
        workload = {}
        for cluster, storagerouter, filesystem in cluster_info:
            ScheduledTaskController._logger.debug('  Collecting info for cluster {0}'.format(cluster))
            config = ArakoonClusterConfig(cluster, filesystem=filesystem)
            config.load_config(storagerouter.ip)
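            # Register every Arakoon node of this cluster under the IP it runs on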
            for node in config.nodes:
                if node.ip not in workload:
                    workload[node.ip] = {'node_id': node.name, 'clusters': []}
                workload[node.ip]['clusters'].append((cluster, filesystem))
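        # Run the collapse for every cluster hosted on each reachable StorageRouter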
        for storagerouter in storagerouters:
            try:
                if storagerouter.ip not in workload:
                    continue
                node_workload = workload[storagerouter.ip]
                client = SSHClient(storagerouter)
                for cluster, filesystem in node_workload['clusters']:
                    try:
                        ScheduledTaskController._logger.debug('  Collapsing cluster {0} on {1}'.format(cluster, storagerouter.ip))
                        if filesystem is True:
                            config_path = ArakoonClusterConfig.CONFIG_FILE.format(cluster)
                        else:
                            config_path = Configuration.get_configuration_path(ArakoonClusterConfig.CONFIG_KEY.format(cluster))
                        client.run(['arakoon', '--collapse-local', node_workload['node_id'], '2', '-config', config_path])
                        ScheduledTaskController._logger.info('  Collapsing cluster {0} on {1} completed'.format(cluster, storagerouter.ip))
                    except Exception:
                        ScheduledTaskController._logger.exception('  Collapsing cluster {0} on {1} failed'.format(cluster, storagerouter.ip))
            except UnableToConnectException:
                ScheduledTaskController._logger.error('  Could not collapse any cluster on {0} (not reachable)'.format(storagerouter.name))

        ScheduledTaskController._logger.info('Arakoon collapse finished')