Beispiel #1
0
    def restart_osd(node_guid, osd_id):
        """
        Restarts an OSD on a given Node
        :param node_guid: Guid of the node to restart an OSD on
        :type node_guid: str
        :param osd_id: ID of the OSD to restart
        :type osd_id: str
        :raises RuntimeError: When the OSD is not modelled, is not linked to the given node or when the node reports
                              that the restart failed
        :raises requests.ConnectionError: Re-raised (after logging) when the node could not be reached
        :raises requests.Timeout: Re-raised (after logging) when the request towards the node timed out
        :return: None
        :rtype: NoneType
        """
        node = AlbaNode(node_guid)
        osd = AlbaOSDList.get_by_osd_id(osd_id)
        # The lookup can yield None for an unknown OSD ID; report that the same way as an OSD living on another node
        if osd is None or osd.alba_node_guid != node.guid:
            raise RuntimeError('Could not locate OSD {0} on node {1}'.format(
                osd_id, node_guid))

        # Only the remote call can raise the connection related errors, so keep the try body limited to it
        try:
            result = node.client.restart_osd(osd.slot_id, osd.osd_id)
        except (requests.ConnectionError, requests.Timeout):
            AlbaNodeController._logger.warning(
                'Could not connect to node {0} to restart OSD'.format(
                    node.guid))
            raise

        if result['_success'] is False:
            AlbaNodeController._logger.error(
                'Error restarting OSD: {0}'.format(result['_error']))
            raise RuntimeError(result['_error'])
Beispiel #2
0
 def reset_osd(node_cluster_guid, node_guid, osd_id, expected_safety):
     # type: (str, str, str, Dict[str, int]) -> None
     """
     Removes and re-adds an OSD to a Disk
     When the active node cannot be reached for re-adding the OSD, a warning is logged and the stack is not synced
     towards the other nodes of the cluster
     :param node_cluster_guid: Guid of the AlbaNodeCluster
     :type node_cluster_guid: str
     :param node_guid: Guid of the node to reset an OSD of
     :type node_guid: str
     :param osd_id: OSD to reset
     :type osd_id: str
     :param expected_safety: Expected safety after having reset the disk
     :type expected_safety: dict
     :raises ValueError: When the requested node is not part of the requested AlbaNodeCluster
     :raises RuntimeError: When the slot of the OSD could not be found while re-adding the OSD
     :return: None
     :rtype: NoneType
     """
     node_cluster = AlbaNodeCluster(node_cluster_guid)
     active_node = AlbaNode(node_guid)
     if active_node not in node_cluster.alba_nodes:
         raise ValueError(
             'The requested active AlbaNode is not part of AlbaNodeCluster {0}'
             .format(node_cluster.guid))
     osd = AlbaOSDList.get_by_osd_id(osd_id)
     # Build the parameters for re-adding the OSD before the OSD itself is removed
     fill_slot_extra = active_node.client.build_slot_params(osd)
     disk_aliases = AlbaNodeClusterController.remove_osd(
         node_guid=node_guid,
         osd_id=osd_id,
         expected_safety=expected_safety)
     if len(disk_aliases) == 0:
         return
     try:
         active_node.client.fill_slot(osd.slot_id, fill_slot_extra)
     except (requests.ConnectionError, requests.Timeout):
         AlbaNodeClusterController._logger.warning(
             'Could not connect to node {0} to (re)configure ASD'.format(
                 active_node.guid))
         return
     except NotFoundError:
         # Can occur when the slot id could not be matched with an existing slot on the alba-asd manager
         # This error can be anticipated when the status of the osd would be 'missing' in the nodes stack but that would be too much overhead
         message = 'Could not add a new OSD. The requested slot {0} could not be found'.format(
             osd.slot_id)
         AlbaNodeClusterController._logger.warning(message)
         raise RuntimeError(
             '{0}. Slot {1} might no longer be present on Alba node {2}'.
             format(message, osd.slot_id, node_guid))
     # Invalidate the stack and sync towards all passive sides
     active_node.invalidate_dynamics('stack')
     for node in node_cluster.alba_nodes:
         if node != active_node:
             try:
                 node.client.sync_stack(active_node.stack)
             except Exception:
                 # Best effort: a failing passive side must not block the others
                 # 'Exception' instead of a bare except, so KeyboardInterrupt and SystemExit still propagate
                 AlbaNodeClusterController._logger.exception(
                     'Error while syncing stacks to the passive side')
Beispiel #3
0
    def remove_node(node_guid):
        """
        Deletes an ALBA node from the model
        For a node of type ASD, the modelled OSDs, the unavailable slots and the maintenance services are removed first
        Afterwards the 'live_status' of every ALBA Backend is invalidated and a maintenance agent checkup is scheduled
        :param node_guid: Guid of the ALBA node to remove
        :type node_guid: str
        :return: None
        :rtype: NoneType
        """
        alba_node = AlbaNode(node_guid)
        if alba_node.type == AlbaNode.NODE_TYPES.ASD:
            for slot_id, slot_data in alba_node.stack.iteritems():
                for osd_id, _ in slot_data['osds'].iteritems():
                    # OSDs which are not modelled are left untouched
                    if AlbaOSDList.get_by_osd_id(osd_id=osd_id) is None:
                        continue
                    AlbaNodeController.remove_osd(node_guid=alba_node.guid, osd_id=osd_id, expected_safety=None)
                if slot_data['available'] is False:
                    AlbaNodeController.remove_slot(node_guid=alba_node.guid, slot_id=slot_id)

            backend_guid_by_name = {backend.name: backend.guid for backend in AlbaBackendList.get_albabackends()}
            try:
                # Removing a maintenance service also removes its configuration from the configuration management
                alba_node.invalidate_dynamics('maintenance_services')
                for backend_name, services in alba_node.maintenance_services.iteritems():
                    for service_name, _ in services:
                        alba_node.client.remove_maintenance_service(
                            name=service_name,
                            alba_backend_guid=backend_guid_by_name.get(backend_name))
            except (requests.ConnectionError, requests.Timeout):
                AlbaNodeController._logger.exception(
                    'Could not connect to node {0} to retrieve the maintenance services'.format(alba_node.guid))
            except InvalidCredentialsError:
                AlbaNodeController._logger.warning(
                    'Failed to retrieve the maintenance services for ALBA node {0}'.format(alba_node.node_id))

        alba_node.delete()
        # The node is gone, so the cached status of every ALBA Backend (and its Backend) is refreshed
        for backend in AlbaBackendList.get_albabackends():
            backend.invalidate_dynamics(['live_status'])
            backend.backend.invalidate_dynamics(['live_status'])
        AlbaController.checkup_maintenance_agents.delay()
Beispiel #4
0
 def reset_osd(node_guid, osd_id, expected_safety):
     """
     Resets an OSD: the OSD is removed, after which its slot is filled again with the parameters of the removed OSD
     A node which cannot be reached while filling the slot only results in a logged warning
     :param node_guid: Guid of the node to reset an OSD of
     :type node_guid: str
     :param osd_id: OSD to reset
     :type osd_id: str
     :param expected_safety: Expected safety after having reset the disk
     :type expected_safety: dict
     :raises RuntimeError: When the slot of the OSD could not be found while filling it again
     :return: None
     :rtype: NoneType
     """
     alba_node = AlbaNode(node_guid)
     osd = AlbaOSDList.get_by_osd_id(osd_id)
     # The slot parameters are built up front, since the OSD gets removed in the next step
     slot_params = alba_node.client.build_slot_params(osd)
     removed = AlbaNodeController.remove_osd(node_guid=node_guid, osd_id=osd_id, expected_safety=expected_safety)
     if len(removed) == 0:
         return
     try:
         AlbaNodeController._fill_slot(alba_node, osd.slot_id, slot_params)
     except (requests.ConnectionError, requests.Timeout):
         AlbaNodeController._logger.warning(
             'Could not connect to node {0} to (re)configure ASD'.format(alba_node.guid))
     except NotFoundError:
         # Raised when the slot ID does not match any slot known by the alba-asd manager
         # Checking the stack for a 'missing' OSD status beforehand would avoid this, but is considered too much overhead
         error = 'Could not add a new OSD. The requested slot {0} could not be found'.format(osd.slot_id)
         AlbaNodeController._logger.warning(error)
         raise RuntimeError(
             '{0}. Slot {1} might no longer be present on Alba node {2}'.format(error, osd.slot_id, node_guid))
     alba_node.invalidate_dynamics('stack')
Beispiel #5
0
    def remove_osd(node_guid, osd_id, expected_safety):
        """
        Removes an OSD: the OSD is purged from its ALBA Backend, deleted on the node and removed from the model
        :param node_guid: Guid of the node to remove an OSD from
        :type node_guid: str
        :param osd_id: ID of the OSD to remove
        :type osd_id: str
        :param expected_safety: Expected safety after having removed the OSD. When None, the safety check is skipped
        :type expected_safety: dict or None
        :raises RuntimeError: When the OSD is not modelled, when the calculated safety differs from the expected
                              safety or when the node reports that the deletion failed
        :return: List containing the ID of the slot from which the OSD was removed
        :rtype: list
        """
        # Retrieve corresponding OSD in model
        node = AlbaNode(node_guid)
        AlbaNodeController._logger.debug('Removing OSD {0} at node {1}'.format(
            osd_id, node.ip))
        osd = AlbaOSDList.get_by_osd_id(osd_id)
        if osd is None:
            # The lookup can yield None for an unknown OSD ID; fail explicitly instead of with an AttributeError
            raise RuntimeError('Could not locate OSD {0} on node {1}'.format(
                osd_id, node_guid))
        alba_backend = osd.alba_backend
        # Remember the slot now, it is part of the return value and the OSD gets deleted from the model further on
        slot_id = osd.slot_id

        if expected_safety is None:
            AlbaNodeController._logger.warning(
                'Skipping safety check for OSD {0} on backend {1} - this is dangerous'
                .format(osd_id, alba_backend.guid))
        else:
            final_safety = AlbaController.calculate_safety(
                alba_backend_guid=alba_backend.guid, removal_osd_ids=[osd_id])
            safety_lost = final_safety['lost']
            safety_crit = final_safety['critical']
            # A removal without any critical or lost entries is always accepted,
            # otherwise the calculated safety must match what the caller expects
            if (safety_crit != 0 or safety_lost != 0) and (
                    safety_crit != expected_safety['critical']
                    or safety_lost != expected_safety['lost']):
                raise RuntimeError(
                    'Cannot remove OSD {0} as the current safety is not as expected ({1} vs {2})'
                    .format(osd_id, final_safety, expected_safety))
            AlbaNodeController._logger.debug(
                'Safety OK for OSD {0} on backend {1}'.format(
                    osd_id, alba_backend.guid))
        AlbaNodeController._logger.debug(
            'Purging OSD {0} on backend {1}'.format(osd_id, alba_backend.guid))
        AlbaController.remove_units(alba_backend_guid=alba_backend.guid,
                                    osd_ids=[osd_id])

        # Delete the OSD
        result = node.client.delete_osd(slot_id=slot_id, osd_id=osd_id)
        if result['_success'] is False:
            raise RuntimeError('Error removing OSD: {0}'.format(
                result['_error']))

        # Clean configuration management and model - Well, just try it at least
        # The presence of the OSD's configuration key triggers the removal of its whole configuration directory
        if Configuration.exists(ASD_CONFIG.format(osd_id), raw=True):
            Configuration.delete(ASD_CONFIG_DIR.format(osd_id), raw=True)

        osd.delete()
        node.invalidate_dynamics()
        if alba_backend is not None:
            alba_backend.invalidate_dynamics()
            alba_backend.backend.invalidate_dynamics()
        if node.storagerouter is not None:
            try:
                DiskController.sync_with_reality(
                    storagerouter_guid=node.storagerouter_guid)
            except UnableToConnectException:
                AlbaNodeController._logger.warning(
                    'Skipping disk sync since StorageRouter {0} is offline'.
                    format(node.storagerouter.name))

        return [slot_id]
Beispiel #6
0
    def migrate():
        """
        Executes async migrations. It doesn't matter too much when they are executed, as long as they get eventually
        executed. This code will typically contain:
        * "dangerous" migration code (it needs certain running services)
        * Migration code depending on a cluster-wide state
        * ...

        Migrations executed, in order:
        * Storing the IPs and port on modelled OSDs which do not have them yet
        * Read preference for the maintenance services
        * Storing the actual package name in the version files
        * Fix for a too high migration version in the configuration management
        * Enabling auto-cleanup on all ALBA Backends
        * Changing the cache eviction on all ALBA Backends
        * Syncing the disks and applying the BACKEND role on partitions used by OSDs

        Except for the first one, every migration is guarded by a flag under '/ovs/framework/migration' which is set
        once the migration succeeded. Failures are logged and never raised, so a failed migration is retried during
        the next run
        :return: None
        :rtype: NoneType
        """
        AlbaMigrationController._logger.info(
            'Preparing out of band migrations...')

        from ovs.dal.hybrids.diskpartition import DiskPartition
        from ovs.dal.lists.albabackendlist import AlbaBackendList
        from ovs.dal.lists.albanodelist import AlbaNodeList
        from ovs.dal.lists.albaosdlist import AlbaOSDList
        from ovs.dal.lists.storagerouterlist import StorageRouterList
        from ovs.extensions.generic.configuration import Configuration
        from ovs.extensions.generic.sshclient import SSHClient, UnableToConnectException
        from ovs.extensions.migration.migration.albamigrator import ExtensionMigrator
        from ovs.extensions.packages.albapackagefactory import PackageFactory
        from ovs.extensions.services.albaservicefactory import ServiceFactory
        from ovs.extensions.plugins.albacli import AlbaCLI, AlbaError
        from ovs.lib.alba import AlbaController
        from ovs.lib.disk import DiskController

        AlbaMigrationController._logger.info('Start out of band migrations...')

        #############################################
        # Introduction of IP:port combination on OSDs
        osd_info_map = {}
        alba_backends = AlbaBackendList.get_albabackends()
        for alba_backend in alba_backends:
            AlbaMigrationController._logger.info(
                'Verifying ALBA Backend {0}'.format(alba_backend.name))
            if alba_backend.abm_cluster is None:
                AlbaMigrationController._logger.warning(
                    'ALBA Backend {0} does not have an ABM cluster registered'.
                    format(alba_backend.name))
                continue

            AlbaMigrationController._logger.debug(
                'Retrieving configuration path for ALBA Backend {0}'.format(
                    alba_backend.name))
            try:
                config = Configuration.get_configuration_path(
                    alba_backend.abm_cluster.config_location)
            except Exception:
                # 'Exception' instead of a bare except, so KeyboardInterrupt and SystemExit still propagate
                AlbaMigrationController._logger.exception(
                    'Failed to retrieve the configuration path for ALBA Backend {0}'
                    .format(alba_backend.name))
                continue

            AlbaMigrationController._logger.info(
                'Retrieving OSD information for ALBA Backend {0}'.format(
                    alba_backend.name))
            try:
                osds_info = AlbaCLI.run(command='list-all-osds', config=config)
            except (AlbaError, RuntimeError):
                AlbaMigrationController._logger.exception(
                    'Failed to retrieve OSD information for ALBA Backend {0}'.
                    format(alba_backend.name))
                continue

            # Map the IPs and port of every OSD reported by ALBA on its long ID
            for osd_info in osds_info:
                long_id = osd_info.get('long_id')
                if long_id:
                    osd_info_map[long_id] = {
                        'ips': osd_info.get('ips', []),
                        'port': osd_info.get('port')
                    }

        for osd in AlbaOSDList.get_albaosds():
            if osd.osd_id not in osd_info_map:
                AlbaMigrationController._logger.warning(
                    'OSD with ID {0} is modelled but could not be found through ALBA'
                    .format(osd.osd_id))
                continue

            ips = osd_info_map[osd.osd_id]['ips']
            port = osd_info_map[osd.osd_id]['port']
            # Only fill in what is missing, already modelled values are left untouched
            changes = False
            if osd.ips is None:
                changes = True
                osd.ips = ips
            if osd.port is None:
                changes = True
                osd.port = port
            if changes is True:
                AlbaMigrationController._logger.info(
                    'Updating OSD with ID {0} with IPS {1} and port {2}'.
                    format(osd.osd_id, ips, port))
                osd.save()

        ###################################################
        # Read preference for GLOBAL ALBA Backends (1.10.3)  (https://github.com/openvstorage/framework-alba-plugin/issues/452)
        if Configuration.get(key='/ovs/framework/migration|read_preference',
                             default=False) is False:
            try:
                name_backend_map = dict((alba_backend.name, alba_backend)
                                        for alba_backend in alba_backends)
                for alba_node in AlbaNodeList.get_albanodes():
                    AlbaMigrationController._logger.info(
                        'Processing maintenance services running on ALBA Node {0} with ID {1}'
                        .format(alba_node.ip, alba_node.node_id))
                    alba_node.invalidate_dynamics('maintenance_services')
                    for alba_backend_name, services in alba_node.maintenance_services.iteritems(
                    ):
                        if alba_backend_name not in name_backend_map:
                            AlbaMigrationController._logger.error(
                                'ALBA Node {0} has services for an ALBA Backend {1} which is not modelled'
                                .format(alba_node.ip, alba_backend_name))
                            continue

                        alba_backend = name_backend_map[alba_backend_name]
                        AlbaMigrationController._logger.info(
                            'Processing {0} ALBA Backend {1} with GUID {2}'.
                            format(alba_backend.scaling, alba_backend.name,
                                   alba_backend.guid))
                        if alba_backend.scaling == alba_backend.SCALINGS.LOCAL:
                            read_preferences = [alba_node.node_id]
                        else:
                            read_preferences = AlbaController.get_read_preferences_for_global_backend(
                                alba_backend=alba_backend,
                                alba_node_id=alba_node.node_id,
                                read_preferences=[])

                        # Copy the shared (old) maintenance config to a config per service, extended with the read preference
                        for service_name, _ in services:
                            AlbaMigrationController._logger.info(
                                'Processing service {0}'.format(service_name))
                            old_config_key = '/ovs/alba/backends/{0}/maintenance/config'.format(
                                alba_backend.guid)
                            new_config_key = '/ovs/alba/backends/{0}/maintenance/{1}/config'.format(
                                alba_backend.guid, service_name)
                            if Configuration.exists(key=old_config_key):
                                new_config = Configuration.get(
                                    key=old_config_key)
                                new_config[
                                    'read_preference'] = read_preferences
                                Configuration.set(key=new_config_key,
                                                  value=new_config)
                # The shared config is only removed after all services of all nodes have been processed
                for alba_backend in alba_backends:
                    Configuration.delete(
                        key='/ovs/alba/backends/{0}/maintenance/config'.format(
                            alba_backend.guid))
                AlbaController.checkup_maintenance_agents.delay()

                Configuration.set(
                    key='/ovs/framework/migration|read_preference', value=True)
            except Exception:
                AlbaMigrationController._logger.exception(
                    'Updating read preferences for ALBA Backends failed')

        #######################################################
        # Storing actual package name in version files (1.11.0) (https://github.com/openvstorage/framework/issues/1876)
        changed_clients = set()
        storagerouters = StorageRouterList.get_storagerouters()
        if Configuration.get(
                key=
                '/ovs/framework/migration|actual_package_name_in_version_file_alba',
                default=False) is False:
            try:
                service_manager = ServiceFactory.get_manager()
                alba_pkg_name, alba_version_cmd = PackageFactory.get_package_and_version_cmd_for(
                    component=PackageFactory.COMP_ALBA)
                for storagerouter in storagerouters:
                    try:
                        # NOTE(review): the username literal looks redacted in this copy of the code - confirm the intended value
                        root_client = SSHClient(
                            endpoint=storagerouter.ip, username='******'
                        )  # Use '.ip' instead of StorageRouter object because this code is executed during post-update at which point the heartbeat has not been updated for some time
                    except UnableToConnectException:
                        AlbaMigrationController._logger.exception(
                            'Updating actual package name for version files failed on StorageRouter {0}'
                            .format(storagerouter.ip))
                        continue

                    for file_name in root_client.file_list(
                            directory=ServiceFactory.RUN_FILE_DIR):
                        if not file_name.endswith('.version'):
                            continue
                        file_path = '{0}/{1}'.format(
                            ServiceFactory.RUN_FILE_DIR, file_name)
                        contents = root_client.file_read(filename=file_path)
                        if alba_pkg_name == PackageFactory.PKG_ALBA_EE and '{0}='.format(
                                PackageFactory.PKG_ALBA) in contents:
                            # Rewrite the version file in the RUN_FILE_DIR
                            contents = contents.replace(
                                PackageFactory.PKG_ALBA,
                                PackageFactory.PKG_ALBA_EE)
                            root_client.file_write(filename=file_path,
                                                   contents=contents)

                            # Regenerate the service and update the EXTRA_VERSION_CMD in the configuration management
                            service_name = file_name.split('.')[0]
                            service_config_key = ServiceFactory.SERVICE_CONFIG_KEY.format(
                                storagerouter.machine_id, service_name)
                            if Configuration.exists(key=service_config_key):
                                service_config = Configuration.get(
                                    key=service_config_key)
                                if 'EXTRA_VERSION_CMD' in service_config:
                                    service_config[
                                        'EXTRA_VERSION_CMD'] = '{0}=`{1}`'.format(
                                            alba_pkg_name, alba_version_cmd)
                                    Configuration.set(key=service_config_key,
                                                      value=service_config)
                                    service_manager.regenerate_service(
                                        name='ovs-arakoon',
                                        client=root_client,
                                        target_name='ovs-{0}'.format(
                                            service_name)
                                    )  # Leave out .version
                                    changed_clients.add(root_client)
                Configuration.set(
                    key=
                    '/ovs/framework/migration|actual_package_name_in_version_file_alba',
                    value=True)
            except Exception:
                AlbaMigrationController._logger.exception(
                    'Updating actual package name for version files failed')

        # Services were regenerated through these clients, so have systemd reload its unit files
        for root_client in changed_clients:
            try:
                root_client.run(['systemctl', 'daemon-reload'])
            except Exception:
                AlbaMigrationController._logger.exception(
                    'Executing command "systemctl daemon-reload" failed')

        ####################################
        # Fix for migration version (1.11.0)
        # Previous code could potentially store a higher version number in the config management than the actual version number
        if Configuration.get(
                key='/ovs/framework/migration|alba_migration_version_fix',
                default=False) is False:
            try:
                for storagerouter in storagerouters:
                    config_key = '/ovs/framework/hosts/{0}/versions'.format(
                        storagerouter.machine_id)
                    if Configuration.exists(key=config_key):
                        versions = Configuration.get(key=config_key)
                        if versions.get(PackageFactory.COMP_MIGRATION_ALBA,
                                        0) > ExtensionMigrator.THIS_VERSION:
                            versions[
                                PackageFactory.
                                COMP_MIGRATION_ALBA] = ExtensionMigrator.THIS_VERSION
                            Configuration.set(key=config_key, value=versions)
                Configuration.set(
                    key='/ovs/framework/migration|alba_migration_version_fix',
                    value=True)
            except Exception:
                AlbaMigrationController._logger.exception(
                    'Updating migration version failed')

        ####################################
        # Enable auto-cleanup
        migration_auto_cleanup_key = '/ovs/framework/migration|alba_auto_cleanup'
        if Configuration.get(key=migration_auto_cleanup_key,
                             default=False) is False:
            try:
                for storagerouter in StorageRouterList.get_storagerouters():
                    storagerouter.invalidate_dynamics(
                        'features')  # New feature was added
                errors = []
                for alba_backend in AlbaBackendList.get_albabackends():
                    try:
                        AlbaController.set_auto_cleanup(alba_backend.guid)
                    except Exception as ex:
                        AlbaMigrationController._logger.exception(
                            'Failed to set the auto-cleanup for ALBA Backend {0}'
                            .format(alba_backend.name))
                        errors.append(ex)
                # Only flag the migration as done when it succeeded for every ALBA Backend
                if len(errors) == 0:
                    Configuration.set(key=migration_auto_cleanup_key,
                                      value=True)
            except Exception:
                AlbaMigrationController._logger.exception(
                    'Updating auto cleanup failed')

        ####################################
        # Change cache eviction
        migration_random_eviction_key = '/ovs/framework/migration|alba_cache_eviction_random'
        if Configuration.get(key=migration_random_eviction_key,
                             default=False) is False:
            try:
                errors = []
                for alba_backend in AlbaBackendList.get_albabackends():
                    try:
                        AlbaController.set_cache_eviction(alba_backend.guid)
                    except Exception as ex:
                        AlbaMigrationController._logger.exception(
                            'Failed to set the cache eviction for ALBA Backend {0}'
                            .format(alba_backend.name))
                        errors.append(ex)
                # Only flag the migration as done when it succeeded for every ALBA Backend
                if len(errors) == 0:
                    Configuration.set(key=migration_random_eviction_key,
                                      value=True)
            except Exception:
                AlbaMigrationController._logger.exception(
                    'Updating cache eviction failed')

        ###################################################
        # Sync all disks and apply the backend role. Backend role was removed with the AD (since 1.10)
        albanode_backend_role_sync_key = '/ovs/framework/migration|albanode_backend_role_sync'
        if not Configuration.get(key=albanode_backend_role_sync_key,
                                 default=False):
            try:
                errors = []
                for alba_node in AlbaNodeList.get_albanodes():
                    try:
                        # Nodes without a StorageRouter have no modelled disks to apply the role on
                        if not alba_node.storagerouter:
                            continue
                        stack = alba_node.client.get_stack()  # type: dict
                        for slot_id, slot_information in stack.iteritems():
                            osds = slot_information.get('osds',
                                                        {})  # type: dict
                            slot_aliases = slot_information.get(
                                'aliases', [])  # type: list
                            if not osds:  # No osds means no partition was made
                                continue
                            # Sync to add all potential partitions that will need a backend role
                            DiskController.sync_with_reality(
                                storagerouter_guid=alba_node.storagerouter_guid
                            )
                            # A disk belongs to the slot as soon as both share at least one alias
                            for disk in alba_node.storagerouter.disks:
                                if set(disk.aliases).intersection(
                                        set(slot_aliases)):
                                    partition = disk.partitions[0]
                                    if DiskPartition.ROLES.BACKEND not in partition.roles:
                                        partition.roles.append(
                                            DiskPartition.ROLES.BACKEND)
                                        partition.save()
                    except Exception as ex:
                        AlbaMigrationController._logger.exception(
                            'Syncing for storagerouter/albanode {0} failed'.
                            format(alba_node.storagerouter.ip))
                        errors.append(ex)
                # Only flag the migration as done when it succeeded for every ALBA Node
                if not errors:
                    Configuration.set(key=albanode_backend_role_sync_key,
                                      value=True)
            except Exception:
                AlbaMigrationController._logger.exception(
                    'Syncing up the disks for backend roles failed')

        AlbaMigrationController._logger.info('Finished out of band migrations')