Exemple #1
0
    def remove_slot(node_guid, slot_id):
        """
        Removes a slot from an ALBA node and strips the BACKEND role from the matching modeled disks
        :param node_guid: Guid of the node to remove a slot from
        :type node_guid: str
        :param slot_id: Slot ID
        :type slot_id: str
        :return: None
        :rtype: NoneType
        :raises RuntimeError: When at least 1 modeled OSD of the node still refers to the slot
        """
        node = AlbaNode(node_guid)
        # A slot can only be cleared when no modeled OSD refers to it anymore
        if any(osd.slot_id == slot_id for osd in node.osds):
            raise RuntimeError('A slot with claimed OSDs can\'t be removed')

        node.client.clear_slot(slot_id)

        node.invalidate_dynamics()
        # Sync model
        if node.storagerouter is not None:
            stack = node.client.get_stack()  # type: dict
            slot_aliases = set(stack.get(slot_id, {}).get('aliases', []))
            for disk in node.storagerouter.disks:
                if not slot_aliases.intersection(disk.aliases):
                    continue
                if len(disk.partitions) == 0:
                    # No partition to strip the role from. Indexing would raise an IndexError and skip the sync below
                    continue
                partition = disk.partitions[0]
                if DiskPartition.ROLES.BACKEND in partition.roles:
                    partition.roles.remove(DiskPartition.ROLES.BACKEND)
                    partition.save()
            DiskController.sync_with_reality(storagerouter_guid=node.storagerouter_guid)
    def remove_disk(node_guid, device_alias):
        """
        Removes a disk from an ALBA node and cleans up the related model objects
        :param node_guid: Guid of the node to remove a disk from
        :type node_guid: str
        :param device_alias: Alias of the device to remove  (eg: /dev/disk/by-path/pci-0000:03:00.0-sas-0x5000c29f4cf04566-lun-0)
        :type device_alias: str
        :return: None
        :rtype: NoneType
        :raises RuntimeError: When an ASD on the disk is in a state which blocks the removal or when the node reports a failure while removing the disk
        """
        asds = {}
        node = AlbaNode(node_guid)
        node_id = node.node_id
        device_id = device_alias.split('/')[-1]
        offline_node = False

        # Verify client connectivity (the result itself is not needed)
        try:
            node.client.get_disks()
        except (requests.ConnectionError, requests.Timeout, InvalidCredentialsError):
            AlbaNodeController._logger.warning('Could not connect to node {0} to validate disks'.format(node.guid))
            offline_node = True

        # Retrieve ASD information for the ALBA Disk
        for backend in AlbaBackendList.get_albabackends():
            local_stack = backend.local_stack
            if node_id in local_stack and device_id in local_stack[node_id]:
                asds.update(local_stack[node_id][device_id]['asds'])
        for asd_info in asds.values():
            if (offline_node is False and asd_info.get('status') != 'available') or (offline_node is True and asd_info.get('status_detail') == 'nodedown'):
                AlbaNodeController._logger.error('Disk {0} has still non-available ASDs on node {1}'.format(device_alias, node.ip))
                raise RuntimeError('Disk {0} on ALBA node {1} has still some non-available ASDs'.format(device_alias, node_id))

        # Retrieve the Disk from the framework model matching the ALBA Disk (None when the disk is not modeled)
        disk_to_clear = next((disk for disk in DiskList.get_disks() if device_alias in disk.aliases), None)

        # Remove the ALBA Disk making use of the ASD Manager Client
        if offline_node is False:
            # The disk might not be modeled or might not have any partitions, in which case no partition aliases can be passed
            partition_aliases = []
            if disk_to_clear is not None and len(disk_to_clear.partitions) > 0:
                partition_aliases = disk_to_clear.partitions[0].aliases
            result = node.client.remove_disk(disk_id=device_id, partition_aliases=partition_aliases)
            if result['_success'] is False:
                raise RuntimeError('Error removing disk {0}: {1}'.format(device_alias, result['_error']))

        # Clean the model
        for model_disk in node.disks:
            if device_alias in model_disk.aliases:
                for osd in model_disk.osds:
                    osd.delete()
                model_disk.delete()
        if disk_to_clear is not None:
            for partition in disk_to_clear.partitions:
                partition.roles = []
                partition.mountpoint = None
                partition.save()
        node.invalidate_dynamics()
        if node.storagerouter is not None:
            DiskController.sync_with_reality(storagerouter_guid=node.storagerouter_guid)
 def initialize_disks(node_guid, disks):
     """
     Initializes 1 or multiple disks: adds each disk on the node, models it and deploys the requested amount of ASDs
     :param node_guid: Guid of the node to which the disks belong
     :type node_guid: str
     :param disks: Disks to initialize  (key: device_alias, value: amount of ASDs to deploy)
     :type disks: dict
     :return: Dict of all failures with as key the Diskname, and as value the error
     :rtype: dict
     :raises requests.ConnectionError: When the node cannot be reached to list its disks
     :raises requests.Timeout: When listing the disks of the node times out
     """
     node = AlbaNode(node_guid)
     try:
         available_disks = node.client.get_disks()
     except (requests.ConnectionError, requests.Timeout):
         AlbaNodeController._logger.exception('Could not connect to node {0} to validate disks'.format(node.guid))
         raise
     failures = {}
     added_disks = []
     for device_alias, amount in disks.iteritems():
         device_id = device_alias.split('/')[-1]
         AlbaNodeController._logger.debug('Initializing disk {0} at node {1}'.format(device_alias, node.ip))
         if device_id not in available_disks or available_disks[device_id]['available'] is False:
             # No exception is being handled at this point, so log a plain error instead of an empty traceback
             AlbaNodeController._logger.error('Disk {0} not available on node {1}'.format(device_alias, node.ip))
             failures[device_alias] = 'Disk unavailable'
             continue

         add_disk_result = node.client.add_disk(disk_id=device_id)
         # Verify if an AlbaDisk with found aliases already exists (eg: When initialize individual and initialize all run at the same time)
         aliases = add_disk_result['aliases']
         if any(set(alba_disk.aliases).intersection(aliases) for alba_disk in node.disks):
             continue

         disk = AlbaDisk()
         disk.aliases = aliases
         disk.alba_node = node
         disk.save()
         if add_disk_result['_success'] is False:
             failures[device_alias] = add_disk_result['_error']
             disk.delete()
             continue

         device_id = disk.aliases[0].split('/')[-1]
         for _ in xrange(amount):
             add_asd_result = node.client.add_asd(disk_id=device_id)
             if add_asd_result['_success'] is False:
                 failures[device_alias] = add_asd_result['_error']
         added_disks.extend(aliases)

     if node.storagerouter is not None:
         DiskController.sync_with_reality(storagerouter_guid=node.storagerouter_guid)
         added_aliases = set(added_disks)
         for disk in node.storagerouter.disks:
             if not added_aliases.intersection(disk.aliases):
                 continue
             if len(disk.partitions) == 0:
                 # Nothing to flag with the BACKEND role. Indexing would raise an IndexError and lose the failures
                 continue
             partition = disk.partitions[0]
             if DiskPartition.ROLES.BACKEND not in partition.roles:
                 partition.roles.append(DiskPartition.ROLES.BACKEND)
                 partition.save()
     return failures
 def refresh_hardware(storagerouter_guid):
     """
     Re-detects the hardware of a StorageRouter: first its RDMA capability, afterwards its disks
     :param storagerouter_guid: Guid of the StorageRouter to refresh the hardware on
     :type storagerouter_guid: str
     :return: None
     :rtype: NoneType
     """
     guid = storagerouter_guid
     # RDMA capability first
     StorageRouterController.set_rdma_capability(guid)
     # Then bring the modeled disks in line with what is found on the StorageRouter
     DiskController.sync_with_reality(guid)
    def initialize_disks(node_guid, disks):
        """
        Initializes 1 or multiple disks: models each disk, adds it on the node and deploys the requested amount of ASDs
        :param node_guid: Guid of the node which disks need to be initialized
        :type node_guid: str

        :param disks: Disks to initialize  (key: disk ID, value: amount of ASDs to deploy)
        :type disks: dict

        :return: Dict of all failures with as key the Diskname, and as value the error
        :rtype: dict
        :raises requests.ConnectionError: When the node cannot be reached to list its disks
        :raises requests.Timeout: When listing the disks of the node times out
        """
        node = AlbaNode(node_guid)
        try:
            available_disks = node.client.get_disks()
        except (requests.ConnectionError, requests.Timeout):
            AlbaNodeController._logger.exception('Could not connect to node {0} to validate disks'.format(node.guid))
            raise
        failures = {}
        added_disks = []
        for disk_id, amount in disks.iteritems():
            AlbaNodeController._logger.debug('Initializing disk {0} at node {1}'.format(disk_id, node.ip))
            if disk_id not in available_disks or available_disks[disk_id]['available'] is False:
                # No exception is being handled at this point, so log a plain error instead of an empty traceback
                AlbaNodeController._logger.error('Disk {0} not available on node {1}'.format(disk_id, node.ip))
                failures[disk_id] = 'Disk unavailable'
                continue

            disk = AlbaDisk()
            disk.name = disk_id
            disk.alba_node = node
            disk.save()
            result = node.client.add_disk(disk_id)
            if result['_success'] is False:
                failures[disk_id] = result['_error']
                disk.delete()
                continue

            device = result['device']
            for _ in xrange(amount):
                result = node.client.add_asd(disk_id)
                if result['_success'] is False:
                    failures[disk_id] = result['_error']
            added_disks.append(device)

        if node.storagerouter is not None:
            DiskController.sync_with_reality(node.storagerouter_guid)
            for disk in node.storagerouter.disks:
                if disk.path not in added_disks:
                    continue
                if len(disk.partitions) == 0:
                    # Nothing to flag with the BACKEND role. Indexing would raise an IndexError and lose the failures
                    continue
                partition = disk.partitions[0]
                # Only add the role once, otherwise repeated initializations duplicate it
                if DiskPartition.ROLES.BACKEND not in partition.roles:
                    partition.roles.append(DiskPartition.ROLES.BACKEND)
                    partition.save()
        return failures
Exemple #6
0
    def _fill_slot(cls, node, slot_id, extra):
        # type: (AlbaNode, str, any) -> List[dict]
        """
        Fills in the slots with ASDs and checks if the BACKEND role needs to be added
        For nodes of type S3, the configuration path of the first S3 transaction cluster is passed along as 'transaction_arakoon_url'
        :param node: The AlbaNode to fill on
        :type node: AlbaNode
        :param slot_id: ID of the slot to fill (which is an alias of the slot)
        :type slot_id: str
        :param extra: Extra information for filling
        :type extra: any
        :return: Information about the created osds
        :rtype: List[dict]
        :raises RuntimeError: When the node is of type S3 and no S3 transaction cluster is found
        """
        if node.type == AlbaNode.NODE_TYPES.S3:
            # Work on a copy so the caller's data is not changed
            extra = extra.copy()
            s3_transaction_clusters = S3TransactionClusterList.get_s3_transaction_clusters()
            # Only guard the lookup itself, so an IndexError raised elsewhere is not reported as a missing cluster
            try:
                s3_transaction_cluster = s3_transaction_clusters[0]
            except IndexError:
                raise RuntimeError('No transaction arakoon was deployed for this cluster!')
            extra['transaction_arakoon_url'] = Configuration.get_configuration_path(key=s3_transaction_cluster.config_location)
        created_osds = node.client.fill_slot(slot_id=slot_id, extra=extra)
        cls._logger.info(created_osds)

        # Sync model
        if node.storagerouter is not None:
            stack = node.client.get_stack()  # type: dict
            DiskController.sync_with_reality(storagerouter_guid=node.storagerouter_guid)
            slot_aliases = set(stack.get(slot_id, {}).get('aliases', []))
            for disk in node.storagerouter.disks:
                if not slot_aliases.intersection(disk.aliases):
                    continue
                if len(disk.partitions) == 0:
                    # Nothing to flag with the BACKEND role. Indexing would raise an IndexError and lose the created OSD information
                    continue
                partition = disk.partitions[0]
                if DiskPartition.ROLES.BACKEND not in partition.roles:
                    partition.roles.append(DiskPartition.ROLES.BACKEND)
                    partition.save()
        return created_osds or []  # Always return a list
    def remove_disk(node_guid, disk):
        """
        Removes a disk from a node, together with the modeled disk and its modeled ASDs
        When the node cannot be reached, only the model is cleaned up
        :param node_guid: Guid of the node to remove a disk from
        :type node_guid: str

        :param disk: Disk name to remove
        :type disk: str

        :return: None
        """
        alba_node = AlbaNode(node_guid)

        # Check whether the node knows the disk; an unreachable node is tolerated
        node_is_offline = False
        try:
            known_disks = alba_node.client.get_disks()
            if disk not in known_disks:
                raise RuntimeError('Disk {0} not available on node {1}'.format(disk, alba_node.guid))
        except (requests.ConnectionError, requests.Timeout):
            AlbaNodeController._logger.warning('Could not connect to node {0} to validate disks'.format(alba_node.guid))
            node_is_offline = True

        # Gather the ASDs of this disk as seen by every ALBA backend
        node_id = alba_node.node_id
        asds_on_disk = {}
        for alba_backend in AlbaBackendList.get_albabackends():
            stack = alba_backend.storage_stack
            if node_id not in stack:
                continue
            if disk in stack[node_id]:
                asds_on_disk.update(stack[node_id][disk]['asds'])

        # Refuse removal while an ASD is in a blocking state
        for asd_info in asds_on_disk.values():
            if node_is_offline is False:
                blocking = asd_info['status'] != 'available'
            else:
                blocking = asd_info['status_detail'] == 'nodedown'
            if blocking:
                AlbaNodeController._logger.error('Disk {0} has still non-available ASDs on node {1}'.format(disk, alba_node.ip))
                raise RuntimeError('Disk {0} has still some non-available ASDs'.format(disk))

        # Remove the disk on the node itself when it is reachable
        if node_is_offline is False:
            removal = alba_node.client.remove_disk(disk)
            if removal['_success'] is False:
                raise RuntimeError('Error removing disk {0}: {1}'.format(disk, removal['_error']))

        # Clean the model
        for model_disk in alba_node.disks:
            if model_disk.name != disk:
                continue
            for model_asd in model_disk.asds:
                model_asd.delete()
            model_disk.delete()
        alba_node.invalidate_dynamics()

        if alba_node.storagerouter is not None:
            DiskController.sync_with_reality(alba_node.storagerouter_guid)
Exemple #8
0
    def remove_slot(node_cluster_guid, node_guid, slot_id):
        # type: (str, str, str) -> None
        """
        Removes a slot through the 'active' AlbaNode of an AlbaNodeCluster and syncs the resulting stack to the other nodes
        :param node_cluster_guid: Guid of the node cluster to remove a slot from
        :type node_cluster_guid: str
        :param node_guid: Guid of the AlbaNode to act as the 'active' side
        :type node_guid: basestring
        :param slot_id: Slot ID
        :type slot_id: str
        :return: None
        :rtype: NoneType
        :raises ValueError: When the active AlbaNode is not part of the AlbaNodeCluster
        :raises RuntimeError: When at least 1 modeled OSD of the active node still refers to the slot
        """
        node_cluster = AlbaNodeCluster(node_cluster_guid)
        active_node = AlbaNode(node_guid)
        if active_node not in node_cluster.alba_nodes:
            raise ValueError('The requested active AlbaNode is not part of AlbaNodeCluster {0}'.format(node_cluster.guid))
        if any(osd.slot_id == slot_id for osd in active_node.osds):
            raise RuntimeError('A slot with claimed OSDs can\'t be removed')

        active_node.client.clear_slot(slot_id)
        active_node.invalidate_dynamics()
        # Invalidate the stack and sync towards all passive sides
        for node in node_cluster.alba_nodes:
            if node != active_node:
                # Best effort: a failing passive side must not block the others.
                # Only catch Exception so SystemExit and KeyboardInterrupt still propagate
                try:
                    node.client.sync_stack(active_node.stack)
                except Exception:
                    AlbaNodeClusterController._logger.exception('Error while syncing stacks to the passive side')
        if active_node.storagerouter is not None:
            DiskController.sync_with_reality(storagerouter_guid=active_node.storagerouter_guid)
    def remove_asd(node_guid, asd_id, expected_safety):
        """
        Removes an ASD: purges it from its ALBA backend, deletes it on the node, removes its configuration and cleans the model
        :param node_guid: Guid of the node to remove an ASD from
        :type node_guid: str
        :param asd_id: ID of the ASD to remove
        :type asd_id: str
        :param expected_safety: Expected safety after having removed the ASD (keys 'critical' and 'lost' are compared). None skips the safety check
        :type expected_safety: dict or None
        :return: Aliases of the disk on which the ASD was removed
        :rtype: list
        :raises RuntimeError: When the calculated safety differs from the expected safety, when the disk hosting the ASD cannot be found
                              or when the node reports a failure while deleting the ASD
        """
        node = AlbaNode(node_guid)
        AlbaNodeController._logger.debug('Removing ASD {0} at node {1}'.format(asd_id, node.ip))

        # Find the modeled OSD (if any) and the ALBA backend it is linked to
        model_osd = next((osd for disk in node.disks for osd in disk.osds if osd.osd_id == asd_id), None)
        alba_backend = model_osd.alba_backend if model_osd is not None else None

        # Find the partition hosting the ASD. An unreachable node leaves partition_alias at None
        asds = {}
        try:
            asds = node.client.get_asds()
        except (requests.ConnectionError, requests.Timeout, InvalidCredentialsError):
            AlbaNodeController._logger.warning('Could not connect to node {0} to validate ASD'.format(node.guid))
        partition_alias = None
        for alias, asd_ids in asds.iteritems():
            if asd_id in asd_ids:
                partition_alias = alias
                break

        if alba_backend is not None:
            if expected_safety is None:
                AlbaNodeController._logger.warning('Skipping safety check for ASD {0} on backend {1} - this is dangerous'.format(asd_id, alba_backend.guid))
            else:
                final_safety = AlbaController.calculate_safety(alba_backend_guid=alba_backend.guid,
                                                               removal_osd_ids=[asd_id])
                safety_lost = final_safety['lost']
                safety_crit = final_safety['critical']
                # Only refuse when safety would be impacted AND the impact is not what the caller expected
                if (safety_crit != 0 or safety_lost != 0) and (safety_crit != expected_safety['critical'] or safety_lost != expected_safety['lost']):
                    raise RuntimeError('Cannot remove ASD {0} as the current safety is not as expected ({1} vs {2})'.format(asd_id, final_safety, expected_safety))
                AlbaNodeController._logger.debug('Safety OK for ASD {0} on backend {1}'.format(asd_id, alba_backend.guid))
            AlbaNodeController._logger.debug('Purging ASD {0} on backend {1}'.format(asd_id, alba_backend.guid))
            AlbaController.remove_units(alba_backend_guid=alba_backend.guid,
                                        osd_ids=[asd_id])
        else:
            AlbaNodeController._logger.warning('Could not match ASD {0} to any backend. Cannot purge'.format(asd_id))

        disk_data = None
        if partition_alias is not None:
            AlbaNodeController._logger.debug('Removing ASD {0} from disk {1}'.format(asd_id, partition_alias))
            for device_info in node.client.get_disks().itervalues():
                if partition_alias in device_info['partition_aliases']:
                    disk_data = device_info
                    result = node.client.delete_asd(disk_id=device_info['aliases'][0].split('/')[-1],
                                                    asd_id=asd_id)
                    if result['_success'] is False:
                        raise RuntimeError('Error removing ASD: {0}'.format(result['_error']))
            # disk_data starts out as None, so comparing against {} could never detect a missing disk
            if disk_data is None:
                raise RuntimeError('Failed to find disk for partition with alias {0}'.format(partition_alias))
        else:
            AlbaNodeController._logger.warning('Could not remove ASD {0} from remote node (node down)'.format(asd_id))

        # The existence check uses the ASD config key, the removal targets the ASD config directory
        if Configuration.exists(AlbaNodeController.ASD_CONFIG.format(asd_id), raw=True):
            Configuration.delete(AlbaNodeController.ASD_CONFIG_DIR.format(asd_id), raw=True)

        if model_osd is not None:
            model_osd.delete()
        if alba_backend is not None:
            alba_backend.invalidate_dynamics()
            alba_backend.backend.invalidate_dynamics()
        if node.storagerouter is not None:
            DiskController.sync_with_reality(storagerouter_guid=node.storagerouter_guid)

        return [] if disk_data is None else disk_data.get('aliases', [])
Exemple #10
0
    def add_vpool(cls, parameters):
        """
        Creates a new vPool or extends an existing vPool on the StorageRouter identified by parameters['storagerouter_ip']
        Keys read from the parameters: storagerouter_ip, vpool_name, storage_ip, caching_info, backend_info, backend_info_bc,
        backend_info_fc, connection_info, connection_info_bc, connection_info_fc, config_params, mds_config_params,
        writecache_size and parallelism
        :param parameters: Parameters for vPool creation
        :type parameters: dict
        :return: None
        :rtype: NoneType
        :raises ValueError: When the parameters are not passed as a dict
        :raises RuntimeError: When the StorageRouter cannot be found or reached, or when the voldrv Arakoon checkup does not succeed in time
        """
        # TODO: Add logging
        cls._logger.debug('Adding vpool. Parameters: {}'.format(parameters))
        # VALIDATIONS
        if not isinstance(parameters, dict):
            raise ValueError('Parameters passed to create a vPool should be of type dict')

        # Check StorageRouter existence
        storagerouter = StorageRouterList.get_by_ip(ip=parameters.get('storagerouter_ip'))
        if storagerouter is None:
            raise RuntimeError('Could not find StorageRouter')

        # Validate requested vPool configurations
        vp_installer = VPoolInstaller(name=parameters.get('vpool_name'))
        vp_installer.validate(storagerouter=storagerouter)

        # Validate requested StorageDriver configurations
        cls._logger.info('vPool {0}: Validating StorageDriver configurations'.format(vp_installer.name))
        sd_installer = StorageDriverInstaller(
            vp_installer=vp_installer,
            configurations={
                'storage_ip': parameters.get('storage_ip'),
                'caching_info': parameters.get('caching_info'),
                'backend_info': {
                    'main': parameters.get('backend_info'),
                    StorageDriverConfiguration.CACHE_BLOCK: parameters.get('backend_info_bc'),
                    StorageDriverConfiguration.CACHE_FRAGMENT: parameters.get('backend_info_fc')
                },
                'connection_info': {
                    'main': parameters.get('connection_info'),
                    StorageDriverConfiguration.CACHE_BLOCK: parameters.get('connection_info_bc'),
                    StorageDriverConfiguration.CACHE_FRAGMENT: parameters.get('connection_info_fc')
                },
                'sd_configuration': parameters.get('config_params')
            })

        partitions_mutex = volatile_mutex('add_vpool_partitions_{0}'.format(storagerouter.guid))
        try:
            # VPOOL CREATION
            # Create the vPool as soon as possible in the process to be displayed in the GUI (INSTALLING/EXTENDING state)
            if vp_installer.is_new is True:
                vp_installer.create(rdma_enabled=sd_installer.rdma_enabled)
                vp_installer.configure_mds(config=parameters.get('mds_config_params', {}))
            else:
                vp_installer.update_status(status=VPool.STATUSES.EXTENDING)

            # ADDITIONAL VALIDATIONS
            # Check StorageRouter connectivity
            cls._logger.info('vPool {0}: Validating StorageRouter connectivity'.format(vp_installer.name))
            linked_storagerouters = [storagerouter]
            if vp_installer.is_new is False:
                linked_storagerouters += [sd.storagerouter for sd in vp_installer.vpool.storagedrivers]

            sr_client_map = SSHClient.get_clients(endpoints=linked_storagerouters, user_names=['ovs', 'root'])
            offline_nodes = sr_client_map.pop('offline')
            if storagerouter in offline_nodes:
                raise RuntimeError('Node on which the vPool is being {0} is not reachable'.format('created' if vp_installer.is_new is True else 'extended'))

            sr_installer = StorageRouterInstaller(root_client=sr_client_map[storagerouter]['root'],
                                                  sd_installer=sd_installer,
                                                  vp_installer=vp_installer,
                                                  storagerouter=storagerouter)

            # When 2 or more jobs simultaneously run on the same StorageRouter, we need to check and create the StorageDriver partitions in locked context
            partitions_mutex.acquire(wait=60)
            sr_installer.partition_info = StorageRouterController.get_partition_info(storagerouter_guid=storagerouter.guid)
            sr_installer.validate_vpool_extendable()
            sr_installer.validate_global_write_buffer(requested_size=parameters.get('writecache_size', 0))
            sr_installer.validate_local_cache_size(requested_proxies=parameters.get('parallelism', {}).get('proxies', 2))

            # MODEL STORAGEDRIVER AND PARTITION JUNCTIONS
            sd_installer.create()
            sd_installer.create_partitions()
            partitions_mutex.release()

            vp_installer.refresh_metadata()
        except Exception:
            cls._logger.exception('Something went wrong during the validation or modeling of vPool {0} on StorageRouter {1}'.format(vp_installer.name, storagerouter.name))
            partitions_mutex.release()
            vp_installer.revert_vpool(status=VPool.STATUSES.RUNNING)
            raise

        # Arakoon setup: poll the checkup once per second, for at most 300 attempts
        counter = 0
        while counter < 300:
            try:
                if StorageDriverController.manual_voldrv_arakoon_checkup() is True:
                    break
            except Exception:
                cls._logger.exception('Arakoon checkup for voldrv cluster failed')
                vp_installer.revert_vpool(status=VPool.STATUSES.RUNNING)
                raise
            counter += 1
            time.sleep(1)
            if counter == 300:
                vp_installer.revert_vpool(status=VPool.STATUSES.RUNNING)
                raise RuntimeError('Arakoon checkup for the StorageDriver cluster could not be started')

        # Cluster registry
        try:
            vp_installer.configure_cluster_registry(allow_raise=True)
        except Exception:
            if vp_installer.is_new is True:
                vp_installer.revert_vpool(status=VPool.STATUSES.RUNNING)
            else:
                vp_installer.revert_vpool(status=VPool.STATUSES.FAILURE)
            raise

        try:
            sd_installer.setup_proxy_configs()
            sd_installer.configure_storagedriver_service()
            DiskController.sync_with_reality(storagerouter.guid)
            MDSServiceController.prepare_mds_service(storagerouter=storagerouter, vpool=vp_installer.vpool)

            # Update the MDS safety if changed via API (vpool.configuration will be available at this point also for the newly added StorageDriver)
            vp_installer.vpool.invalidate_dynamics('configuration')
            if vp_installer.mds_safety is not None and vp_installer.vpool.configuration['mds_config']['mds_safety'] != vp_installer.mds_safety:
                Configuration.set(key='/ovs/vpools/{0}/mds_config|mds_safety'.format(vp_installer.vpool.guid),
                                  value=vp_installer.mds_safety)

            sd_installer.start_services()  # Create and start watcher volumedriver, DTL, proxies and StorageDriver services

            # Post creation/extension checkups
            mds_config_set = MDSServiceController.get_mds_storagedriver_config_set(vpool=vp_installer.vpool, offline_nodes=offline_nodes)
            for sr, clients in sr_client_map.iteritems():
                for current_storagedriver in [sd for sd in sr.storagedrivers if sd.vpool_guid == vp_installer.vpool.guid]:
                    storagedriver_config = StorageDriverConfiguration(vpool_guid=vp_installer.vpool.guid,
                                                                      storagedriver_id=current_storagedriver.storagedriver_id)
                    if storagedriver_config.config_missing is False:
                        # Filesystem section in StorageDriver configuration are all parameters used for vDisks created directly on the filesystem
                        # So when a vDisk gets created on the filesystem, these MDSes will be assigned to them
                        storagedriver_config.configure_filesystem(fs_metadata_backend_mds_nodes=mds_config_set[sr.guid])
                        storagedriver_config.save(client=clients['ovs'])

            # Everything's reconfigured, refresh new cluster configuration
            for current_storagedriver in vp_installer.vpool.storagedrivers:
                if current_storagedriver.storagerouter not in sr_client_map:
                    continue
                vp_installer.vpool.storagedriver_client.update_cluster_node_configs(str(current_storagedriver.storagedriver_id),
                                                                                    req_timeout_secs=10)
        except Exception:
            cls._logger.exception('vPool {0}: Creation failed'.format(vp_installer.name))
            vp_installer.update_status(status=VPool.STATUSES.FAILURE)
            raise

        # When a node is offline, we can run into errors, but also when 1 or more volumes are not running
        # Scheduled tasks below, so don't really care whether they succeed or not
        # Only Exception is caught so SystemExit and KeyboardInterrupt still propagate; failures are logged instead of silently dropped
        try:
            VDiskController.dtl_checkup(vpool_guid=vp_installer.vpool.guid,
                                        ensure_single_timeout=600)
        except Exception as ex:
            cls._logger.warning('vPool {0}: DTL checkup failed: {1}'.format(vp_installer.name, ex))
        for vdisk in vp_installer.vpool.vdisks:
            try:
                MDSServiceController.ensure_safety(vdisk_guid=vdisk.guid)
            except Exception as ex:
                cls._logger.warning('vPool {0}: Ensuring MDS safety for vDisk {1} failed: {2}'.format(vp_installer.name, vdisk.guid, ex))
        vp_installer.update_status(status=VPool.STATUSES.RUNNING)
        cls._logger.info('Add vPool {0} ended successfully'.format(vp_installer.name))
Exemple #11
0
    def shrink_vpool(cls,
                     storagedriver_guid,
                     offline_storage_router_guids=list()):
        """
        Removes a StorageDriver (if its the last StorageDriver for a vPool, the vPool is removed as well)
        :param storagedriver_guid: Guid of the StorageDriver to remove
        :type storagedriver_guid: str
        :param offline_storage_router_guids: Guids of StorageRouters which are offline and will be removed from cluster.
                                             WHETHER VPOOL WILL BE DELETED DEPENDS ON THIS
        :type offline_storage_router_guids: list
        :return: None
        :rtype: NoneType
        """
        # TODO: Add logging
        # TODO: Unit test individual pieces of code
        # Validations
        storagedriver = StorageDriver(storagedriver_guid)
        storagerouter = storagedriver.storagerouter
        cls._logger.info(
            'StorageDriver {0} - Deleting StorageDriver {1}'.format(
                storagedriver.guid, storagedriver.name))

        vp_installer = VPoolInstaller(name=storagedriver.vpool.name)
        vp_installer.validate(storagedriver=storagedriver)

        sd_installer = StorageDriverInstaller(vp_installer=vp_installer,
                                              storagedriver=storagedriver)

        cls._logger.info(
            'StorageDriver {0} - Checking availability of related StorageRouters'
            .format(storagedriver.guid, storagedriver.name))
        sr_client_map = SSHClient.get_clients(endpoints=[
            sd.storagerouter for sd in vp_installer.vpool.storagedrivers
        ],
                                              user_names=['root'])
        sr_installer = StorageRouterInstaller(root_client=sr_client_map.get(
            storagerouter, {}).get('root'),
                                              storagerouter=storagerouter,
                                              vp_installer=vp_installer,
                                              sd_installer=sd_installer)

        offline_srs = sr_client_map.pop('offline')
        if sorted([sr.guid for sr in offline_srs
                   ]) != sorted(offline_storage_router_guids):
            raise RuntimeError('Not all StorageRouters are reachable')

        if storagerouter not in offline_srs:
            mtpt_pids = sr_installer.root_client.run(
                "lsof -t +D '/mnt/{0}' || true".format(
                    vp_installer.name.replace(r"'", r"'\''")),
                allow_insecure=True).splitlines()
            if len(mtpt_pids) > 0:
                raise RuntimeError(
                    'vPool cannot be deleted. Following processes keep the vPool mount point occupied: {0}'
                    .format(', '.join(mtpt_pids)))

        # Retrieve reachable StorageDrivers
        reachable_storagedrivers = []
        for sd in vp_installer.vpool.storagedrivers:
            if sd.storagerouter not in sr_client_map:
                # StorageRouter is offline
                continue

            sd_key = '/ovs/vpools/{0}/hosts/{1}/config'.format(
                vp_installer.vpool.guid, sd.storagedriver_id)
            if Configuration.exists(sd_key) is True:
                path = Configuration.get_configuration_path(sd_key)
                with remote(sd.storagerouter.ip,
                            [LocalStorageRouterClient]) as rem:
                    try:
                        lsrc = rem.LocalStorageRouterClient(path)
                        lsrc.server_revision(
                        )  # 'Cheap' call to verify whether volumedriver is responsive
                        cls._logger.info(
                            'StorageDriver {0} - Responsive StorageDriver {1} on node with IP {2}'
                            .format(storagedriver.guid, sd.name,
                                    sd.storagerouter.ip))
                        reachable_storagedrivers.append(sd)
                    except Exception as exception:
                        if not is_connection_failure(exception):
                            raise

        if len(reachable_storagedrivers) == 0:
            raise RuntimeError(
                'Could not find any responsive node in the cluster')

        # Start removal
        if vp_installer.storagedriver_amount > 1:
            vp_installer.update_status(status=VPool.STATUSES.SHRINKING)
        else:
            vp_installer.update_status(status=VPool.STATUSES.DELETING)

        # Clean up stale vDisks
        cls._logger.info('StorageDriver {0} - Removing stale vDisks'.format(
            storagedriver.guid))
        VDiskController.remove_stale_vdisks(vpool=vp_installer.vpool)

        # Reconfigure the MDSes
        cls._logger.info('StorageDriver {0} - Reconfiguring MDSes'.format(
            storagedriver.guid))
        for vdisk_guid in storagerouter.vdisks_guids:
            try:
                MDSServiceController.ensure_safety(
                    vdisk_guid=vdisk_guid,
                    excluded_storagerouter_guids=[storagerouter.guid] +
                    offline_storage_router_guids)
            except Exception:
                cls._logger.exception(
                    'StorageDriver {0} - vDisk {1} - Ensuring MDS safety failed'
                    .format(storagedriver.guid, vdisk_guid))

        # Validate that all MDSes on current StorageRouter have been moved away
        # Ensure safety does not always throw an error, that's why we perform this check here instead of in the Exception clause of above code
        vdisks = []
        for mds in vp_installer.mds_services:
            for junction in mds.vdisks:
                vdisk = junction.vdisk
                if vdisk in vdisks:
                    continue
                vdisks.append(vdisk)
                cls._logger.critical(
                    'StorageDriver {0} - vDisk {1} {2} - MDS Services have not been migrated away'
                    .format(storagedriver.guid, vdisk.guid, vdisk.name))
        if len(vdisks) > 0:
            # Put back in RUNNING, so it can be used again. Errors keep on displaying in GUI now anyway
            vp_installer.update_status(status=VPool.STATUSES.RUNNING)
            raise RuntimeError(
                'Not all MDS Services have been successfully migrated away')

        # Start with actual removal
        errors_found = False
        if storagerouter not in offline_srs:
            errors_found &= sd_installer.stop_services()

        errors_found &= vp_installer.configure_cluster_registry(
            exclude=[storagedriver], apply_on=reachable_storagedrivers)
        errors_found &= vp_installer.update_node_distance_map()
        errors_found &= vp_installer.remove_mds_services()
        errors_found &= sd_installer.clean_config_management()
        errors_found &= sd_installer.clean_model()

        if storagerouter not in offline_srs:
            errors_found &= sd_installer.clean_directories(
                mountpoints=StorageRouterController.get_mountpoints(
                    client=sr_installer.root_client))

            try:
                DiskController.sync_with_reality(
                    storagerouter_guid=storagerouter.guid)
            except Exception:
                cls._logger.exception(
                    'StorageDriver {0} - Synchronizing disks with reality failed'
                    .format(storagedriver.guid))
                errors_found = True

        if vp_installer.storagedriver_amount > 1:
            # Update the vPool metadata and run DTL checkup
            vp_installer.vpool.metadata['caching_info'].pop(
                sr_installer.storagerouter.guid, None)
            vp_installer.vpool.save()

            try:
                VDiskController.dtl_checkup(vpool_guid=vp_installer.vpool.guid,
                                            ensure_single_timeout=600)
            except Exception:
                cls._logger.exception(
                    'StorageDriver {0} - DTL checkup failed for vPool {1} with guid {2}'
                    .format(storagedriver.guid, vp_installer.name,
                            vp_installer.vpool.guid))
        else:
            cls._logger.info(
                'StorageDriver {0} - Removing vPool from model'.format(
                    storagedriver.guid))
            # Clean up model
            try:
                vp_installer.vpool.delete()
            except Exception:
                errors_found = True
                cls._logger.exception(
                    'StorageDriver {0} - Cleaning up vPool from the model failed'
                    .format(storagedriver.guid))
            Configuration.delete('/ovs/vpools/{0}'.format(
                vp_installer.vpool.guid))

        cls._logger.info('StorageDriver {0} - Running MDS checkup'.format(
            storagedriver.guid))
        try:
            MDSServiceController.mds_checkup()
        except Exception:
            cls._logger.exception(
                'StorageDriver {0} - MDS checkup failed'.format(
                    storagedriver.guid))

        # Update vPool status
        if errors_found is True:
            if vp_installer.storagedriver_amount > 1:
                vp_installer.update_status(status=VPool.STATUSES.FAILURE)
            raise RuntimeError(
                '1 or more errors occurred while trying to remove the StorageDriver. Please check the logs for more information'
            )

        if vp_installer.storagedriver_amount > 1:
            vp_installer.update_status(status=VPool.STATUSES.RUNNING)
        cls._logger.info(
            'StorageDriver {0} - Deleted StorageDriver {1}'.format(
                storagedriver.guid, storagedriver.name))

        if len(VPoolList.get_vpools()) == 0:
            cluster_name = ArakoonInstaller.get_cluster_name('voldrv')
            if ArakoonInstaller.get_arakoon_metadata_by_cluster_name(
                    cluster_name=cluster_name)['internal'] is True:
                cls._logger.debug(
                    'StorageDriver {0} - Removing Arakoon cluster {1}'.format(
                        storagedriver.guid, cluster_name))
                try:
                    installer = ArakoonInstaller(cluster_name=cluster_name)
                    installer.load()
                    installer.delete_cluster()
                except Exception:
                    cls._logger.exception(
                        'StorageDriver {0} - Delete voldrv Arakoon cluster failed'
                        .format(storagedriver.guid))
                service_type = ServiceTypeList.get_by_name(
                    ServiceType.SERVICE_TYPES.ARAKOON)
                service_name = ArakoonInstaller.get_service_name_for_cluster(
                    cluster_name=cluster_name)
                for service in list(service_type.services):
                    if service.name == service_name:
                        service.delete()

        # Remove watcher volumedriver service if last StorageDriver on current StorageRouter
        if len(
                storagerouter.storagedrivers
        ) == 0 and storagerouter not in offline_srs:  # ensure client is initialized for StorageRouter
            try:
                if cls._service_manager.has_service(
                        ServiceFactory.SERVICE_WATCHER_VOLDRV,
                        client=sr_installer.root_client):
                    cls._service_manager.stop_service(
                        ServiceFactory.SERVICE_WATCHER_VOLDRV,
                        client=sr_installer.root_client)
                    cls._service_manager.remove_service(
                        ServiceFactory.SERVICE_WATCHER_VOLDRV,
                        client=sr_installer.root_client)
            except Exception:
                cls._logger.exception(
                    'StorageDriver {0} - {1} service deletion failed'.format(
                        storagedriver.guid,
                        ServiceFactory.SERVICE_WATCHER_VOLDRV))
    def configure_disk(storagerouter_guid, disk_guid, partition_guid, offset,
                       size, roles):
        """
        Prepares a partition on a disk of a StorageRouter and stores the requested roles on it
        Depending on what is found, a partition is created, a filesystem is made and the partition is mounted
        (and added to fstab) before the roles are saved in the model
        :param storagerouter_guid: Guid of the StorageRouter owning the disk
        :type storagerouter_guid: str
        :param disk_guid: Guid of the disk to configure
        :type disk_guid: str
        :param partition_guid: Guid of an existing partition on the disk, None to have a new partition created
        :type partition_guid: str
        :param offset: Offset for the partition
        :type offset: int
        :param size: Size of the partition
        :type size: int
        :param roles: Roles the partition should have afterwards
        :type roles: list
        :raises RuntimeError: When validation fails or when the model does not reflect the expected state after a step
        :raises ValueError: When the disk or the partition does not have any aliases
        :raises RoleDuplicationException: When a DB or DTL role is requested while another disk already holds it
        :return: None
        :rtype: NoneType
        """
        logger = StorageRouterController._logger

        # Validate requested roles, disk ownership and backend usage
        storagerouter = StorageRouter(storagerouter_guid)
        for requested_role in roles:
            if requested_role not in DiskPartition.ROLES or requested_role == DiskPartition.ROLES.BACKEND:
                raise RuntimeError(
                    'Invalid role specified: {0}'.format(requested_role))
        disk = Disk(disk_guid)
        if disk.storagerouter_guid != storagerouter_guid:
            raise RuntimeError(
                'The given Disk is not on the given StorageRouter')
        if any(DiskPartition.ROLES.BACKEND in existing.roles
               for existing in disk.partitions):
            raise RuntimeError('The given Disk is in use by a Backend')

        # DB and DTL may only live on a single disk, but must remain unassignable on the disk holding them
        exclusive_roles = [DiskPartition.ROLES.DB, DiskPartition.ROLES.DTL]
        if set(exclusive_roles).intersection(set(roles)):
            roles_on_sr = StorageRouterController._get_roles_on_storagerouter(
                storagerouter.ip)
            for exclusive_role in exclusive_roles:
                if exclusive_role not in roles_on_sr or exclusive_role not in roles:
                    continue
                if roles_on_sr[exclusive_role][0] != disk.name:
                    raise RoleDuplicationException(
                        'Disk {0} cannot have the {1} role due to presence on disk {2}'
                        .format(disk.name, exclusive_role,
                                roles_on_sr[exclusive_role][0]))

        if partition_guid is None:
            # No partition given: create one and look it up again in the refreshed model
            logger.debug(
                'Creating new partition - Offset: {0} bytes - Size: {1} bytes - Roles: {2}'
                .format(offset, size, roles))
            with remote(storagerouter.ip, [DiskTools], username='******') as rem:
                if len(disk.aliases) == 0:
                    raise ValueError(
                        'Disk {0} does not have any aliases'.format(disk.name))
                rem.DiskTools.create_partition(disk_alias=disk.aliases[0],
                                               disk_size=disk.size,
                                               partition_start=offset,
                                               partition_size=size)
            DiskController.sync_with_reality(storagerouter_guid)
            disk = Disk(disk_guid)
            end_point = offset + size
            # The first modelled partition overlapping the requested range is taken as the new partition
            partition = next(
                (candidate for candidate in disk.partitions
                 if offset < candidate.offset + candidate.size
                 and end_point > candidate.offset), None)
            if partition is None:
                raise RuntimeError(
                    'No new partition detected on disk {0} after having created 1'
                    .format(disk.name))
            logger.debug('Partition created')
        else:
            # Existing partition: make sure it is usable and that no role still in use gets dropped
            logger.debug('Using existing partition')
            partition = DiskPartition(partition_guid)
            if partition.disk_guid != disk_guid:
                raise RuntimeError(
                    'The given DiskPartition is not on the given Disk')
            if partition.filesystem in ('swap', 'linux_raid_member',
                                        'LVM2_member'):
                raise RuntimeError(
                    "It is not allowed to assign roles on partitions of type: ['swap', 'linux_raid_member', 'LVM2_member']"
                )
            metadata = StorageRouterController.get_metadata(storagerouter_guid)
            partition_info = metadata['partitions']
            removed_roles = set(partition.roles) - set(roles)
            used_roles = [
                removed_role for removed_role in removed_roles
                for usage in partition_info[removed_role]
                if usage['in_use'] and usage['guid'] == partition.guid
            ]
            if used_roles:
                raise RuntimeError(
                    'Roles in use cannot be removed. Used roles: {0}'.format(
                        ', '.join(used_roles)))

        # A freshly created partition, or one without filesystem, gets a filesystem
        if partition.filesystem is None or partition_guid is None:
            logger.debug('Creating filesystem')
            if len(partition.aliases) == 0:
                raise ValueError(
                    'Partition with offset {0} does not have any aliases'.
                    format(partition.offset))
            with remote(storagerouter.ip, [DiskTools], username='******') as rem:
                rem.DiskTools.make_fs(partition_alias=partition.aliases[0])
            DiskController.sync_with_reality(storagerouter_guid)
            partition = DiskPartition(partition.guid)
            if partition.filesystem not in ('ext4', 'xfs'):
                raise RuntimeError('Unexpected filesystem')
            logger.debug('Filesystem created')

        # An unmounted partition is mounted on the first free /mnt/<ssd|hdd><n> and added to fstab
        if partition.mountpoint is None:
            logger.debug('Configuring mount point')
            with remote(storagerouter.ip, [DiskTools], username='******') as rem:
                index = 1
                mountpoint = '/mnt/{0}{1}'.format(
                    'ssd' if disk.is_ssd else 'hdd', index)
                while rem.DiskTools.mountpoint_exists(mountpoint):
                    index += 1
                    mountpoint = '/mnt/{0}{1}'.format(
                        'ssd' if disk.is_ssd else 'hdd', index)
                logger.debug('Found mount point: {0}'.format(mountpoint))
                rem.DiskTools.add_fstab(partition_aliases=partition.aliases,
                                        mountpoint=mountpoint,
                                        filesystem=partition.filesystem)
                rem.DiskTools.mount(mountpoint)
            DiskController.sync_with_reality(storagerouter_guid)
            partition = DiskPartition(partition.guid)
            if partition.mountpoint != mountpoint:
                raise RuntimeError('Unexpected mount point')
            logger.debug('Mount point configured')

        # Finally store the requested roles
        partition.roles = roles
        partition.save()
        logger.debug('Partition configured')
Exemple #13
0
    def migrate(previous_version):
        """
        Migrates from a given version to the current version. It uses 'previous_version' to be smart
        wherever possible, but the code should be able to migrate any version towards the expected version.
        When this is not possible, the code can set a minimum version and raise when it is not met.

        Two mutually exclusive paths exist:
        * previous_version == 0: bootstrap the model (groups, admin user, OAuth 2 clients, roles, service types, brandings)
        * previous_version < OVSMigrator.THIS_VERSION: migrate unique constraints, disk/partition aliases,
          backend types, vPool metadata and remove the READ partition role
        :param previous_version: The previous version from which to start the migration
        :type previous_version: float
        :return: OVSMigrator.THIS_VERSION, regardless of which path was executed
        """

        working_version = previous_version

        if working_version == 0:
            # Initial version:
            # * Set the version to THIS RELEASE version

            from ovs.dal.hybrids.user import User
            from ovs.dal.hybrids.group import Group
            from ovs.dal.hybrids.role import Role
            from ovs.dal.hybrids.client import Client
            from ovs.dal.hybrids.j_rolegroup import RoleGroup
            from ovs.dal.hybrids.j_roleclient import RoleClient
            from ovs.dal.hybrids.servicetype import ServiceType
            from ovs.dal.hybrids.branding import Branding
            from ovs.dal.lists.backendtypelist import BackendTypeList

            # Create groups
            admin_group = Group()
            admin_group.name = 'administrators'
            admin_group.description = 'Administrators'
            admin_group.save()
            viewers_group = Group()
            viewers_group.name = 'viewers'
            viewers_group.description = 'Viewers'
            viewers_group.save()

            # Create users
            admin = User()
            admin.username = '******'
            admin.password = hashlib.sha256('admin').hexdigest()
            admin.is_active = True
            admin.group = admin_group
            admin.save()

            # Create internal OAuth 2 clients
            admin_pw_client = Client()
            admin_pw_client.ovs_type = 'INTERNAL'
            admin_pw_client.grant_type = 'PASSWORD'
            admin_pw_client.user = admin
            admin_pw_client.save()
            admin_cc_client = Client()
            admin_cc_client.ovs_type = 'INTERNAL'
            admin_cc_client.grant_type = 'CLIENT_CREDENTIALS'
            # The client secret is a credential: draw it from the OS entropy source instead of the
            # default (predictable) Mersenne Twister generator
            secure_random = random.SystemRandom()
            secret_alphabet = string.ascii_letters + string.digits + '|_=+*#@!/-[]{}<>.?,\'";:~'
            admin_cc_client.client_secret = ''.join(secure_random.choice(secret_alphabet)
                                                    for _ in range(128))
            admin_cc_client.user = admin
            admin_cc_client.save()

            # Create roles
            read_role = Role()
            read_role.code = 'read'
            read_role.name = 'Read'
            read_role.description = 'Can read objects'
            read_role.save()
            write_role = Role()
            write_role.code = 'write'
            write_role.name = 'Write'
            write_role.description = 'Can write objects'
            write_role.save()
            manage_role = Role()
            manage_role.code = 'manage'
            manage_role.name = 'Manage'
            manage_role.description = 'Can manage the system'
            manage_role.save()

            # Attach groups to roles
            mapping = [
                (admin_group, [read_role, write_role, manage_role]),
                (viewers_group, [read_role])
            ]
            for group, group_roles in mapping:
                for role in group_roles:
                    rolegroup = RoleGroup()
                    rolegroup.group = group
                    rolegroup.role = role
                    rolegroup.save()
                # Every client of every user in the group gets the roles of the group
                for user in group.users:
                    for role in group_roles:
                        for user_client in user.clients:
                            roleclient = RoleClient()
                            roleclient.client = user_client
                            roleclient.role = role
                            roleclient.save()

            # Add service types
            for service_type_info in [ServiceType.SERVICE_TYPES.MD_SERVER, ServiceType.SERVICE_TYPES.ALBA_PROXY, ServiceType.SERVICE_TYPES.ARAKOON]:
                service_type = ServiceType()
                service_type.name = service_type_info
                service_type.save()

            # Branding
            branding = Branding()
            branding.name = 'Default'
            branding.description = 'Default bootstrap theme'
            branding.css = 'bootstrap-default.min.css'
            branding.productname = 'Open vStorage'
            branding.is_default = True
            branding.save()
            slate = Branding()
            slate.name = 'Slate'
            slate.description = 'Dark bootstrap theme'
            slate.css = 'bootstrap-slate.min.css'
            slate.productname = 'Open vStorage'
            slate.is_default = False
            slate.save()

        # From here on, all actual migration should happen to get to the expected state for THIS RELEASE
        elif working_version < OVSMigrator.THIS_VERSION:
            # Migrate unique constraints
            from ovs.dal.helpers import HybridRunner, Descriptor
            from ovs.extensions.storage.persistentfactory import PersistentFactory
            persistent_client = PersistentFactory.get_client()
            hybrid_structure = HybridRunner.get_hybrids()
            for class_descriptor in hybrid_structure.values():
                hybrid_class = Descriptor().load(class_descriptor).get_object()
                classname = hybrid_class.__name__.lower()
                unique_key = 'ovs_unique_{0}_{{0}}_'.format(classname)
                # Only unique properties without any unique key stored yet need to be migrated
                uniques = []
                # noinspection PyProtectedMember
                for prop in hybrid_class._properties:
                    if prop.unique is True and len(list(persistent_client.prefix(unique_key.format(prop.name)))) == 0:
                        uniques.append(prop.name)
                if len(uniques) > 0:
                    prefix = 'ovs_data_{0}_'.format(classname)
                    for key in persistent_client.prefix(prefix):
                        data = persistent_client.get(key)
                        for property_name in uniques:
                            ukey = '{0}{1}'.format(unique_key.format(property_name), hashlib.sha1(str(data[property_name])).hexdigest())
                            persistent_client.set(ukey, key)

            # Complete rework of the way we detect devices to assign roles or use as ASD
            # Allow loop-, raid-, nvme-, ??-devices and logical volumes as ASD (https://github.com/openvstorage/framework/issues/792)
            from ovs.dal.lists.storagerouterlist import StorageRouterList
            from ovs.extensions.generic.sshclient import SSHClient
            from ovs.lib.disk import DiskController

            for storagerouter in StorageRouterList.get_storagerouters():
                # A connection failure is deliberately not handled here: it propagates and aborts the migration
                ssh_client = SSHClient(storagerouter, username='******')

                # Retrieve all symlinks for all devices
                # Example of name_alias_mapping:
                # {'/dev/md0': ['/dev/disk/by-id/md-uuid-ad2de634:26d97253:5eda0a23:96986b76', '/dev/disk/by-id/md-name-OVS-1:0'],
                #  '/dev/sda': ['/dev/disk/by-path/pci-0000:03:00.0-sas-0x5000c295fe2ff771-lun-0'],
                #  '/dev/sda1': ['/dev/disk/by-uuid/e3e0bc62-4edc-4c6b-a6ce-1f39e8f27e41', '/dev/disk/by-path/pci-0000:03:00.0-sas-0x5000c295fe2ff771-lun-0-part1']}
                name_alias_mapping = {}
                alias_name_mapping = {}
                for path_type in ssh_client.dir_list(directory='/dev/disk'):
                    if path_type in ['by-uuid', 'by-partuuid']:  # UUIDs can change after creating a filesystem on a partition
                        continue
                    directory = '/dev/disk/{0}'.format(path_type)
                    for symlink in ssh_client.dir_list(directory=directory):
                        symlink_path = '{0}/{1}'.format(directory, symlink)
                        link = ssh_client.file_read_link(symlink_path)
                        name_alias_mapping.setdefault(link, []).append(symlink_path)
                        alias_name_mapping[symlink_path] = link

                # Only disks and partitions without aliases yet are updated
                for disk in storagerouter.disks:
                    if disk.aliases is None:
                        device_path = '/dev/{0}'.format(disk.name)
                        disk.aliases = name_alias_mapping.get(device_path, [device_path])
                        disk.save()
                    for partition in disk.partitions:
                        if partition.aliases is None:
                            # The old 'path' property is only reachable through the raw data of the object
                            # noinspection PyProtectedMember
                            partition_device = alias_name_mapping.get(partition._data.get('path'))
                            if partition_device is None:
                                partition.aliases = []
                            else:
                                partition.aliases = name_alias_mapping.get(partition_device, [])
                            partition.save()

                DiskController.sync_with_reality(storagerouter_guid=storagerouter.guid)

            # Only support ALBA backend type
            from ovs.dal.lists.backendtypelist import BackendTypeList
            for backend_type in BackendTypeList.get_backend_types():
                if backend_type.code != 'alba':
                    backend_type.delete()

            # Reformat the vpool.metadata information
            from ovs.dal.lists.vpoollist import VPoolList
            for vpool in VPoolList.get_vpools():
                new_metadata = {}
                # Every key not starting with 'backend' is treated as a StorageRouter guid
                storagerouter_guids = [key for key in vpool.metadata.keys() if not key.startswith('backend')]
                for metadata_key, value in vpool.metadata.items():
                    new_info = {}
                    if isinstance(value, dict):
                        read_cache = value.get('backend_info', {}).get('fragment_cache_on_read', True)
                        write_cache = value.get('backend_info', {}).get('fragment_cache_on_write', False)
                        new_info['backend_info'] = {'alba_backend_guid': value.get('backend_guid'),
                                                    'backend_guid': None,
                                                    'frag_size': value.get('backend_info', {}).get('frag_size'),
                                                    'name': value.get('name'),
                                                    'policies': value.get('backend_info', {}).get('policies'),
                                                    'preset': value.get('preset'),
                                                    'sco_size': value.get('backend_info', {}).get('sco_size'),
                                                    'total_size': value.get('backend_info', {}).get('total_size')}
                        new_info['arakoon_config'] = value.get('arakoon_config')
                        new_info['connection_info'] = {'host': value.get('connection', {}).get('host', ''),
                                                       'port': value.get('connection', {}).get('port', ''),
                                                       'local': value.get('connection', {}).get('local', ''),
                                                       'client_id': value.get('connection', {}).get('client_id', ''),
                                                       'client_secret': value.get('connection', {}).get('client_secret', '')}
                        if metadata_key == 'backend':
                            new_info['caching_info'] = dict((sr_guid, {'fragment_cache_on_read': read_cache, 'fragment_cache_on_write': write_cache}) for sr_guid in storagerouter_guids)
                    if metadata_key in storagerouter_guids:
                        metadata_key = 'backend_aa_{0}'.format(metadata_key)
                    new_metadata[metadata_key] = new_info
                vpool.metadata = new_metadata
                vpool.save()

            # Removal of READ role
            from ovs.dal.lists.diskpartitionlist import DiskPartitionList
            for partition in DiskPartitionList.get_partitions():
                if 'READ' in partition.roles:
                    partition.roles.remove('READ')
                    partition.save()

        return OVSMigrator.THIS_VERSION
Exemple #14
0
    def remove_osd(node_guid, osd_id, expected_safety):
        """
        Removes an OSD: purges it from its ALBA backend, deletes it on the node, cleans up its
        configuration and removes it from the model
        :param node_guid: Guid of the node to remove an OSD from
        :type node_guid: str
        :param osd_id: ID of the OSD to remove
        :type osd_id: str
        :param expected_safety: Expected safety after having removed the OSD (must hold the keys 'critical' and 'lost').
                                None skips the safety check
        :type expected_safety: dict or None
        :raises RuntimeError: When the calculated safety is non-zero and differs from the expected safety,
                              or when the node reports a failure while deleting the OSD
        :return: List holding the slot ID in which the removed OSD resided
        :rtype: list
        """
        # Retrieve corresponding OSD in model
        node = AlbaNode(node_guid)
        AlbaNodeController._logger.debug('Removing OSD {0} at node {1}'.format(
            osd_id, node.ip))
        osd = AlbaOSDList.get_by_osd_id(osd_id)
        alba_backend = osd.alba_backend
        # Remember the slot now: the model object gets deleted further down and should not be read afterwards
        slot_id = osd.slot_id

        if expected_safety is None:
            AlbaNodeController._logger.warning(
                'Skipping safety check for OSD {0} on backend {1} - this is dangerous'
                .format(osd_id, alba_backend.guid))
        else:
            final_safety = AlbaController.calculate_safety(
                alba_backend_guid=alba_backend.guid, removal_osd_ids=[osd_id])
            safety_lost = final_safety['lost']
            safety_crit = final_safety['critical']
            # Only refuse when data would become critical/lost AND the caller did not anticipate exactly that
            if (safety_crit != 0 or safety_lost != 0) and (
                    safety_crit != expected_safety['critical']
                    or safety_lost != expected_safety['lost']):
                raise RuntimeError(
                    'Cannot remove OSD {0} as the current safety is not as expected ({1} vs {2})'
                    .format(osd_id, final_safety, expected_safety))
            AlbaNodeController._logger.debug(
                'Safety OK for OSD {0} on backend {1}'.format(
                    osd_id, alba_backend.guid))
        AlbaNodeController._logger.debug(
            'Purging OSD {0} on backend {1}'.format(osd_id, alba_backend.guid))
        AlbaController.remove_units(alba_backend_guid=alba_backend.guid,
                                    osd_ids=[osd_id])

        # Delete the OSD
        result = node.client.delete_osd(slot_id=slot_id, osd_id=osd_id)
        if result['_success'] is False:
            raise RuntimeError('Error removing OSD: {0}'.format(
                result['_error']))

        # Clean configuration management and model - Well, just try it at least
        if Configuration.exists(ASD_CONFIG.format(osd_id), raw=True):
            Configuration.delete(ASD_CONFIG_DIR.format(osd_id), raw=True)

        osd.delete()
        node.invalidate_dynamics()
        # NOTE(review): alba_backend was already dereferenced above, so this guard cannot be False here
        if alba_backend is not None:
            alba_backend.invalidate_dynamics()
            alba_backend.backend.invalidate_dynamics()
        if node.storagerouter is not None:
            try:
                DiskController.sync_with_reality(
                    storagerouter_guid=node.storagerouter_guid)
            except UnableToConnectException:
                # Best effort: the model sync is skipped when the StorageRouter cannot be reached
                AlbaNodeController._logger.warning(
                    'Skipping disk sync since StorageRouter {0} is offline'.
                    format(node.storagerouter.name))

        return [slot_id]
Exemple #15
0
    def migrate():
        """
        Executes async migrations. It doesn't matter too much when they are executed, as long as they get eventually
        executed. This code will typically contain:
        * "dangerous" migration code (it needs certain running services)
        * Migration code depending on a cluster-wide state
        * ...

        The first step (IP:port on OSDs) is evaluated on every run and only touches OSDs whose 'ips' or 'port'
        is still None. Every other step is guarded by its own '/ovs/framework/migration|<name>' configuration
        flag, which is only set once the step completed; a failing step is logged and does not prevent the
        following steps from running.

        :return: None
        :rtype: NoneType
        """
        AlbaMigrationController._logger.info(
            'Preparing out of band migrations...')

        from ovs.dal.hybrids.diskpartition import DiskPartition
        from ovs.dal.lists.albabackendlist import AlbaBackendList
        from ovs.dal.lists.albanodelist import AlbaNodeList
        from ovs.dal.lists.albaosdlist import AlbaOSDList
        from ovs.dal.lists.storagerouterlist import StorageRouterList
        from ovs.extensions.generic.configuration import Configuration
        from ovs.extensions.generic.sshclient import SSHClient, UnableToConnectException
        from ovs.extensions.migration.migration.albamigrator import ExtensionMigrator
        from ovs.extensions.packages.albapackagefactory import PackageFactory
        from ovs.extensions.services.albaservicefactory import ServiceFactory
        from ovs.extensions.plugins.albacli import AlbaCLI, AlbaError
        from ovs.lib.alba import AlbaController
        from ovs.lib.disk import DiskController

        AlbaMigrationController._logger.info('Start out of band migrations...')

        #############################################
        # Introduction of IP:port combination on OSDs
        osd_info_map = {}  # Maps the ALBA 'long_id' of an OSD to its reported IPs and port
        alba_backends = AlbaBackendList.get_albabackends()
        for alba_backend in alba_backends:
            AlbaMigrationController._logger.info(
                'Verifying ALBA Backend {0}'.format(alba_backend.name))
            if alba_backend.abm_cluster is None:
                AlbaMigrationController._logger.warning(
                    'ALBA Backend {0} does not have an ABM cluster registered'.
                    format(alba_backend.name))
                continue

            AlbaMigrationController._logger.debug(
                'Retrieving configuration path for ALBA Backend {0}'.format(
                    alba_backend.name))
            try:
                config = Configuration.get_configuration_path(
                    alba_backend.abm_cluster.config_location)
            except Exception:
                # Deliberately not a bare 'except': SystemExit / KeyboardInterrupt must still propagate
                AlbaMigrationController._logger.exception(
                    'Failed to retrieve the configuration path for ALBA Backend {0}'
                    .format(alba_backend.name))
                continue

            AlbaMigrationController._logger.info(
                'Retrieving OSD information for ALBA Backend {0}'.format(
                    alba_backend.name))
            try:
                all_osd_info = AlbaCLI.run(command='list-all-osds',
                                           config=config)
            except (AlbaError, RuntimeError):
                AlbaMigrationController._logger.exception(
                    'Failed to retrieve OSD information for ALBA Backend {0}'.
                    format(alba_backend.name))
                continue

            for osd_info in all_osd_info:
                # Entries without a (non-empty) 'long_id' cannot be matched against the model
                if osd_info.get('long_id'):
                    osd_info_map[osd_info['long_id']] = {
                        'ips': osd_info.get('ips', []),
                        'port': osd_info.get('port')
                    }

        for osd in AlbaOSDList.get_albaosds():
            if osd.osd_id not in osd_info_map:
                AlbaMigrationController._logger.warning(
                    'OSD with ID {0} is modelled but could not be found through ALBA'
                    .format(osd.osd_id))
                continue

            ips = osd_info_map[osd.osd_id]['ips']
            port = osd_info_map[osd.osd_id]['port']
            changes = False
            # Only fill in values which have not been modelled yet
            if osd.ips is None:
                changes = True
                osd.ips = ips
            if osd.port is None:
                changes = True
                osd.port = port
            if changes is True:
                AlbaMigrationController._logger.info(
                    'Updating OSD with ID {0} with IPS {1} and port {2}'.
                    format(osd.osd_id, ips, port))
                osd.save()

        ###################################################
        # Read preference for GLOBAL ALBA Backends (1.10.3)  (https://github.com/openvstorage/framework-alba-plugin/issues/452)
        if Configuration.get(key='/ovs/framework/migration|read_preference',
                             default=False) is False:
            try:
                name_backend_map = {backend.name: backend
                                    for backend in alba_backends}
                for alba_node in AlbaNodeList.get_albanodes():
                    AlbaMigrationController._logger.info(
                        'Processing maintenance services running on ALBA Node {0} with ID {1}'
                        .format(alba_node.ip, alba_node.node_id))
                    alba_node.invalidate_dynamics('maintenance_services')
                    for alba_backend_name, services in alba_node.maintenance_services.iteritems():
                        if alba_backend_name not in name_backend_map:
                            AlbaMigrationController._logger.error(
                                'ALBA Node {0} has services for an ALBA Backend {1} which is not modelled'
                                .format(alba_node.ip, alba_backend_name))
                            continue

                        alba_backend = name_backend_map[alba_backend_name]
                        AlbaMigrationController._logger.info(
                            'Processing {0} ALBA Backend {1} with GUID {2}'.
                            format(alba_backend.scaling, alba_backend.name,
                                   alba_backend.guid))
                        if alba_backend.scaling == alba_backend.SCALINGS.LOCAL:
                            read_preferences = [alba_node.node_id]
                        else:
                            read_preferences = AlbaController.get_read_preferences_for_global_backend(
                                alba_backend=alba_backend,
                                alba_node_id=alba_node.node_id,
                                read_preferences=[])

                        # The old key is shared by all services of the backend, the new key is per service
                        old_config_key = '/ovs/alba/backends/{0}/maintenance/config'.format(
                            alba_backend.guid)
                        for service_name, _ in services:
                            AlbaMigrationController._logger.info(
                                'Processing service {0}'.format(service_name))
                            new_config_key = '/ovs/alba/backends/{0}/maintenance/{1}/config'.format(
                                alba_backend.guid, service_name)
                            if Configuration.exists(key=old_config_key):
                                new_config = Configuration.get(
                                    key=old_config_key)
                                new_config[
                                    'read_preference'] = read_preferences
                                Configuration.set(key=new_config_key,
                                                  value=new_config)
                # Old shared keys are only removed after every node has been processed
                for alba_backend in alba_backends:
                    Configuration.delete(
                        key='/ovs/alba/backends/{0}/maintenance/config'.format(
                            alba_backend.guid))
                AlbaController.checkup_maintenance_agents.delay()

                Configuration.set(
                    key='/ovs/framework/migration|read_preference', value=True)
            except Exception:
                AlbaMigrationController._logger.exception(
                    'Updating read preferences for ALBA Backends failed')

        #######################################################
        # Storing actual package name in version files (1.11.0) (https://github.com/openvstorage/framework/issues/1876)
        changed_clients = set()  # Clients on which a service got regenerated and thus need a daemon-reload
        storagerouters = StorageRouterList.get_storagerouters()
        if Configuration.get(
                key=
                '/ovs/framework/migration|actual_package_name_in_version_file_alba',
                default=False) is False:
            try:
                service_manager = ServiceFactory.get_manager()
                alba_pkg_name, alba_version_cmd = PackageFactory.get_package_and_version_cmd_for(
                    component=PackageFactory.COMP_ALBA)
                for storagerouter in storagerouters:
                    try:
                        root_client = SSHClient(
                            endpoint=storagerouter.ip, username='******'
                        )  # Use '.ip' instead of StorageRouter object because this code is executed during post-update at which point the heartbeat has not been updated for some time
                    except UnableToConnectException:
                        AlbaMigrationController._logger.exception(
                            'Updating actual package name for version files failed on StorageRouter {0}'
                            .format(storagerouter.ip))
                        continue

                    for file_name in root_client.file_list(
                            directory=ServiceFactory.RUN_FILE_DIR):
                        if not file_name.endswith('.version'):
                            continue
                        file_path = '{0}/{1}'.format(
                            ServiceFactory.RUN_FILE_DIR, file_name)
                        contents = root_client.file_read(filename=file_path)
                        if alba_pkg_name == PackageFactory.PKG_ALBA_EE and '{0}='.format(
                                PackageFactory.PKG_ALBA) in contents:
                            # Rewrite the version file in the RUN_FILE_DIR
                            contents = contents.replace(
                                PackageFactory.PKG_ALBA,
                                PackageFactory.PKG_ALBA_EE)
                            root_client.file_write(filename=file_path,
                                                   contents=contents)

                            # Regenerate the service and update the EXTRA_VERSION_CMD in the configuration management
                            service_name = file_name.split('.')[0]
                            service_config_key = ServiceFactory.SERVICE_CONFIG_KEY.format(
                                storagerouter.machine_id, service_name)
                            if Configuration.exists(key=service_config_key):
                                service_config = Configuration.get(
                                    key=service_config_key)
                                if 'EXTRA_VERSION_CMD' in service_config:
                                    service_config[
                                        'EXTRA_VERSION_CMD'] = '{0}=`{1}`'.format(
                                            alba_pkg_name, alba_version_cmd)
                                    Configuration.set(key=service_config_key,
                                                      value=service_config)
                                    service_manager.regenerate_service(
                                        name='ovs-arakoon',
                                        client=root_client,
                                        target_name='ovs-{0}'.format(
                                            service_name)
                                    )  # Leave out .version
                                    changed_clients.add(root_client)
                Configuration.set(
                    key=
                    '/ovs/framework/migration|actual_package_name_in_version_file_alba',
                    value=True)
            except Exception:
                AlbaMigrationController._logger.exception(
                    'Updating actual package name for version files failed')

        # Runs outside the guarded block so a reload still happens when the step above failed half-way
        for root_client in changed_clients:
            try:
                root_client.run(['systemctl', 'daemon-reload'])
            except Exception:
                AlbaMigrationController._logger.exception(
                    'Executing command "systemctl daemon-reload" failed')

        ####################################
        # Fix for migration version (1.11.0)
        # Previous code could potentially store a higher version number in the config management than the actual version number
        if Configuration.get(
                key='/ovs/framework/migration|alba_migration_version_fix',
                default=False) is False:
            try:
                for storagerouter in storagerouters:
                    config_key = '/ovs/framework/hosts/{0}/versions'.format(
                        storagerouter.machine_id)
                    if Configuration.exists(key=config_key):
                        versions = Configuration.get(key=config_key)
                        # Cap the stored version to the version of the currently installed migrator
                        if versions.get(PackageFactory.COMP_MIGRATION_ALBA,
                                        0) > ExtensionMigrator.THIS_VERSION:
                            versions[
                                PackageFactory.
                                COMP_MIGRATION_ALBA] = ExtensionMigrator.THIS_VERSION
                            Configuration.set(key=config_key, value=versions)
                Configuration.set(
                    key='/ovs/framework/migration|alba_migration_version_fix',
                    value=True)
            except Exception:
                AlbaMigrationController._logger.exception(
                    'Updating migration version failed')

        ####################################
        # Enable auto-cleanup
        migration_auto_cleanup_key = '/ovs/framework/migration|alba_auto_cleanup'
        if Configuration.get(key=migration_auto_cleanup_key,
                             default=False) is False:
            try:
                for storagerouter in StorageRouterList.get_storagerouters():
                    storagerouter.invalidate_dynamics(
                        'features')  # New feature was added
                errors = []
                for alba_backend in AlbaBackendList.get_albabackends():
                    try:
                        AlbaController.set_auto_cleanup(alba_backend.guid)
                    except Exception as ex:
                        AlbaMigrationController._logger.exception(
                            'Failed to set the auto-cleanup for ALBA Backend {0}'
                            .format(alba_backend.name))
                        errors.append(ex)
                # Only flag the step as done when every ALBA Backend was handled successfully
                if not errors:
                    Configuration.set(key=migration_auto_cleanup_key,
                                      value=True)
            except Exception:
                AlbaMigrationController._logger.exception(
                    'Updating auto cleanup failed')

        ####################################
        # Change cache eviction
        migration_random_eviction_key = '/ovs/framework/migration|alba_cache_eviction_random'
        if Configuration.get(key=migration_random_eviction_key,
                             default=False) is False:
            try:
                errors = []
                for alba_backend in AlbaBackendList.get_albabackends():
                    try:
                        AlbaController.set_cache_eviction(alba_backend.guid)
                    except Exception as ex:
                        AlbaMigrationController._logger.exception(
                            'Failed to set the cache eviction for ALBA Backend {0}'
                            .format(alba_backend.name))
                        errors.append(ex)
                # Only flag the step as done when every ALBA Backend was handled successfully
                if not errors:
                    Configuration.set(key=migration_random_eviction_key,
                                      value=True)
            except Exception:
                AlbaMigrationController._logger.exception(
                    'Updating cache eviction failed')

        ###################################################
        # Sync all disks and apply the backend role. Backend role was removed with the AD (since 1.10)
        albanode_backend_role_sync_key = '/ovs/framework/migration|albanode_backend_role_sync'
        if not Configuration.get(key=albanode_backend_role_sync_key,
                                 default=False):
            try:
                errors = []
                for alba_node in AlbaNodeList.get_albanodes():
                    try:
                        # ALBA Nodes without a StorageRouter have no modelled disks to update
                        if not alba_node.storagerouter:
                            continue
                        stack = alba_node.client.get_stack()  # type: dict
                        for slot_id, slot_information in stack.iteritems():
                            osds = slot_information.get('osds',
                                                        {})  # type: dict
                            slot_aliases = slot_information.get(
                                'aliases', [])  # type: list
                            if not osds:  # No osds means no partition was made
                                continue
                            # Sync to add all potential partitions that will need a backend role
                            DiskController.sync_with_reality(
                                storagerouter_guid=alba_node.storagerouter_guid
                            )
                            for disk in alba_node.storagerouter.disks:
                                if set(disk.aliases).intersection(
                                        set(slot_aliases)):
                                    partition = disk.partitions[0]
                                    if DiskPartition.ROLES.BACKEND not in partition.roles:
                                        partition.roles.append(
                                            DiskPartition.ROLES.BACKEND)
                                        partition.save()
                    except Exception as ex:
                        AlbaMigrationController._logger.exception(
                            'Syncing for storagerouter/albanode {0} failed'.
                            format(alba_node.storagerouter.ip))
                        errors.append(ex)
                if not errors:
                    Configuration.set(key=albanode_backend_role_sync_key,
                                      value=True)
            except Exception:
                AlbaMigrationController._logger.exception(
                    'Syncing up the disks for backend roles failed')

        AlbaMigrationController._logger.info('Finished out of band migrations')
    def remove_asd(node_guid, asd_id, expected_safety):
        """
        Removes an ASD

        When the ASD can be located on the node, the safety which would result from the removal is validated
        BEFORE anything is purged from the ALBA Backend, so an unexpected safety aborts the removal without
        side effects. When the ASD cannot be located on the node (e.g. node unreachable), the ASD is purged
        from the ALBA Backend and the model without safety validation.

        :param node_guid: Guid of the node to remove a disk from
        :type node_guid: str

        :param asd_id: ASD to remove
        :type asd_id: str

        :param expected_safety: Expected safety after having removed the disk
        :type expected_safety: dict

        :raises RuntimeError: If the ASD is not modelled, if the calculated safety is neither zero nor as
                              expected, or if the node reports a failure while deleting the ASD

        :return: ID of the disk hosting the ASD as reported by the node, None if the node did not report the ASD
        :rtype: str or NoneType (presumably; it is a key of the node client's 'get_asds' result)
        """
        node = AlbaNode(node_guid)
        AlbaNodeController._logger.debug('Removing ASD {0} at node {1}'.format(asd_id, node.ip))

        # Locate the ASD in the model
        model_asd = next((asd for disk in node.disks for asd in disk.asds if asd.asd_id == asd_id), None)
        if model_asd is None:
            raise RuntimeError('Could not locate asd {0} in the model'.format(asd_id))
        alba_backend = model_asd.alba_backend

        # Locate the ASD on the node itself; an unreachable node results in 'disk_id' being None
        asds = {}
        try:
            asds = node.client.get_asds()
        except (requests.ConnectionError, requests.Timeout):
            AlbaNodeController._logger.warning('Could not connect to node {0} to validate asd'.format(node.guid))
        disk_id = next((_disk_id for _disk_id in asds if asd_id in asds[_disk_id]), None)

        if disk_id is not None:
            # Validate the safety before purging, the purge cannot be undone when the safety turns out unexpected
            final_safety = AlbaController.calculate_safety(alba_backend.guid, [asd_id])
            safety_lost = final_safety['lost']
            safety_crit = final_safety['critical']
            if (safety_crit != 0 or safety_lost != 0) and (safety_crit != expected_safety['critical'] or safety_lost != expected_safety['lost']):
                raise RuntimeError('Cannot remove ASD {0} as the current safety is not as expected ({1} vs {2})'.format(asd_id, final_safety, expected_safety))
        else:
            AlbaNodeController._logger.warning('Alba decommission osd {0} without safety validations (node down)'.format(asd_id))

        AlbaController.remove_units(alba_backend.guid, [asd_id], absorb_exception=True)

        if disk_id is not None:
            result = node.client.delete_asd(disk_id, asd_id)
            if result['_success'] is False:
                raise RuntimeError('Error removing ASD: {0}'.format(result['_error']))

        # Clean configuration management and model
        if EtcdConfiguration.exists(AlbaNodeController.ASD_CONFIG.format(asd_id), raw=True):
            EtcdConfiguration.delete(AlbaNodeController.ASD_CONFIG_DIR.format(asd_id), raw=True)

        model_asd.delete()
        alba_backend.invalidate_dynamics()
        alba_backend.backend.invalidate_dynamics()
        if node.storagerouter is not None:
            DiskController.sync_with_reality(node.storagerouter_guid)

        return disk_id