Ejemplo n.º 1
0
    def add_vpool(cls, parameters):
        """
        Add a vPool to the machine this task is running on
        :param parameters: Parameters for vPool creation
        :type parameters: dict
        :return: None
        :rtype: NoneType
        """
        # TODO: Add logging
        cls._logger.debug('Adding vpool. Parameters: {}'.format(parameters))
        # VALIDATIONS
        if not isinstance(parameters, dict):
            raise ValueError('Parameters passed to create a vPool should be of type dict')

        # Check StorageRouter existence
        storagerouter = StorageRouterList.get_by_ip(ip=parameters.get('storagerouter_ip'))
        if storagerouter is None:
            raise RuntimeError('Could not find StorageRouter')

        # Validate requested vPool configurations
        vp_installer = VPoolInstaller(name=parameters.get('vpool_name'))
        vp_installer.validate(storagerouter=storagerouter)

        # Validate requested StorageDriver configurations
        cls._logger.info('vPool {0}: Validating StorageDriver configurations'.format(vp_installer.name))
        sd_installer = StorageDriverInstaller(vp_installer=vp_installer,
                                              configurations={'storage_ip': parameters.get('storage_ip'),
                                                              'caching_info': parameters.get('caching_info'),
                                                              'backend_info': {'main': parameters.get('backend_info'),
                                                                               StorageDriverConfiguration.CACHE_BLOCK: parameters.get('backend_info_bc'),
                                                                               StorageDriverConfiguration.CACHE_FRAGMENT: parameters.get('backend_info_fc')},
                                                              'connection_info': {'main': parameters.get('connection_info'),
                                                                                  StorageDriverConfiguration.CACHE_BLOCK: parameters.get('connection_info_bc'),
                                                                                  StorageDriverConfiguration.CACHE_FRAGMENT: parameters.get('connection_info_fc')},
                                                              'sd_configuration': parameters.get('config_params')})

        partitions_mutex = volatile_mutex('add_vpool_partitions_{0}'.format(storagerouter.guid))
        try:
            # VPOOL CREATION
            # Create the vPool as soon as possible in the process to be displayed in the GUI (INSTALLING/EXTENDING state)
            if vp_installer.is_new is True:
                vp_installer.create(rdma_enabled=sd_installer.rdma_enabled)
                vp_installer.configure_mds(config=parameters.get('mds_config_params', {}))
            else:
                vp_installer.update_status(status=VPool.STATUSES.EXTENDING)

            # ADDITIONAL VALIDATIONS
            # Check StorageRouter connectivity
            cls._logger.info('vPool {0}: Validating StorageRouter connectivity'.format(vp_installer.name))
            linked_storagerouters = [storagerouter]
            if vp_installer.is_new is False:
                linked_storagerouters += [sd.storagerouter for sd in vp_installer.vpool.storagedrivers]

            sr_client_map = SSHClient.get_clients(endpoints=linked_storagerouters, user_names=['ovs', 'root'])
            offline_nodes = sr_client_map.pop('offline')
            if storagerouter in offline_nodes:
                raise RuntimeError('Node on which the vPool is being {0} is not reachable'.format('created' if vp_installer.is_new is True else 'extended'))

            sr_installer = StorageRouterInstaller(root_client=sr_client_map[storagerouter]['root'],
                                                  sd_installer=sd_installer,
                                                  vp_installer=vp_installer,
                                                  storagerouter=storagerouter)

            # When 2 or more jobs simultaneously run on the same StorageRouter, we need to check and create the StorageDriver partitions in locked context
            partitions_mutex.acquire(wait=60)
            sr_installer.partition_info = StorageRouterController.get_partition_info(storagerouter_guid=storagerouter.guid)
            sr_installer.validate_vpool_extendable()
            sr_installer.validate_global_write_buffer(requested_size=parameters.get('writecache_size', 0))
            sr_installer.validate_local_cache_size(requested_proxies=parameters.get('parallelism', {}).get('proxies', 2))

            # MODEL STORAGEDRIVER AND PARTITION JUNCTIONS
            sd_installer.create()
            sd_installer.create_partitions()
            partitions_mutex.release()

            vp_installer.refresh_metadata()
        except Exception:
            cls._logger.exception('Something went wrong during the validation or modeling of vPool {0} on StorageRouter {1}'.format(vp_installer.name, storagerouter.name))
            partitions_mutex.release()
            vp_installer.revert_vpool(status=VPool.STATUSES.RUNNING)
            raise

        # Arakoon setup
        counter = 0
        while counter < 300:
            try:
                if StorageDriverController.manual_voldrv_arakoon_checkup() is True:
                    break
            except Exception:
                cls._logger.exception('Arakoon checkup for voldrv cluster failed')
                vp_installer.revert_vpool(status=VPool.STATUSES.RUNNING)
                raise
            counter += 1
            time.sleep(1)
            if counter == 300:
                vp_installer.revert_vpool(status=VPool.STATUSES.RUNNING)
                raise RuntimeError('Arakoon checkup for the StorageDriver cluster could not be started')

        # Cluster registry
        try:
            vp_installer.configure_cluster_registry(allow_raise=True)
        except Exception:
            if vp_installer.is_new is True:
                vp_installer.revert_vpool(status=VPool.STATUSES.RUNNING)
            else:
                vp_installer.revert_vpool(status=VPool.STATUSES.FAILURE)
            raise

        try:
            sd_installer.setup_proxy_configs()
            sd_installer.configure_storagedriver_service()
            DiskController.sync_with_reality(storagerouter.guid)
            MDSServiceController.prepare_mds_service(storagerouter=storagerouter, vpool=vp_installer.vpool)

            # Update the MDS safety if changed via API (vpool.configuration will be available at this point also for the newly added StorageDriver)
            vp_installer.vpool.invalidate_dynamics('configuration')
            if vp_installer.mds_safety is not None and vp_installer.vpool.configuration['mds_config']['mds_safety'] != vp_installer.mds_safety:
                Configuration.set(key='/ovs/vpools/{0}/mds_config|mds_safety'.format(vp_installer.vpool.guid), value=vp_installer.mds_safety)

            sd_installer.start_services()  # Create and start watcher volumedriver, DTL, proxies and StorageDriver services

            # Post creation/extension checkups
            mds_config_set = MDSServiceController.get_mds_storagedriver_config_set(vpool=vp_installer.vpool, offline_nodes=offline_nodes)
            for sr, clients in sr_client_map.iteritems():
                for current_storagedriver in [sd for sd in sr.storagedrivers if sd.vpool_guid == vp_installer.vpool.guid]:
                    storagedriver_config = StorageDriverConfiguration(vpool_guid=vp_installer.vpool.guid, storagedriver_id=current_storagedriver.storagedriver_id)
                    if storagedriver_config.config_missing is False:
                        # Filesystem section in StorageDriver configuration are all parameters used for vDisks created directly on the filesystem
                        # So when a vDisk gets created on the filesystem, these MDSes will be assigned to them
                        storagedriver_config.configure_filesystem(fs_metadata_backend_mds_nodes=mds_config_set[sr.guid])
                        storagedriver_config.save(client=clients['ovs'])

            # Everything's reconfigured, refresh new cluster configuration
            for current_storagedriver in vp_installer.vpool.storagedrivers:
                if current_storagedriver.storagerouter not in sr_client_map:
                    continue
                vp_installer.vpool.storagedriver_client.update_cluster_node_configs(str(current_storagedriver.storagedriver_id), req_timeout_secs=10)
        except Exception:
            cls._logger.exception('vPool {0}: Creation failed'.format(vp_installer.name))
            vp_installer.update_status(status=VPool.STATUSES.FAILURE)
            raise

        # When a node is offline, we can run into errors, but also when 1 or more volumes are not running
        # Scheduled tasks below, so don't really care whether they succeed or not
        try:
            VDiskController.dtl_checkup(vpool_guid=vp_installer.vpool.guid, ensure_single_timeout=600)
        except:
            pass
        try:
            MDSServiceController.mds_checkup_single(vp_installer.vpool.guid)
        except:
            pass
        vp_installer.update_status(status=VPool.STATUSES.RUNNING)
        cls._logger.info('Add vPool {0} ended successfully'.format(vp_installer.name))
Ejemplo n.º 2
0
    def shrink_vpool(cls, storagedriver_guid, offline_storage_router_guids=list()):
        """
        Removes a StorageDriver (if its the last StorageDriver for a vPool, the vPool is removed as well)
        :param storagedriver_guid: Guid of the StorageDriver to remove
        :type storagedriver_guid: str
        :param offline_storage_router_guids: Guids of StorageRouters which are offline and will be removed from cluster.
                                             WHETHER VPOOL WILL BE DELETED DEPENDS ON THIS
        :type offline_storage_router_guids: list
        :return: None
        :rtype: NoneType
        """
        # TODO: Add logging
        # TODO: Unit test individual pieces of code
        # Validations
        storagedriver = StorageDriver(storagedriver_guid)
        storagerouter = storagedriver.storagerouter
        cls._logger.info('StorageDriver {0} - Deleting StorageDriver {1}'.format(storagedriver.guid, storagedriver.name))

        vp_installer = VPoolInstaller(name=storagedriver.vpool.name)
        vp_installer.validate(storagedriver=storagedriver)

        sd_installer = StorageDriverInstaller(vp_installer=vp_installer,
                                              storagedriver=storagedriver)

        cls._logger.info('StorageDriver {0} - Checking availability of related StorageRouters'.format(storagedriver.guid, storagedriver.name))
        sr_client_map = SSHClient.get_clients(endpoints=[sd.storagerouter for sd in vp_installer.vpool.storagedrivers], user_names=['root'])
        sr_installer = StorageRouterInstaller(root_client=sr_client_map.get(storagerouter, {}).get('root'),
                                              storagerouter=storagerouter,
                                              vp_installer=vp_installer,
                                              sd_installer=sd_installer)

        offline_srs = sr_client_map.pop('offline')
        if sorted([sr.guid for sr in offline_srs]) != sorted(offline_storage_router_guids):
            raise RuntimeError('Not all StorageRouters are reachable')

        if storagerouter not in offline_srs:
            mtpt_pids = sr_installer.root_client.run("lsof -t +D '/mnt/{0}' || true".format(vp_installer.name.replace(r"'", r"'\''")), allow_insecure=True).splitlines()
            if len(mtpt_pids) > 0:
                raise RuntimeError('vPool cannot be deleted. Following processes keep the vPool mount point occupied: {0}'.format(', '.join(mtpt_pids)))

        # Retrieve reachable StorageDrivers
        reachable_storagedrivers = []
        for sd in vp_installer.vpool.storagedrivers:
            if sd.storagerouter not in sr_client_map:
                # StorageRouter is offline
                continue

            sd_key = '/ovs/vpools/{0}/hosts/{1}/config'.format(vp_installer.vpool.guid, sd.storagedriver_id)
            if Configuration.exists(sd_key) is True:
                path = Configuration.get_configuration_path(sd_key)
                with remote(sd.storagerouter.ip, [LocalStorageRouterClient]) as rem:
                    try:
                        lsrc = rem.LocalStorageRouterClient(path)
                        lsrc.server_revision()  # 'Cheap' call to verify whether volumedriver is responsive
                        cls._logger.info('StorageDriver {0} - Responsive StorageDriver {1} on node with IP {2}'.format(storagedriver.guid, sd.name, sd.storagerouter.ip))
                        reachable_storagedrivers.append(sd)
                    except Exception as exception:
                        if not is_connection_failure(exception):
                            raise

        if len(reachable_storagedrivers) == 0:
            raise RuntimeError('Could not find any responsive node in the cluster')

        # Start removal
        if vp_installer.storagedriver_amount > 1:
            vp_installer.update_status(status=VPool.STATUSES.SHRINKING)
        else:
            vp_installer.update_status(status=VPool.STATUSES.DELETING)

        # Clean up stale vDisks
        cls._logger.info('StorageDriver {0} - Removing stale vDisks'.format(storagedriver.guid))
        VDiskController.remove_stale_vdisks(vpool=vp_installer.vpool)

        # Reconfigure the MDSes
        cls._logger.info('StorageDriver {0} - Reconfiguring MDSes'.format(storagedriver.guid))
        for vdisk_guid in storagerouter.vdisks_guids:
            try:
                MDSServiceController.ensure_safety(vdisk_guid=vdisk_guid, excluded_storagerouter_guids=[storagerouter.guid] + offline_storage_router_guids)
            except Exception:
                cls._logger.exception('StorageDriver {0} - vDisk {1} - Ensuring MDS safety failed'.format(storagedriver.guid, vdisk_guid))

        # Validate that all MDSes on current StorageRouter have been moved away
        # Ensure safety does not always throw an error, that's why we perform this check here instead of in the Exception clause of above code
        vdisks = []
        for mds in vp_installer.mds_services:
            for junction in mds.vdisks:
                vdisk = junction.vdisk
                if vdisk in vdisks:
                    continue
                vdisks.append(vdisk)
                cls._logger.critical('StorageDriver {0} - vDisk {1} {2} - MDS Services have not been migrated away'.format(storagedriver.guid, vdisk.guid, vdisk.name))
        if len(vdisks) > 0:
            # Put back in RUNNING, so it can be used again. Errors keep on displaying in GUI now anyway
            vp_installer.update_status(status=VPool.STATUSES.RUNNING)
            raise RuntimeError('Not all MDS Services have been successfully migrated away')

        # Start with actual removal
        errors_found = False
        if storagerouter not in offline_srs:
            errors_found &= sd_installer.stop_services()

        errors_found &= vp_installer.configure_cluster_registry(exclude=[storagedriver], apply_on=reachable_storagedrivers)
        errors_found &= vp_installer.update_node_distance_map()
        errors_found &= vp_installer.remove_mds_services()
        errors_found &= sd_installer.clean_config_management()
        errors_found &= sd_installer.clean_model()

        if storagerouter not in offline_srs:
            errors_found &= sd_installer.clean_directories(mountpoints=StorageRouterController.get_mountpoints(client=sr_installer.root_client))

            try:
                DiskController.sync_with_reality(storagerouter_guid=storagerouter.guid)
            except Exception:
                cls._logger.exception('StorageDriver {0} - Synchronizing disks with reality failed'.format(storagedriver.guid))
                errors_found = True

        if vp_installer.storagedriver_amount > 1:
            # Update the vPool metadata and run DTL checkup
            vp_installer.vpool.metadata['caching_info'].pop(sr_installer.storagerouter.guid, None)
            vp_installer.vpool.save()

            try:
                VDiskController.dtl_checkup(vpool_guid=vp_installer.vpool.guid, ensure_single_timeout=600)
            except Exception:
                cls._logger.exception('StorageDriver {0} - DTL checkup failed for vPool {1} with guid {2}'.format(storagedriver.guid, vp_installer.name, vp_installer.vpool.guid))
            StorageRouterController._logger.info('Remove StorageDriver - Guid {0} - Running MDS checkup'.format(storagedriver.guid))
            try:
                MDSServiceController.mds_checkup_single(vp_installer.vpool.guid)
            except Exception:
                StorageRouterController._logger.exception('Remove StorageDriver - Guid {0} - MDS checkup failed'.format(storagedriver.guid))

        else:
            cls._logger.info('StorageDriver {0} - Removing vPool from model'.format(storagedriver.guid))
            # Clean up model
            try:
                vp_installer.vpool.delete()
            except Exception:
                errors_found = True
                cls._logger.exception('StorageDriver {0} - Cleaning up vPool from the model failed'.format(storagedriver.guid))
            Configuration.delete('/ovs/vpools/{0}'.format(vp_installer.vpool.guid))

        # Update vPool status
        if errors_found is True:
            if vp_installer.storagedriver_amount > 1:
                vp_installer.update_status(status=VPool.STATUSES.FAILURE)
            raise RuntimeError('1 or more errors occurred while trying to remove the StorageDriver. Please check the logs for more information')

        if vp_installer.storagedriver_amount > 1:
            vp_installer.update_status(status=VPool.STATUSES.RUNNING)
        cls._logger.info('StorageDriver {0} - Deleted StorageDriver {1}'.format(storagedriver.guid, storagedriver.name))

        if len(VPoolList.get_vpools()) == 0:
            cluster_name = ArakoonInstaller.get_cluster_name('voldrv')
            if ArakoonInstaller.get_arakoon_metadata_by_cluster_name(cluster_name=cluster_name)['internal'] is True:
                cls._logger.debug('StorageDriver {0} - Removing Arakoon cluster {1}'.format(storagedriver.guid, cluster_name))
                try:
                    installer = ArakoonInstaller(cluster_name=cluster_name)
                    installer.load()
                    installer.delete_cluster()
                except Exception:
                    cls._logger.exception('StorageDriver {0} - Delete voldrv Arakoon cluster failed'.format(storagedriver.guid))
                service_type = ServiceTypeList.get_by_name(ServiceType.SERVICE_TYPES.ARAKOON)
                service_name = ArakoonInstaller.get_service_name_for_cluster(cluster_name=cluster_name)
                for service in list(service_type.services):
                    if service.name == service_name:
                        service.delete()

        # Remove watcher volumedriver service if last StorageDriver on current StorageRouter
        if len(storagerouter.storagedrivers) == 0 and storagerouter not in offline_srs:  # ensure client is initialized for StorageRouter
            try:
                if cls._service_manager.has_service(ServiceFactory.SERVICE_WATCHER_VOLDRV, client=sr_installer.root_client):
                    cls._service_manager.stop_service(ServiceFactory.SERVICE_WATCHER_VOLDRV, client=sr_installer.root_client)
                    cls._service_manager.remove_service(ServiceFactory.SERVICE_WATCHER_VOLDRV, client=sr_installer.root_client)
            except Exception:
                cls._logger.exception('StorageDriver {0} - {1} service deletion failed'.format(storagedriver.guid, ServiceFactory.SERVICE_WATCHER_VOLDRV))