def update_preset(alba_backend_guid, name, policies):
        """
        Updates the policies of an existing preset on the ALBA backend
        :param alba_backend_guid: Guid of the ALBA backend
        :type alba_backend_guid: str
        :param name: Name of the preset
        :type name: str
        :param policies: New policy list to be sent to alba
        :type policies: list
        :return: None
        """
        # VALIDATIONS
        AlbaPresetController._validate_policies_param(policies=policies)

        alba_backend = AlbaBackend(alba_backend_guid)
        if name not in [preset['name'] for preset in alba_backend.presets]:
            raise RuntimeError('Could not find a preset with name {0} for ALBA Backend {1}'.format(name, alba_backend.name))

        # UPDATE PRESET
        AlbaPresetController._logger.debug('Updating preset {0} with policies {1}'.format(name, policies))
        config = Configuration.get_configuration_path(ArakoonInstaller.CONFIG_KEY.format(AlbaController.get_abm_cluster_name(alba_backend=alba_backend)))
        temp_config_file = tempfile.mktemp()
        with open(temp_config_file, 'wb') as data_file:
            data_file.write(json.dumps({'policies': policies}))
            data_file.flush()
        AlbaCLI.run(command='update-preset', config=config, named_params={'input-url': temp_config_file}, extra_params=[name])
        alba_backend.invalidate_dynamics()
        os.remove(temp_config_file)
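
# Illustrative sketch (not part of the original snippet): shape of the 'policies' argument.
# Each ALBA policy is assumed to be a list/tuple of four integers (k, m, c, x); update_preset()
# simply serialises the whole list into a JSON document before handing it to 'alba update-preset'.
# The values below are made up.
import json

example_policies = [[5, 4, 8, 3], [2, 2, 3, 4]]
print(json.dumps({'policies': example_policies}))  # {"policies": [[5, 4, 8, 3], [2, 2, 3, 4]]}
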
    def get_load(nsm_cluster):
        """
        Calculates the load of an NSM node, returning a float percentage
        :param nsm_cluster: NSM cluster to retrieve the load for
        :type nsm_cluster: ovs.dal.hybrids.albansmcluster.NSMCluster
        :return: Load of the NSM service
        :rtype: float
        """
        service_capacity = float(nsm_cluster.capacity)
        if service_capacity < 0:
            return 50.0
        if service_capacity == 0:
            return float('inf')

        config = Configuration.get_configuration_path(
            key=nsm_cluster.alba_backend.abm_cluster.config_location)
        hosts_data = AlbaCLI.run(command='list-nsm-hosts', config=config)
        try:
            host = [
                host for host in hosts_data if host['id'] == nsm_cluster.name
            ][0]
        except IndexError:
            raise ValueError(
                'No host data could be retrieved from Alba for NSM cluster {0}'
                .format(nsm_cluster.name))
        usage = host['namespaces_count']
        return round(usage / service_capacity * 100.0, 5)
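
# Illustrative sketch (not part of the original snippet): the load arithmetic in isolation.
# With a capacity of 50 namespaces and 10 namespaces currently hosted, the NSM load is
# 10 / 50.0 * 100 = 20.0 percent; a negative capacity is treated as "unlimited" (fixed 50.0)
# and a capacity of 0 yields an infinite load. The values below are made up.
service_capacity = 50.0
usage = 10
print(round(usage / service_capacity * 100.0, 5))  # 20.0
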
    def update_preset(alba_backend_guid, name, policies):
        """
        Updates the policies of an existing preset on the ALBA backend
        :param alba_backend_guid: Guid of the ALBA backend
        :type alba_backend_guid: str
        :param name: Name of preset
        :type name: str
        :param policies: New policy list to be sent to alba
        :type policies: list
        :return: None
        """
        # VALIDATIONS
        AlbaPresetController._validate_policies_param(policies=policies)

        alba_backend = AlbaBackend(alba_backend_guid)
        if name not in [preset['name'] for preset in alba_backend.presets]:
            raise RuntimeError(
                'Could not find a preset with name {0} for ALBA Backend {1}'.
                format(name, alba_backend.name))

        # UPDATE PRESET
        AlbaPresetController._logger.debug(
            'Updating preset {0} with policies {1}'.format(name, policies))
        config = Configuration.get_configuration_path(
            alba_backend.abm_cluster.config_location)
        temp_config_file = tempfile.mktemp()
        with open(temp_config_file, 'wb') as data_file:
            data_file.write(json.dumps({'policies': policies}))
            data_file.flush()
        AlbaCLI.run(command='update-preset',
                    config=config,
                    named_params={'input-url': temp_config_file},
                    extra_params=[name])
        alba_backend.invalidate_dynamics()
        os.remove(temp_config_file)
    def check_for_halted_volumes(result_handler):
        """
        Checks for halted volumes on a single or multiple vPools
        :param result_handler: logging object
        :type result_handler: ovs.extensions.healthcheck.result.HCResults
        :return: None
        :rtype: NoneType
        """
        result_handler.info('Checking for halted volumes.', add_to_result=False)
        vpools = VPoolHelper.get_vpools()

        if len(vpools) == 0:
            result_handler.skip('No vPools found!')
            return

        for vp in vpools:
            if vp.guid not in VolumedriverHealthCheck.LOCAL_SR.vpools_guids:
                result_handler.skip('Skipping vPool {0} because it is not living here.'.format(vp.name))
                continue

            haltedvolumes = []
            result_handler.info('Checking vPool {0}: '.format(vp.name), add_to_result=False)
            if len(vp.storagedrivers) > 0:
                config_file = Configuration.get_configuration_path('/ovs/vpools/{0}/hosts/{1}/config'.format(vp.guid, vp.storagedrivers[0].name))
            else:
                result_handler.failure('The vPool {0} does not have any StorageDrivers associated with it!'.format(vp.name))
                continue

            try:
                voldrv_client = src.LocalStorageRouterClient(config_file)
                # noinspection PyArgumentList
                voldrv_volume_list = voldrv_client.list_volumes()
                for volume in voldrv_volume_list:
                    # check if volume is halted, returns: 0 or 1
                    try:
                        # noinspection PyTypeChecker
                        if int(VolumedriverHealthCheck._info_volume(voldrv_client, volume).halted):
                            haltedvolumes.append(volume)
                    except ObjectNotFoundException:
                        # ignore invalid ovsdb entries
                        # model consistency will handle it.
                        continue
                    except MaxRedirectsExceededException:
                        # this means the volume is not halted but detached or unreachable for the volumedriver
                        haltedvolumes.append(volume)
                    except RuntimeError:
                        haltedvolumes.append(volume)
                    except TimeoutError:
                        # timeout occurred
                        haltedvolumes.append(volume)
                result_handler.success('Volumedriver {0} is up and running.'.format(vp.name))
            except (ClusterNotReachableException, RuntimeError) as ex:
                result_handler.failure('Seems like the Volumedriver {0} is not running: {1}'.format(vp.name, ex.message))
                continue

            # print all results
            if len(haltedvolumes) > 0:
                result_handler.failure('Detected volumes that are HALTED in vPool {0}: {1}'.format(vp.name, ', '.join(haltedvolumes)))
            else:
                result_handler.success('No halted volumes detected in vPool {0}'.format(vp.name))
    def _presets(self):
        """
        Returns the presets configured for this backend, including their policies and availability/usage metadata
        """
        if len(self.abm_services) == 0:
            return []  # No ABM services yet, so backend not fully installed yet

        asds = {}
        if self.scaling != AlbaBackend.SCALINGS.GLOBAL:
            for node in AlbaNodeList.get_albanodes():
                asds[node.node_id] = 0
                for disk in self.local_stack[node.node_id].values():
                    for asd_info in disk['asds'].values():
                        if asd_info['status'] in ['claimed', 'warning']:
                            asds[node.node_id] += 1
        config = Configuration.get_configuration_path('/ovs/arakoon/{0}-abm/config'.format(self.name))
        presets = AlbaCLI.run(command='list-presets', config=config)
        preset_dict = {}
        for preset in presets:
            preset_dict[preset['name']] = preset
            if 'in_use' not in preset:
                preset['in_use'] = True
            if 'is_default' not in preset:
                preset['is_default'] = False
            preset['is_available'] = False
            preset['policies'] = [tuple(policy) for policy in preset['policies']]
            preset['policy_metadata'] = {}
            active_policy = None
            for policy in preset['policies']:
                is_available = False
                available_disks = 0
                if self.scaling != AlbaBackend.SCALINGS.GLOBAL:
                    available_disks += sum(min(asds[node], policy[3]) for node in asds)
                if self.scaling != AlbaBackend.SCALINGS.LOCAL:
                    available_disks += sum(self.local_summary['devices'].values())
                if available_disks >= policy[2]:
                    if active_policy is None:
                        active_policy = policy
                    is_available = True
                preset['policy_metadata'][policy] = {'is_active': False, 'in_use': False, 'is_available': is_available}
                preset['is_available'] |= is_available
            if active_policy is not None:
                preset['policy_metadata'][active_policy]['is_active'] = True
        for namespace in self.ns_data:
            if namespace['namespace']['state'] != 'active':
                continue
            policy_usage = namespace['statistics']['bucket_count']
            preset = preset_dict[namespace['namespace']['preset_name']]
            for usage in policy_usage:
                upolicy = tuple(usage[0])  # Policy as reported to be "in use"
                for cpolicy in preset['policies']:  # All configured policies
                    if upolicy[0] == cpolicy[0] and upolicy[1] == cpolicy[1] and upolicy[3] <= cpolicy[3]:
                        preset['policy_metadata'][cpolicy]['in_use'] = True
                        break
        for preset in presets:
            preset['policies'] = [str(policy) for policy in preset['policies']]
            for key in preset['policy_metadata'].keys():
                preset['policy_metadata'][str(key)] = preset['policy_metadata'][key]
                del preset['policy_metadata'][key]
        return presets
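
# Illustrative sketch (not part of the original snippet): how policy availability is derived above.
# Assuming a policy tuple (k, m, c, x) where policy[2] is the minimum number of fragments that must
# be written and policy[3] caps the fragments per node, a policy is marked available once the
# claimed ASDs can host at least policy[2] fragments. The values below are made up.
asds = {'node-1': 4, 'node-2': 2}  # claimed ASDs per ALBA node
policy = (5, 4, 8, 3)
available_disks = sum(min(count, policy[3]) for count in asds.values())
print(available_disks >= policy[2])  # min(4, 3) + min(2, 3) = 5 < 8 -> False
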
    def _osd_statistics(self):
        """
        Loads statistics from all of its ASDs in one call
        """
        from ovs.dal.hybrids.albaosd import AlbaOSD

        statistics = {}
        if self.abm_cluster is None:
            return statistics  # No ABM cluster yet, so backend not fully installed yet

        osd_ids = [
            osd.osd_id for osd in self.osds
            if osd.osd_type in [AlbaOSD.OSD_TYPES.ASD, AlbaOSD.OSD_TYPES.AD]
        ]
        if len(osd_ids) == 0:
            return statistics
        try:
            config = Configuration.get_configuration_path(
                self.abm_cluster.config_location)
            # TODO: This will need to be changed to osd-multistatistics, see openvstorage/alba#749
            raw_statistics = AlbaCLI.run(
                command='asd-multistatistics',
                config=config,
                named_params={'long-id': ','.join(osd_ids)})
        except RuntimeError:
            return statistics
        if raw_statistics:
            for osd_id, stats in raw_statistics.iteritems():
                if stats['success'] is True:
                    statistics[osd_id] = stats['result']
        return statistics
    def __init__(self, vpool_guid, storagedriver_id):
        """
        Initializes the class
        """
        _log_level = LOG_LEVEL_MAPPING[OVSLogger(
            'extensions').getEffectiveLevel()]
        # noinspection PyCallByClass,PyTypeChecker
        storagerouterclient.Logger.setupLogging(
            OVSLogger.load_path('storagerouterclient'), _log_level)
        # noinspection PyArgumentList
        storagerouterclient.Logger.enableLogging()

        self._key = '/ovs/vpools/{0}/hosts/{1}/config'.format(
            vpool_guid, storagedriver_id)
        self._logger = OVSLogger('extensions')
        self._dirty_entries = []

        self.remote_path = Configuration.get_configuration_path(
            self._key).strip('/')
        # Load configuration
        if Configuration.exists(self._key):
            self.configuration = Configuration.get(self._key)
            self.config_missing = False
        else:
            self.configuration = {}
            self.config_missing = True
            self._logger.debug(
                'Could not find config {0}, a new one will be created'.format(
                    self._key))
    def delete_preset(alba_backend_guid, name):
        """
        Deletes a preset from the Alba backend
        :param alba_backend_guid: Guid of the ALBA backend
        :type alba_backend_guid: str
        :param name: Name of the preset
        :type name: str
        :return: None
        """
        # VALIDATIONS
        alba_backend = AlbaBackend(alba_backend_guid)
        preset_default_map = dict((preset['name'], preset['is_default'])
                                  for preset in alba_backend.presets)
        if name not in preset_default_map:
            AlbaPresetController._logger.warning(
                'Preset with name {0} for ALBA Backend {1} could not be found, so not deleting'
                .format(name, alba_backend.name))
            return

        if preset_default_map[name] is True:
            raise RuntimeError('Cannot delete the default preset')

        # DELETE PRESET
        AlbaPresetController._logger.debug('Deleting preset {0}'.format(name))
        config = Configuration.get_configuration_path(
            alba_backend.abm_cluster.config_location)
        AlbaCLI.run(command='delete-preset',
                    config=config,
                    extra_params=[name])
        alba_backend.invalidate_dynamics()
    def get_abm_config(alba_backend):
        """
        Retrieve the configuration string to pass to the ALBA CLI
        :param alba_backend: ALBA backend
        :return: Configuration string
        """
        service_name = alba_backend.abm_services[0].service.name
        return ['--config', Configuration.get_configuration_path('/ovs/arakoon/{0}/config'.format(service_name.replace('arakoon-', '')))]
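
# Illustrative sketch (not part of the original snippet): consuming get_abm_config().
# The helper returns a ready-made ['--config', <configuration URL>] pair, so a caller can extend an
# ALBA command line with it. Both the URL and the invocation below are hypothetical; in this codebase
# such calls normally go through AlbaCLI.run rather than building the command line by hand.
abm_config = ['--config', 'arakoon://config/ovs/arakoon/mybackend-abm/config']
cmd = ['alba', 'list-namespaces'] + abm_config
print(cmd)
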
    def _ns_data(self):
        """
        Loads namespace data
        """
        if len(self.abm_services) == 0:
            return []  # No ABM services yet, so backend not fully installed yet

        config = Configuration.get_configuration_path('/ovs/arakoon/{0}-abm/config'.format(self.name))
        return AlbaCLI.run(command='show-namespaces', config=config, named_params={'max': -1})[1]
    def get_stats_nsms(cls):
        """
        Retrieve the amount of NSMs deployed and their statistics
        """
        if cls._config is None:
            cls.validate_and_retrieve_config()

        stats = []
        errors = False
        environment = cls._config['environment']
        for alba_backend in AlbaBackendList.get_albabackends():
            for nsm in alba_backend.nsm_clusters:
                stats.append({
                    'tags': {
                        'nsm_number': nsm.number,
                        'environment': environment,
                        'backend_name': alba_backend.name,
                        'abm_service_name': alba_backend.abm_cluster.name
                    },
                    'fields': {
                        'load': float(AlbaArakoonController.get_load(nsm))
                    },
                    'measurement': 'nsm'
                })

            config_path = Configuration.get_configuration_path(
                alba_backend.abm_cluster.config_location)
            try:
                nsm_host_ids = [
                    nsm_host['id']
                    for nsm_host in AlbaCLI.run(command='list-nsm-hosts',
                                                config=config_path)
                ]
                nsm_hosts_statistics = AlbaCLI.run(
                    command='nsm-hosts-statistics',
                    config=config_path,
                    named_params={'nsm-hosts': ','.join(nsm_host_ids)})
                for nsm_host_id, statistics in nsm_hosts_statistics.iteritems():
                    stats.append({
                        'tags': {
                            'nsm_name': nsm_host_id,
                            'environment': environment,
                            'backend_name': alba_backend.name
                        },
                        'fields': cls._convert_to_float_values(statistics['statistics']),
                        'measurement': 'nsm_statistic'
                    })
            except Exception:
                errors = True
                cls._logger.exception(
                    'Retrieving NSM statistics for ALBA Backend {0} failed'.
                    format(alba_backend.name))
        return errors, stats
    def check_backends(result_handler):
        """
        Checks Alba as a whole
        :param result_handler: logging object
        :type result_handler: ovs.extensions.healthcheck.result.HCResults
        :return: None
        :rtype: NoneType
        """
        result_handler.info('Checking available ALBA backends.', add_to_result=False)
        try:
            alba_backends = AlbaHealthCheck._get_all_responding_backends(result_handler)
            if len(alba_backends) == 0:
                return result_handler.skip('No backends found.')

            result_handler.success('We found {0} backend(s)!'.format(len(alba_backends)))

            result_handler.info('Checking the ALBA ASDs.', add_to_result=False)
            for backend in alba_backends:
                backend_name = backend['name']
                # Check disks of backend, ignore global backends
                if backend['type'] != 'LOCAL':
                    result_handler.skip('Alba backend {0} is a global backend.'.format(backend_name), add_to_result=False)
                    continue

                config = Configuration.get_configuration_path('/ovs/arakoon/{0}-abm/config'.format(backend_name))
                try:
                    result_disks = AlbaHealthCheck._check_backend_asds(result_handler, backend['disks'], backend_name, config)
                except Exception:
                    result_handler.warning('Could not fetch the asd information for alba backend {0}'.format(backend_name))
                    continue
                working_disks = result_disks['working']
                defective_disks = result_disks['broken']
                # Check if backend is available for vPool use
                if backend['is_available_for_vpool']:
                    if len(defective_disks) == 0:
                        result_handler.success('Alba backend {0} should be available for VPool use. All asds are working fine!'.format(backend_name))
                    else:
                        result_handler.warning('Alba backend {0} should be available for VPool use with {1} asds, but there are {2} defective asds: {3}'
                                               .format(backend_name, len(working_disks), len(defective_disks), ', '.join(defective_disks)),
                                               code=ErrorCodes.osd_defective)
                else:
                    if len(working_disks) == 0 and len(defective_disks) == 0:
                        result_handler.skip('Alba backend {0} is not available for vPool use, there are no asds assigned to this backend!'.format(backend_name))
                    else:
                        result_handler.failure('Alba backend {0} is not available for vPool use, preset requirements not satisfied! There are {1} working asds AND {2} '
                                               'defective asds!'.format(backend_name, len(working_disks), len(defective_disks)),
                                               code=ErrorCodes.osd_defective_unsatisfiable)
        except NotFoundException as ex:
            result_handler.failure('Failed to fetch the object with exception: {0}'.format(ex),
                                   code=ErrorCodes.configuration_not_found)
        except ConnectionFailedException as ex:
            result_handler.failure('Failed to connect to configuration master with exception: {0}'.format(ex),
                                   code=ErrorCodes.arakoon_connection_failure)
        except (ArakoonNotFound, ArakoonNoMaster, ArakoonNoMasterResult) as e:
            result_handler.failure('Seems like an Arakoon has some problems: {0}'.format(str(e)),
                                   code=ErrorCodes.arakoon_problems)
    def _ns_data(self):
        """
        Loads namespace data
        """
        if self.abm_cluster is None:
            return []  # No ABM cluster yet, so backend not fully installed yet

        config = Configuration.get_configuration_path(
            self.abm_cluster.config_location)
        return AlbaCLI.run(command='show-namespaces',
                           config=config,
                           named_params={'max': -1})[1]
    def _deploy(config, filesystem, offline_nodes=None, plugins=None, delay_service_registration=False):
        """
        Deploys a complete cluster: Distributing the configuration files, creating directories and services
        """
        if os.environ.get('RUNNING_UNITTESTS') == 'True':
            if filesystem is True:
                raise NotImplementedError('At this moment, there is no support for unit-testing filesystem backend Arakoon clusters')

        ArakoonInstaller._logger.debug('Deploying cluster {0}'.format(config.cluster_id))
        if offline_nodes is None:
            offline_nodes = []

        service_metadata = {}
        for node in config.nodes:
            if node.ip in offline_nodes:
                continue
            ArakoonInstaller._logger.debug('  Deploying cluster {0} on {1}'.format(config.cluster_id, node.ip))
            root_client = SSHClient(node.ip, username='******')

            # Distributes a configuration file to all its nodes
            config.write_config(node.ip)

            # Create dirs as root because mountpoint /mnt/cache1 is typically owned by root
            abs_paths = {node.tlog_dir, node.home}  # That's a set
            if node.log_sinks.startswith('/'):
                abs_paths.add(os.path.dirname(os.path.abspath(node.log_sinks)))
            if node.crash_log_sinks.startswith('/'):
                abs_paths.add(os.path.dirname(os.path.abspath(node.crash_log_sinks)))
            abs_paths = list(abs_paths)
            root_client.dir_create(abs_paths)
            root_client.dir_chmod(abs_paths, 0755, recursive=True)
            root_client.dir_chown(abs_paths, 'ovs', 'ovs', recursive=True)

            # Creates services for/on all nodes in the config
            if config.filesystem is True:
                config_path = config.config_path
            else:
                config_path = Configuration.get_configuration_path(config.config_path)
            extra_version_cmd = ''
            if plugins is not None:
                extra_version_cmd = ';'.join(plugins)
            metadata = ServiceManager.add_service(name='ovs-arakoon',
                                                  client=root_client,
                                                  params={'CLUSTER': config.cluster_id,
                                                          'NODE_ID': node.name,
                                                          'CONFIG_PATH': config_path,
                                                          'EXTRA_VERSION_CMD': extra_version_cmd},
                                                  target_name='ovs-arakoon-{0}'.format(config.cluster_id),
                                                  startup_dependency=('ovs-watcher-config' if filesystem is False else None),
                                                  delay_registration=delay_service_registration)
            service_metadata[node.ip] = metadata
            ArakoonInstaller._logger.debug('  Deploying cluster {0} on {1} completed'.format(config.cluster_id, node.ip))
        return service_metadata
    def _generate_proxy_config(proxy_type, proxy_service):
        proxy_config = {'log_level': 'info',
                        'port': proxy_service.service.ports[0] if proxy_type == 'main' else 0,
                        'ips': [self.storagedriver.storage_ip] if proxy_type == 'main' else ['127.0.0.1'],
                        'manifest_cache_size': manifest_cache_size,
                        'fragment_cache': fragment_cache_main_proxy if proxy_type == 'main' else fragment_cache_scrub_proxy,
                        'transport': 'tcp',
                        'read_preference': read_preferences,
                        'albamgr_cfg_url': Configuration.get_configuration_path(config_tree.format('abm'))}
        if self.sr_installer.block_cache_supported:
            proxy_config['block_cache'] = block_cache_main_proxy if proxy_type == 'main' else block_cache_scrub_proxy
        return proxy_config
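
# Illustrative sketch (not part of the original snippet): main vs scrub proxy selection.
# Per proxy type the closure above only varies the port, the listening IPs and the fragment cache.
# A framework-free illustration of that selection logic, with a hypothetical helper and made-up values:
def _select_endpoint(proxy_type, service_port, storage_ip):
    port = service_port if proxy_type == 'main' else 0
    ips = [storage_ip] if proxy_type == 'main' else ['127.0.0.1']
    return port, ips

print(_select_endpoint('main', 26203, '10.100.1.1'))   # (26203, ['10.100.1.1'])
print(_select_endpoint('scrub', 26203, '10.100.1.1'))  # (0, ['127.0.0.1'])
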
    def restart_cluster_add(cluster_name, current_ips, new_ip, filesystem):
        """
        Execute a (re)start sequence after adding a new node to a cluster.
        :param cluster_name: Name of the cluster to restart
        :type cluster_name: str
        :param current_ips: IPs of the previous nodes
        :type current_ips: list
        :param new_ip: IP of the newly added node
        :type new_ip: str
        :param filesystem: Indicates whether the configuration should be on the filesystem or in a configuration cluster
        :type filesystem: bool
        :return: None
        """
        ArakoonInstaller._logger.debug('Restart sequence (add) for {0}'.format(cluster_name))
        ArakoonInstaller._logger.debug('Current ips: {0}'.format(', '.join(current_ips)))
        ArakoonInstaller._logger.debug('New ip: {0}'.format(new_ip))

        client = SSHClient(new_ip, username=ArakoonInstaller.SSHCLIENT_USER)
        if ArakoonInstaller.is_running(cluster_name, client):
            ArakoonInstaller._logger.info('Arakoon service for {0} is already running'.format(cluster_name))
            return
        config = ArakoonClusterConfig(cluster_name, filesystem)
        config.load_config(new_ip)
        arakoon_client = ArakoonInstaller.build_client(config)

        if len(config.nodes) > 1:
            ArakoonInstaller._logger.debug('Catching up new node {0} for cluster {1}'.format(new_ip, cluster_name))
            node_name = [node.name for node in config.nodes if node.ip == new_ip][0]
            if filesystem is True:
                config_path = config.config_path
            else:
                config_path = Configuration.get_configuration_path(config.config_path)
            client.run(['arakoon', '--node', node_name, '-config', config_path, '-catchup-only'])
            ArakoonInstaller._logger.debug('Catching up new node {0} for cluster {1} completed'.format(new_ip, cluster_name))

        threshold = 2 if new_ip in current_ips else 1
        for ip in current_ips:
            if ip == new_ip:
                continue
            current_client = SSHClient(ip, username='******')
            ArakoonInstaller.stop(cluster_name, client=current_client)
            ArakoonInstaller.start(cluster_name, client=current_client)
            ArakoonInstaller._logger.debug('  Restarted node {0} for cluster {1}'.format(current_client.ip, cluster_name))
            if len(current_ips) > threshold:  # A two node cluster needs all nodes running
                ArakoonInstaller.wait_for_cluster(cluster_name, ip, filesystem)
        client = SSHClient(new_ip, username='******')
        ArakoonInstaller.start(cluster_name, client=client)
        ArakoonInstaller.wait_for_cluster(cluster_name, new_ip, filesystem)
        arakoon_client.set(ArakoonInstaller.INTERNAL_CONFIG_KEY, config.export_ini())
        ArakoonInstaller._logger.debug('Started node {0} for cluster {1}'.format(new_ip, cluster_name))
    def collapse_arakoon():
        """
        Collapse Arakoon's Tlogs
        :return: None
        """
        ScheduledTaskController._logger.info('Starting arakoon collapse')
        storagerouters = StorageRouterList.get_storagerouters()
        cluster_info = [('cacc', storagerouters[0], True)]
        cluster_names = []
        for service in ServiceList.get_services():
            if service.is_internal is True and service.type.name in (ServiceType.SERVICE_TYPES.ARAKOON,
                                                                     ServiceType.SERVICE_TYPES.NS_MGR,
                                                                     ServiceType.SERVICE_TYPES.ALBA_MGR):
                cluster = service.name.replace('arakoon-', '')
                if cluster in cluster_names:
                    continue
                cluster_names.append(cluster)
                cluster_info.append((cluster, service.storagerouter, False))
        workload = {}
        for cluster, storagerouter, filesystem in cluster_info:
            ScheduledTaskController._logger.debug('  Collecting info for cluster {0}'.format(cluster))
            config = ArakoonClusterConfig(cluster, filesystem=filesystem)
            config.load_config(storagerouter.ip)
            for node in config.nodes:
                if node.ip not in workload:
                    workload[node.ip] = {'node_id': node.name,
                                         'clusters': []}
                workload[node.ip]['clusters'].append((cluster, filesystem))
        for storagerouter in storagerouters:
            try:
                if storagerouter.ip not in workload:
                    continue
                node_workload = workload[storagerouter.ip]
                client = SSHClient(storagerouter)
                for cluster, filesystem in node_workload['clusters']:
                    try:
                        ScheduledTaskController._logger.debug('  Collapsing cluster {0} on {1}'.format(cluster, storagerouter.ip))
                        if filesystem is True:
                            config_path = ArakoonClusterConfig.CONFIG_FILE.format(cluster)
                        else:
                            config_path = Configuration.get_configuration_path(ArakoonClusterConfig.CONFIG_KEY.format(cluster))
                        client.run(['arakoon', '--collapse-local', node_workload['node_id'], '2', '-config', config_path])
                        ScheduledTaskController._logger.info('  Collapsing cluster {0} on {1} completed'.format(cluster, storagerouter.ip))
                    except:
                        ScheduledTaskController._logger.exception('  Collapsing cluster {0} on {1} failed'.format(cluster, storagerouter.ip))
            except UnableToConnectException:
                ScheduledTaskController._logger.error('  Could not collapse any cluster on {0} (not reachable)'.format(storagerouter.name))

        ScheduledTaskController._logger.info('Arakoon collapse finished')
    def get_stats_alba_backends(cls):
        """
        Retrieve statistics about all ALBA Backends and their maintenance work
        """
        if cls._config is None:
            cls.validate_and_retrieve_config()

        stats = []
        errors = False
        environment = cls._config['environment']
        for alba_backend in AlbaBackendList.get_albabackends():
            try:
                local_summary = alba_backend.local_summary
                sizes = local_summary['sizes']
                devices = local_summary['devices']
                stats.append({
                    'tags': {
                        'environment': environment,
                        'backend_name': alba_backend.name
                    },
                    'fields': {
                        'red': int(devices['red']),
                        'free': float(sizes['size'] - sizes['used']),
                        'used': float(sizes['used']),
                        'green': int(devices['green']),
                        'orange': int(devices['orange']),
                        'maintenance_work': int(AlbaCLI.run(command='list-work',
                                                            config=Configuration.get_configuration_path(alba_backend.abm_cluster.config_location))['count'])
                    },
                    'measurement': 'backend'
                })
            except Exception:
                errors = True
                cls._logger.exception(
                    'Retrieving statistics for ALBA Backend {0} failed'.format(
                        alba_backend.name))
        return errors, stats
    def _deploy(config, filesystem, offline_nodes=None):
        """
        Deploys a complete cluster: Distributing the configuration files, creating directories and services
        """
        if os.environ.get('RUNNING_UNITTESTS') == 'True':
            if filesystem is True:
                raise NotImplementedError('At this moment, there is no support for unittesting filesystem backend Arakoon clusters')

        ArakoonInstaller._logger.debug('Deploying cluster {0}'.format(config.cluster_id))
        if offline_nodes is None:
            offline_nodes = []
        for node in config.nodes:
            if node.ip in offline_nodes:
                continue
            ArakoonInstaller._logger.debug('  Deploying cluster {0} on {1}'.format(config.cluster_id, node.ip))
            root_client = SSHClient(node.ip, username='******')

            # Distributes a configuration file to all its nodes
            config.write_config(node.ip)

            # Create dirs as root because mountpoint /mnt/cache1 is typically owned by root
            abs_paths = {node.tlog_dir, node.home}  # That's a set
            if node.log_sinks.startswith('/'):
                abs_paths.add(os.path.dirname(os.path.abspath(node.log_sinks)))
            if node.crash_log_sinks.startswith('/'):
                abs_paths.add(os.path.dirname(os.path.abspath(node.crash_log_sinks)))
            abs_paths = list(abs_paths)
            root_client.dir_create(abs_paths)
            root_client.dir_chmod(abs_paths, 0755, recursive=True)
            root_client.dir_chown(abs_paths, 'ovs', 'ovs', recursive=True)

            # Creates services for/on all nodes in the config
            if config.filesystem is True:
                config_path = config.config_path
            else:
                config_path = Configuration.get_configuration_path(config.config_path)
            base_name = 'ovs-arakoon'
            target_name = 'ovs-arakoon-{0}'.format(config.cluster_id)
            ServiceManager.add_service(base_name, root_client,
                                       params={'CLUSTER': config.cluster_id,
                                               'NODE_ID': node.name,
                                               'CONFIG_PATH': config_path,
                                               'STARTUP_DEPENDENCY': 'started ovs-watcher-config' if filesystem is False else '(local-filesystems and started networking)'},
                                       target_name=target_name)
            ArakoonInstaller._logger.debug('  Deploying cluster {0} on {1} completed'.format(config.cluster_id, node.ip))
    def __init__(self, config_type, vpool_guid, storagedriver_id):
        """
        Initializes the class
        """
        if config_type != 'storagedriver':
            raise RuntimeError('Invalid configuration type. Allowed: storagedriver')

        storagerouterclient.Logger.setupLogging(LogHandler.load_path('storagerouterclient'))
        # noinspection PyArgumentList
        storagerouterclient.Logger.enableLogging()

        self._logger = LogHandler.get('extensions', name='storagedriver')
        self.config_type = config_type
        self.configuration = {}
        self.key = '/ovs/vpools/{0}/hosts/{1}/config'.format(vpool_guid, storagedriver_id)
        self.remote_path = Configuration.get_configuration_path(self.key).strip('/')
        self.is_new = True
        self.dirty_entries = []
    def ovs_4509_validate_arakoon_collapse_test():
        """
        Validate arakoon collapse
        """
        node_ips = [sr.ip for sr in GeneralStorageRouter.get_storage_routers()]
        node_ips.sort()
        for node_ip in node_ips:
            root_client = SSHClient(node_ip, username='******')
            arakoon_clusters = []
            for service in ServiceList.get_services():
                if service.is_internal is True and service.storagerouter.ip == node_ip and \
                    service.type.name in (ServiceType.SERVICE_TYPES.ARAKOON,
                                          ServiceType.SERVICE_TYPES.NS_MGR,
                                          ServiceType.SERVICE_TYPES.ALBA_MGR):
                    arakoon_clusters.append(service.name.replace('arakoon-', ''))

            for arakoon_cluster in arakoon_clusters:
                arakoon_config_path = Configuration.get_configuration_path('/ovs/arakoon/{0}/config'.format(arakoon_cluster))
                tlog_location = '/opt/OpenvStorage/db/arakoon/{0}/tlogs'.format(arakoon_cluster)

                # read_tlog_dir
                with remote(node_ip, [Configuration]) as rem:
                    config_contents = rem.Configuration.get('/ovs/arakoon/{0}/config'.format(arakoon_cluster), raw=True)
                for line in config_contents.splitlines():
                    if 'tlog_dir' in line:
                        tlog_location = line.split()[-1]

                nr_of_tlogs = TestArakoon.get_nr_of_tlogs_in_folder(root_client, tlog_location)
                old_headdb_timestamp = 0
                if root_client.file_exists('/'.join([tlog_location, 'head.db'])):
                    old_headdb_timestamp = root_client.run(['stat', '--format=%Y', tlog_location + '/head.db'])
                if nr_of_tlogs <= 2:
                    benchmark_command = ['arakoon', '--benchmark', '-n_clients', '1', '-max_n', '5_000', '-config', arakoon_config_path]
                    root_client.run(benchmark_command)

                GenericController.collapse_arakoon()

                nr_of_tlogs = TestArakoon.get_nr_of_tlogs_in_folder(root_client, tlog_location)
                new_headdb_timestamp = root_client.run(['stat', '--format=%Y', tlog_location + '/head.db'])
                assert nr_of_tlogs <= 2,\
                    'Arakoon collapse left {0} tlogs on the environment, expecting less than 2'.format(nr_of_tlogs)
                assert old_headdb_timestamp != new_headdb_timestamp,\
                    'Timestamp of the head_db file was not changed in the process of collapsing tlogs'
    def _fill_slot(cls, node, slot_id, extra):
        # type: (AlbaNode, str, any) -> List[dict]
        """
        Fills in the slots with ASDs and checks if the BACKEND role needs to be added
        :param node: The AlbaNode to fill on
        :type node: AlbaNode
        :param slot_id: ID of the slot to fill (which is an alias of the slot)
        :type slot_id: str
        :param extra: Extra information for filling
        :type extra: any
        :return: Information about the created osds
        :rtype: List[dict]
        """
        if node.type == AlbaNode.NODE_TYPES.S3:
            extra = extra.copy()
            try:
                s3_transaction_cluster = S3TransactionClusterList.get_s3_transaction_clusters()[0]
                extra['transaction_arakoon_url'] = Configuration.get_configuration_path(key=s3_transaction_cluster.config_location)
            except IndexError:
                raise RuntimeError('No transaction arakoon was deployed for this cluster!')
        created_osds = node.client.fill_slot(slot_id=slot_id, extra=extra)
        cls._logger.info(created_osds)

        # Sync model
        if node.storagerouter is not None:
            stack = node.client.get_stack()  # type: dict
            DiskController.sync_with_reality(
                storagerouter_guid=node.storagerouter_guid)
            slot_information = stack.get(slot_id, {})
            slot_aliases = slot_information.get('aliases', [])
            for disk in node.storagerouter.disks:
                if set(disk.aliases).intersection(set(slot_aliases)):
                    partition = disk.partitions[0]
                    if DiskPartition.ROLES.BACKEND not in partition.roles:
                        partition.roles.append(DiskPartition.ROLES.BACKEND)
                        partition.save()
        return created_osds or []  # Always return a list
        def _generate_proxy_cache_config(cache_settings, cache_type, proxy_index):
            if cache_settings['read'] is False and cache_settings['write'] is False:
                return ['none']

            if cache_settings['is_backend'] is True:
                cfg_tree_name = 'abm_bc' if cache_type == StorageDriverConfiguration.CACHE_BLOCK else 'abm_aa'
                return ['alba', {'cache_on_read': cache_settings['read'],
                                 'cache_on_write': cache_settings['write'],
                                 'albamgr_cfg_url': Configuration.get_configuration_path(config_tree.format(cfg_tree_name)),
                                 'bucket_strategy': ['1-to-1', {'prefix': vpool.guid,
                                                                'preset': cache_settings['backend_info']['preset']}],
                                 'manifest_cache_size': manifest_cache_size}]

            if cache_type == StorageDriverConfiguration.CACHE_BLOCK:
                path = '{0}/bc'.format(self.storagedriver_partitions_caches[proxy_index].path)
            else:
                path = '{0}/fc'.format(self.storagedriver_partitions_caches[proxy_index].path)
            return ['local', {'path': path,
                              'max_size': self.cache_size_local / self.sr_installer.requested_local_proxies,
                              'cache_on_read': cache_settings['read'],
                              'cache_on_write': cache_settings['write']}]
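
# Illustrative sketch (not part of the original snippet): the cache-config shapes returned above.
# Depending on the cache settings the closure returns 'none', an 'alba' (backend) cache config or a
# 'local' on-disk cache config. A framework-free illustration of the 'none' and 'local' branches,
# with a hypothetical helper and made-up values:
def _sketch_cache_config(cache_settings, path, max_size):
    if cache_settings['read'] is False and cache_settings['write'] is False:
        return ['none']
    return ['local', {'path': path,
                      'max_size': max_size,
                      'cache_on_read': cache_settings['read'],
                      'cache_on_write': cache_settings['write']}]

print(_sketch_cache_config({'read': False, 'write': False}, '/mnt/cache1/fc', 10 * 1024 ** 3))
print(_sketch_cache_config({'read': True, 'write': False}, '/mnt/cache1/fc', 10 * 1024 ** 3))
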
    def __init__(self, config_type, vpool_guid, storagedriver_id):
        """
        Initializes the class
        """
        if config_type != 'storagedriver':
            raise RuntimeError(
                'Invalid configuration type. Allowed: storagedriver')

        storagerouterclient.Logger.setupLogging(
            LogHandler.load_path('storagerouterclient'))
        # noinspection PyArgumentList
        storagerouterclient.Logger.enableLogging()

        self._logger = LogHandler.get('extensions', name='storagedriver')
        self.config_type = config_type
        self.configuration = {}
        self.key = '/ovs/vpools/{0}/hosts/{1}/config'.format(
            vpool_guid, storagedriver_id)
        self.remote_path = Configuration.get_configuration_path(
            self.key).strip('/')
        self.is_new = True
        self.dirty_entries = []
    def _asd_statistics(self):
        """
        Loads statistics from all of its ASDs in one call
        """
        from ovs.dal.hybrids.albaosd import AlbaOSD

        statistics = {}
        if len(self.abm_services) == 0:
            return statistics  # No ABM services yet, so backend not fully installed yet

        asd_ids = [osd.osd_id for osd in self.osds if osd.osd_type == AlbaOSD.OSD_TYPES.ASD]
        if len(asd_ids) == 0:
            return statistics

        try:
            config = Configuration.get_configuration_path('/ovs/arakoon/{0}-abm/config'.format(self.name))
            raw_statistics = AlbaCLI.run(command='asd-multistatistics', config=config, named_params={'long-id': ','.join(asd_ids)})
        except RuntimeError:
            return statistics
        if raw_statistics:  # Guard against an empty result, as in _osd_statistics above
            for asd_id, stats in raw_statistics.iteritems():
                if stats['success'] is True:
                    statistics[asd_id] = stats['result']
        return statistics
    def _usages(self):
        """
        Returns an overview of free space, total space and used space
        """
        # Collect total usage
        usages = {'free': 0.0, 'size': 0.0, 'used': 0.0}

        if self.abm_cluster is None:
            return usages

        config = Configuration.get_configuration_path(
            self.abm_cluster.config_location)
        try:
            osds_stats = AlbaCLI.run(command='list-osds', config=config)
        except AlbaError:
            self._logger.exception('Unable to fetch OSD information')
            return usages

        for osd_stats in osds_stats:
            usages['size'] += osd_stats['total']
            usages['used'] += osd_stats['used']
        usages['free'] = usages['size'] - usages['used']

        return usages
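
# Illustrative sketch (not part of the original snippet): aggregating the list-osds output.
# Every OSD entry reports a 'total' and 'used' size; the backend-level figures are plain sums with
# 'free' derived afterwards. The entries below are made up.
osds_stats = [{'total': 4000.0, 'used': 1500.0}, {'total': 4000.0, 'used': 500.0}]
usages = {'free': 0.0, 'size': 0.0, 'used': 0.0}
for osd_stats in osds_stats:
    usages['size'] += osd_stats['total']
    usages['used'] += osd_stats['used']
usages['free'] = usages['size'] - usages['used']
print('size={0} used={1} free={2}'.format(usages['size'], usages['used'], usages['free']))  # size=8000.0 used=2000.0 free=6000.0
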
    def delete_preset(alba_backend_guid, name):
        """
        Deletes a preset from the Alba backend
        :param alba_backend_guid: Guid of the ALBA backend
        :type alba_backend_guid: str
        :param name: Name of the preset
        :type name: str
        :return: None
        """
        # VALIDATIONS
        alba_backend = AlbaBackend(alba_backend_guid)
        preset_default_map = dict((preset['name'], preset['is_default']) for preset in alba_backend.presets)
        if name not in preset_default_map:
            AlbaPresetController._logger.warning('Preset with name {0} for ALBA Backend {1} could not be found, so not deleting'.format(name, alba_backend.name))
            return

        if preset_default_map[name] is True:
            raise RuntimeError('Cannot delete the default preset')

        # DELETE PRESET
        AlbaPresetController._logger.debug('Deleting preset {0}'.format(name))
        config = Configuration.get_configuration_path(ArakoonInstaller.CONFIG_KEY.format(AlbaController.get_abm_cluster_name(alba_backend=alba_backend)))
        AlbaCLI.run(command='delete-preset', config=config, extra_params=[name])
        alba_backend.invalidate_dynamics()
    def _stack(self):
        """
        Returns an overview of this node's storage stack
        """
        from ovs.dal.hybrids.albabackend import AlbaBackend
        from ovs.dal.lists.albabackendlist import AlbaBackendList

        def _move(info):
            for move in [('state', 'status'),
                         ('state_detail', 'status_detail')]:
                if move[0] in info:
                    info[move[1]] = info[move[0]]
                    del info[move[0]]

        stack = {}
        node_down = False
        # Fetch stack from asd-manager
        try:
            remote_stack = self.client.get_stack()
            for slot_id, slot_data in remote_stack.iteritems():
                stack[slot_id] = {'status': 'ok'}
                stack[slot_id].update(slot_data)
                # Migrate state > status
                _move(stack[slot_id])
                for osd_data in slot_data.get('osds', {}).itervalues():
                    _move(osd_data)
        except (requests.ConnectionError, requests.Timeout,
                InvalidCredentialsError):
            self._logger.warning(
                'Error during stack retrieval. Assuming that the node is down')
            node_down = True

        model_osds = {}
        found_osds = {}
        # Apply own model to fetched stack
        for osd in self.osds:
            model_osds[osd.osd_id] = osd  # Initially set the info
            if osd.slot_id not in stack:
                stack[osd.slot_id] = {'status': self.OSD_STATUSES.UNKNOWN if node_down is True else self.OSD_STATUSES.MISSING,
                                      'status_detail': self.OSD_STATUS_DETAILS.NODEDOWN if node_down is True else '',
                                      'osds': {}}
            osd_data = stack[osd.slot_id]['osds'].get(osd.osd_id, {})
            stack[osd.slot_id]['osds'][osd.osd_id] = osd_data  # Initially set the info in the stack
            osd_data.update(osd.stack_info)
            if node_down is True:
                osd_data['status'] = self.OSD_STATUSES.UNKNOWN
                osd_data['status_detail'] = self.OSD_STATUS_DETAILS.NODEDOWN
            elif osd.alba_backend_guid is not None:  # OSD has been claimed
                # Load information from alba
                if osd.alba_backend_guid not in found_osds:
                    found_osds[osd.alba_backend_guid] = {}
                    if osd.alba_backend.abm_cluster is not None:
                        config = Configuration.get_configuration_path(osd.alba_backend.abm_cluster.config_location)
                        try:
                            for found_osd in AlbaCLI.run(command='list-all-osds', config=config):
                                found_osds[osd.alba_backend_guid][found_osd['long_id']] = found_osd
                        except (AlbaError, RuntimeError):
                            self._logger.exception('Listing all osds has failed')
                            osd_data['status'] = self.OSD_STATUSES.UNKNOWN
                            osd_data['status_detail'] = self.OSD_STATUS_DETAILS.ALBAERROR
                            continue

                if osd.osd_id not in found_osds[osd.alba_backend_guid]:
                    # Not claimed by any backend thus not in use
                    continue
                found_osd = found_osds[osd.alba_backend_guid][osd.osd_id]
                if found_osd['decommissioned'] is True:
                    osd_data['status'] = self.OSD_STATUSES.UNAVAILABLE
                    osd_data['status_detail'] = self.OSD_STATUS_DETAILS.DECOMMISSIONED
                    continue

                backend_interval_key = '/ovs/alba/backends/{0}/gui_error_interval'.format(osd.alba_backend_guid)
                if Configuration.exists(backend_interval_key):
                    interval = Configuration.get(backend_interval_key)
                else:
                    interval = Configuration.get('/ovs/alba/backends/global_gui_error_interval')
                read = found_osd['read'] or [0]
                write = found_osd['write'] or [0]
                errors = found_osd['errors']
                osd_data['status'] = self.OSD_STATUSES.WARNING
                osd_data['status_detail'] = self.OSD_STATUS_DETAILS.ERROR
                if len(errors) == 0 or (len(read + write) > 0 and
                                        max(min(read), min(write)) > max(error[0] for error in errors) + interval):
                    osd_data['status'] = self.OSD_STATUSES.OK
                    osd_data['status_detail'] = ''

        statistics = {}
        for slot_info in stack.itervalues():
            for osd_id, osd in slot_info['osds'].iteritems():
                if osd.get('status_detail') == self.OSD_STATUS_DETAILS.ACTIVATING:
                    osd['claimed_by'] = 'unknown'  # We won't be able to connect to it just yet
                    continue
                if osd_id not in model_osds:
                    # The osd is known by the remote node but not in the model
                    # In that case, let's connect to the OSD to see whether we get some info from it
                    try:
                        ips = osd['hosts'] if 'hosts' in osd and len(osd['hosts']) > 0 else osd.get('ips', [])
                        port = osd['port']
                        claimed_by = 'unknown'
                        for ip in ips:
                            try:
                                # Output will be None if it is not claimed
                                claimed_by = AlbaCLI.run('get-osd-claimed-by',
                                                         named_params={
                                                             'host': ip,
                                                             'port': port
                                                         })
                                break
                            except (AlbaError, RuntimeError):
                                self._logger.warning(
                                    'get-osd-claimed-by failed for IP:port {0}:{1}'
                                    .format(ip, port))
                        alba_backend = AlbaBackendList.get_by_alba_id(
                            claimed_by)
                        osd['claimed_by'] = alba_backend.guid if alba_backend is not None else claimed_by
                    except KeyError:
                        osd['claimed_by'] = 'unknown'
                    except:
                        self._logger.exception(
                            'Could not load OSD info: {0}'.format(osd_id))
                        osd['claimed_by'] = 'unknown'
                        if osd.get('status') not in ['error', 'warning']:
                            osd['status'] = self.OSD_STATUSES.ERROR
                            osd['status_detail'] = self.OSD_STATUS_DETAILS.UNREACHABLE
                claimed_by = osd.get('claimed_by', 'unknown')
                if claimed_by == 'unknown':
                    continue
                try:
                    alba_backend = AlbaBackend(claimed_by)
                except ObjectNotFoundException:
                    continue
                # Add usage information
                if alba_backend not in statistics:
                    statistics[alba_backend] = alba_backend.osd_statistics
                osd_statistics = statistics[alba_backend]
                if osd_id not in osd_statistics:
                    continue
                stats = osd_statistics[osd_id]
                osd['usage'] = {
                    'size': int(stats['capacity']),
                    'used': int(stats['disk_usage']),
                    'available': int(stats['capacity'] - stats['disk_usage'])
                }
        return stack
    def validate_alba_backend_sanity_without_claimed_disks(alba_backend):
        """
        Validate whether the ALBA backend is configured correctly
        :param alba_backend: ALBA backend
        :return: None
        """
        # Attribute validation
        assert alba_backend.available is True,\
            'ALBA backend {0} is not available'.format(alba_backend.backend.name)
        assert len(alba_backend.presets) >= 1,\
            'No preset found for ALBA backend {0}'.format(alba_backend.backend.name)
        assert len([default for default in alba_backend.presets if default['is_default'] is True]) == 1,\
            'Could not find default preset for backend {0}'.format(alba_backend.backend.name)
        assert alba_backend.backend.backend_type.code == 'alba',\
            'Backend type for ALBA backend is {0}'.format(alba_backend.backend.backend_type.code)
        assert alba_backend.backend.status == 'RUNNING',\
            'Status for ALBA backend is {0}'.format(alba_backend.backend.status)

        # Validate ABM and NSM services
        storagerouters = GeneralStorageRouter.get_storage_routers()
        storagerouters_with_db_role = [sr for sr in storagerouters if GeneralStorageRouter.has_roles(storagerouter=sr, roles='DB') is True and sr.node_type == 'MASTER']

        assert len(alba_backend.abm_services) == len(storagerouters_with_db_role),\
            'Not enough ABM services found'
        assert len(alba_backend.nsm_services) == len(storagerouters_with_db_role),\
            'Not enough NSM services found'

        # Validate ALBA backend configuration structure
        alba_backend_key = '/ovs/alba/backends'
        assert Configuration.dir_exists(key=alba_backend_key) is True,\
            'Configuration does not contain key {0}'.format(alba_backend_key)

        actual_config_keys = [key for key in Configuration.list(alba_backend_key)]
        expected_config_keys = ['global_gui_error_interval', alba_backend.guid, 'default_nsm_hosts']
        optional_config_keys = ['verification_factor']

        expected_keys_amount = 0
        for optional_key in optional_config_keys:
            if optional_key in actual_config_keys:
                expected_keys_amount += 1

        for expected_key in expected_config_keys:
            if not re.match(Toolbox.regex_guid, expected_key):
                expected_keys_amount += 1
            assert expected_key in actual_config_keys,\
                'Key {0} was not found in tree {1}'.format(expected_key, alba_backend_key)

        for actual_key in list(actual_config_keys):
            if re.match(Toolbox.regex_guid, actual_key):
                actual_config_keys.remove(actual_key)  # Remove all alba backend keys
        assert len(actual_config_keys) == expected_keys_amount,\
            'Another key was added to the {0} tree'.format(alba_backend_key)

        this_alba_backend_key = '{0}/{1}'.format(alba_backend_key, alba_backend.guid)
        actual_keys = [key for key in Configuration.list(this_alba_backend_key)]
        expected_keys = ['maintenance']
        assert actual_keys == expected_keys,\
            'Actual keys: {0} - Expected keys: {1}'.format(actual_keys, expected_keys)

        maintenance_key = '{0}/maintenance'.format(this_alba_backend_key)
        actual_keys = [key for key in Configuration.list(maintenance_key)]
        expected_keys = ['nr_of_agents', 'config']
        assert set(actual_keys) == set(expected_keys),\
            'Actual keys: {0} - Expected keys: {1}'.format(actual_keys, expected_keys)
        # @TODO: Add validation for config values

        # Validate ASD node configuration structure
        alba_nodes = GeneralAlba.get_alba_nodes()
        assert len(alba_nodes) > 0,\
            'Could not find any ALBA nodes in the model'
        alba_node_key = '/ovs/alba/asdnodes'
        actual_keys = [key for key in Configuration.list(alba_node_key)]
        assert len(alba_nodes) == len(actual_keys),\
            'Amount of ALBA nodes in model: {0} >< amount of ALBA nodes in configuration: {1}.'.format(len(alba_nodes),
                                                                                                       len(actual_keys))
        for alba_node in alba_nodes:
            assert alba_node.node_id in actual_keys,\
                'ALBA node with ID {0} not present in configuration'.format(alba_node.node_id)

            actual_asdnode_keys = [key for key in Configuration.list('{0}/{1}'.format(alba_node_key, alba_node.node_id))]
            expected_asdnode_keys = ['config', 'services']
            assert actual_asdnode_keys == expected_asdnode_keys,\
                'Actual keys: {0} - Expected keys: {1}'.format(actual_asdnode_keys, expected_asdnode_keys)

            actual_config_keys = [key for key in Configuration.list('{0}/{1}/config'.format(alba_node_key, alba_node.node_id))]
            expected_config_keys = ['main', 'network']
            assert set(actual_config_keys) == set(expected_config_keys),\
                'Actual keys: {0} - Expected keys: {1}'.format(actual_config_keys, expected_config_keys)
            # @TODO: Add validation for main and network values

        # Validate Arakoon configuration structure
        arakoon_abm_key = '/ovs/arakoon/{0}/config'.format(alba_backend.abm_services[0].service.name).replace('arakoon-', '')
        arakoon_nsm_key = '/ovs/arakoon/{0}/config'.format(alba_backend.nsm_services[0].service.name).replace('arakoon-', '')
        assert Configuration.exists(key=arakoon_abm_key, raw=True) is True,\
            'Configuration key {0} does not exist'.format(arakoon_abm_key)
        assert Configuration.exists(key=arakoon_nsm_key, raw=True) is True,\
            'Configuration key {0} does not exist'.format(arakoon_nsm_key)
        # @TODO: Add validation for config values

        # Validate maintenance agents
        actual_amount_agents = len([service for node_services in [alba_node.client.list_maintenance_services() for alba_node in alba_nodes] for service in node_services])
        expected_amount_agents = 1
        assert actual_amount_agents == expected_amount_agents,\
            'Amount of maintenance agents is incorrect. Found {0} - Expected {1}'.format(actual_amount_agents,
                                                                                         expected_amount_agents)

        # Validate arakoon services
        machine_ids = [sr.machine_id for sr in storagerouters_with_db_role]
        abm_service_name = alba_backend.abm_services[0].service.name
        nsm_service_name = alba_backend.nsm_services[0].service.name
        for storagerouter in storagerouters_with_db_role:
            root_client = SSHClient(endpoint=storagerouter, username='******')
            for service_name in [abm_service_name, nsm_service_name]:
                assert GeneralService.has_service(name=service_name, client=root_client) is True,\
                    'Service {0} not deployed on Storage Router {1}'.format(service_name, storagerouter.name)
                exitcode, output = GeneralService.get_service_status(name=service_name, client=root_client)
                assert exitcode is True,\
                    'Service {0} not running on Storage Router {1} - {2}'.format(service_name, storagerouter.name,
                                                                                 output)
                out, err, _ = General.execute_command('arakoon --who-master -config {0}'.format(Configuration.get_configuration_path('/ovs/arakoon/{0}/config'.format(abm_service_name.replace('arakoon-', '')))))
                assert out.strip() in machine_ids,\
                    'Arakoon master is {0}, but should be 1 of "{1}"'.format(out.strip(), ', '.join(machine_ids))
    def migrate(previous_version):
        """
        Migrates from a given version to the current version. It uses 'previous_version' to be smart
        wherever possible, but the code should be able to migrate any version towards the expected version.
        When this is not possible, the code can set a minimum version and raise when it is not met.
        :param previous_version: The previous version from which to start the migration
        :type previous_version: float
        """

        working_version = previous_version

        if working_version == 0:
            from ovs.dal.hybrids.servicetype import ServiceType
            # Initial version:
            # * Add any basic configuration or model entries

            # Add backends
            for backend_type_info in [('ALBA', 'alba')]:
                code = backend_type_info[1]
                backend_type = BackendTypeList.get_backend_type_by_code(code)
                if backend_type is None:
                    backend_type = BackendType()
                backend_type.name = backend_type_info[0]
                backend_type.code = code
                backend_type.save()

            # Add service types
            for service_type_info in [
                    ServiceType.SERVICE_TYPES.NS_MGR,
                    ServiceType.SERVICE_TYPES.ALBA_MGR,
                    ServiceType.SERVICE_TYPES.ALBA_S3_TRANSACTION
            ]:
                service_type = ServiceType()
                service_type.name = service_type_info
                service_type.save()

        # From here on, all actual migration should happen to get to the expected state for THIS RELEASE
        elif working_version < DALMigrator.THIS_VERSION:
            import hashlib
            from ovs.dal.exceptions import ObjectNotFoundException
            from ovs.dal.helpers import HybridRunner, Descriptor
            from ovs.dal.hybrids.albaabmcluster import ABMCluster
            from ovs.dal.hybrids.albaosd import AlbaOSD
            from ovs.dal.hybrids.albansmcluster import NSMCluster
            from ovs.dal.hybrids.j_abmservice import ABMService
            from ovs.dal.hybrids.j_nsmservice import NSMService
            from ovs.dal.hybrids.service import Service
            from ovs.dal.hybrids.servicetype import ServiceType
            from ovs.dal.lists.albabackendlist import AlbaBackendList
            from ovs.dal.lists.albanodelist import AlbaNodeList
            from ovs.dal.lists.servicetypelist import ServiceTypeList
            from ovs.dal.lists.storagerouterlist import StorageRouterList
            from ovs.extensions.db.arakooninstaller import ArakoonClusterConfig, ArakoonInstaller
            from ovs.extensions.generic.configuration import Configuration, NotFoundException
            from ovs_extensions.generic.toolbox import ExtensionsToolbox
            from ovs.extensions.plugins.albacli import AlbaCLI
            from ovs.extensions.storage.persistentfactory import PersistentFactory

            # Migrate unique constraints & indexes
            client = PersistentFactory.get_client()
            hybrid_structure = HybridRunner.get_hybrids()
            for class_descriptor in hybrid_structure.values():
                cls = Descriptor().load(class_descriptor).get_object()
                classname = cls.__name__.lower()
                unique_key = 'ovs_unique_{0}_{{0}}_'.format(classname)
                index_prefix = 'ovs_index_{0}|{{0}}|'.format(classname)
                index_key = 'ovs_index_{0}|{{0}}|{{1}}'.format(classname)
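                # Illustrative key layout written below (classname/property are examples only):
                #   ovs_unique_albabackend_name_<sha1(value)>   -> 'ovs_data_albabackend_<guid>'
                #   ovs_index_albabackend|alba_id|<sha1(value)> -> ['ovs_data_albabackend_<guid>', ...]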
                uniques = []
                indexes = []
                # noinspection PyProtectedMember
                for prop in cls._properties:
                    if prop.unique is True and len(list(client.prefix(unique_key.format(prop.name)))) == 0:
                        uniques.append(prop.name)
                    if prop.indexed is True and len(list(client.prefix(index_prefix.format(prop.name)))) == 0:
                        indexes.append(prop.name)
                if len(uniques) > 0 or len(indexes) > 0:
                    prefix = 'ovs_data_{0}_'.format(classname)
                    for key, data in client.prefix_entries(prefix):
                        for property_name in uniques:
                            ukey = '{0}{1}'.format(unique_key.format(property_name),
                                                   hashlib.sha1(str(data[property_name])).hexdigest())
                            client.set(ukey, key)
                        for property_name in indexes:
                            if property_name not in data:
                                continue  # This is the case when there's a new indexed property added.
                            ikey = index_key.format(property_name,
                                                    hashlib.sha1(str(data[property_name])).hexdigest())
                            index = list(client.get_multi([ikey], must_exist=False))[0]
                            transaction = client.begin_transaction()
                            if index is None:
                                client.assert_value(ikey,
                                                    None,
                                                    transaction=transaction)
                                client.set(ikey, [key],
                                           transaction=transaction)
                            elif key not in index:
                                client.assert_value(ikey,
                                                    index[:],
                                                    transaction=transaction)
                                client.set(ikey,
                                           index + [key],
                                           transaction=transaction)
                            client.apply_transaction(transaction)

            #############################################
            # Introduction of ABMCluster and NSMCluster #
            #############################################
            # Verify presence of unchanged ALBA Backends
            alba_backends = AlbaBackendList.get_albabackends()
            changes_required = False
            for alba_backend in alba_backends:
                if alba_backend.abm_cluster is None or len(alba_backend.nsm_clusters) == 0:
                    changes_required = True
                    break

            if changes_required:
                # Retrieve ABM and NSM clusters
                abm_cluster_info = []
                nsm_cluster_info = []
                for cluster_name in Configuration.list('/ovs/arakoon'):
                    try:
                        metadata = ArakoonInstaller.get_arakoon_metadata_by_cluster_name(cluster_name=cluster_name)
                        if metadata['cluster_type'] == ServiceType.ARAKOON_CLUSTER_TYPES.ABM:
                            abm_cluster_info.append(metadata)
                        elif metadata['cluster_type'] == ServiceType.ARAKOON_CLUSTER_TYPES.NSM:
                            nsm_cluster_info.append(metadata)
                    except NotFoundException:
                        continue

                # Retrieve NSM Arakoon cluster information
                cluster_arakoon_map = {}
                for cluster_info in abm_cluster_info + nsm_cluster_info:
                    cluster_name = cluster_info['cluster_name']
                    arakoon_config = ArakoonClusterConfig(cluster_id=cluster_name)
                    cluster_arakoon_map[cluster_name] = arakoon_config.export_dict()

                storagerouter_map = dict(
                    (storagerouter.machine_id, storagerouter) for storagerouter
                    in StorageRouterList.get_storagerouters())
                alba_backend_id_map = dict((alba_backend.alba_id, alba_backend)
                                           for alba_backend in alba_backends)
                for cluster_info in abm_cluster_info:
                    internal = cluster_info['internal']
                    cluster_name = cluster_info['cluster_name']
                    config_location = Configuration.get_configuration_path(
                        key=ArakoonClusterConfig.CONFIG_KEY.format(
                            cluster_name))
                    try:
                        alba_id = AlbaCLI.run(command='get-alba-id',
                                              config=config_location,
                                              named_params={'attempts': 3})['id']
                        nsm_hosts = AlbaCLI.run(command='list-nsm-hosts',
                                                config=config_location,
                                                named_params={'attempts': 3})
                    except RuntimeError:
                        continue

                    alba_backend = alba_backend_id_map.get(alba_id)
                    if alba_backend is None:  # ALBA Backend with ID not found in model
                        continue
                    if alba_backend.abm_cluster is not None and len(alba_backend.nsm_clusters) > 0:  # Clusters already exist
                        continue

                    # Create ABM Cluster
                    if alba_backend.abm_cluster is None:
                        abm_cluster = ABMCluster()
                        abm_cluster.name = cluster_name
                        abm_cluster.alba_backend = alba_backend
                        abm_cluster.config_location = ArakoonClusterConfig.CONFIG_KEY.format(
                            cluster_name)
                        abm_cluster.save()
                    else:
                        abm_cluster = alba_backend.abm_cluster

                    # Create ABM Services
                    abm_arakoon_config = cluster_arakoon_map[cluster_name]
                    abm_arakoon_config.pop('global')
                    arakoon_nodes = abm_arakoon_config.keys()
                    if internal is False:
                        services_to_create = 1
                    else:
                        if set(arakoon_nodes).difference(
                                set(storagerouter_map.keys())):
                            continue
                        services_to_create = len(arakoon_nodes)
                    for index in range(services_to_create):
                        service = Service()
                        service.name = 'arakoon-{0}-abm'.format(
                            alba_backend.name)
                        service.type = ServiceTypeList.get_by_name(
                            ServiceType.SERVICE_TYPES.ALBA_MGR)
                        if internal is True:
                            arakoon_node_config = abm_arakoon_config[arakoon_nodes[index]]
                            service.ports = [
                                arakoon_node_config['client_port'],
                                arakoon_node_config['messaging_port']
                            ]
                            service.storagerouter = storagerouter_map[arakoon_nodes[index]]
                        else:
                            service.ports = []
                            service.storagerouter = None
                        service.save()

                        abm_service = ABMService()
                        abm_service.service = service
                        abm_service.abm_cluster = abm_cluster
                        abm_service.save()

                    # Create NSM Clusters
                    for cluster_index, nsm_host in enumerate(
                            sorted(nsm_hosts, key=lambda host: ExtensionsToolbox.advanced_sort(host['cluster_id'], '_'))):
                        nsm_cluster_name = nsm_host['cluster_id']
                        nsm_arakoon_config = cluster_arakoon_map.get(nsm_cluster_name)
                        if nsm_arakoon_config is None:
                            continue

                        number = cluster_index if internal is False else int(nsm_cluster_name.split('_')[-1])
                        nsm_cluster = NSMCluster()
                        nsm_cluster.name = nsm_cluster_name
                        nsm_cluster.number = number
                        nsm_cluster.alba_backend = alba_backend
                        nsm_cluster.config_location = ArakoonClusterConfig.CONFIG_KEY.format(
                            nsm_cluster_name)
                        nsm_cluster.save()

                        # Create NSM Services
                        nsm_arakoon_config.pop('global')
                        arakoon_nodes = nsm_arakoon_config.keys()
                        if internal is False:
                            services_to_create = 1
                        else:
                            if set(arakoon_nodes).difference(
                                    set(storagerouter_map.keys())):
                                continue
                            services_to_create = len(arakoon_nodes)
                        for service_index in range(services_to_create):
                            service = Service()
                            service.name = 'arakoon-{0}-nsm_{1}'.format(
                                alba_backend.name, number)
                            service.type = ServiceTypeList.get_by_name(
                                ServiceType.SERVICE_TYPES.NS_MGR)
                            if internal is True:
                                arakoon_node_config = nsm_arakoon_config[arakoon_nodes[service_index]]
                                service.ports = [
                                    arakoon_node_config['client_port'],
                                    arakoon_node_config['messaging_port']
                                ]
                                service.storagerouter = storagerouter_map[arakoon_nodes[service_index]]
                            else:
                                service.ports = []
                                service.storagerouter = None
                            service.save()

                            nsm_service = NSMService()
                            nsm_service.service = service
                            nsm_service.nsm_cluster = nsm_cluster
                            nsm_service.save()

            # Clean up all junction services no longer linked to an ALBA Backend
            all_nsm_services = [
                service.nsm_service for service in ServiceTypeList.get_by_name(
                    ServiceType.SERVICE_TYPES.NS_MGR).services
                if service.nsm_service.nsm_cluster is None
            ]
            all_abm_services = [
                service.abm_service for service in ServiceTypeList.get_by_name(
                    ServiceType.SERVICE_TYPES.ALBA_MGR).services
                if service.abm_service.abm_cluster is None
            ]
            for abm_service in all_abm_services:
                abm_service.delete()
                abm_service.service.delete()
            for nsm_service in all_nsm_services:
                nsm_service.delete()
                nsm_service.service.delete()

            ################################
            # Introduction of Active Drive #
            ################################
            # Update slot_id and Alba Node relation for all OSDs
            client = PersistentFactory.get_client()
            disk_osd_map = {}
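            # Build a map of AlbaDisk guid -> [AlbaOSD guid, ...] so every OSD can be re-linked to
            # its slot and AlbaNode below; the obsolete 'alba_disk' relation is stripped from the OSD data.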
            for key, data in client.prefix_entries('ovs_data_albaosd_'):
                alba_disk_guid = data.get('alba_disk', {}).get('guid')
                if alba_disk_guid is not None:
                    if alba_disk_guid not in disk_osd_map:
                        disk_osd_map[alba_disk_guid] = []
                    disk_osd_map[alba_disk_guid].append(
                        key.replace('ovs_data_albaosd_', ''))
                try:
                    value = client.get(key)
                    value.pop('alba_disk', None)
                    client.set(key=key, value=value)
                except Exception:
                    pass  # We don't care if we would have any leftover AlbaDisk information in _data, but it's cleaner not to keep it

            alba_guid_node_map = dict(
                (an.guid, an) for an in AlbaNodeList.get_albanodes())
            for key, data in client.prefix_entries('ovs_data_albadisk_'):
                alba_disk_guid = key.replace('ovs_data_albadisk_', '')
                alba_node_guid = data.get('alba_node', {}).get('guid')
                if alba_disk_guid in disk_osd_map and alba_node_guid in alba_guid_node_map and len(
                        data.get('aliases', [])) > 0:
                    slot_id = data['aliases'][0].split('/')[-1]
                    for osd_guid in disk_osd_map[alba_disk_guid]:
                        try:
                            osd = AlbaOSD(osd_guid)
                        except ObjectNotFoundException:
                            continue
                        osd.slot_id = slot_id
                        osd.alba_node = alba_guid_node_map[alba_node_guid]
                        osd.save()
                client.delete(key=key, must_exist=False)

            # Remove unique constraints for AlbaNode IP
            for key in client.prefix('ovs_unique_albanode_ip_'):
                client.delete(key=key, must_exist=False)

            # Remove relation for all Alba Disks
            for key in client.prefix('ovs_reverseindex_albadisk_'):
                client.delete(key=key, must_exist=False)

            # Remove the relation between AlbaNode and AlbaDisk
            for key in client.prefix('ovs_reverseindex_albanode_'):
                if '|disks|' in key:
                    client.delete(key=key, must_exist=False)

        return DALMigrator.THIS_VERSION
    def get_disk_safety(cls, result_handler, backends_to_include=(), backends_to_skip=(), include_errored_as_dead=False):
        """
        Fetch safety of every namespace in every backend
        - amount_in_bucket is in %
        - max_disk_safety is the max. key that should be available in current_disk_safety
        Output example: {'mybackend02': {'1,2': {'max_disk_safety': 2, 'current_disk_safety':
        {2: {'namespace': u'b4eef27e-ef54-4fe8-8658-cdfbda7ceae4_000000065', 'amount_in_bucket': 100}}}}, 'mybackend':
        {'1,2': {'max_disk_safety': 2, 'current_disk_safety':
        {2: {'namespace': u'b4eef27e-ef54-4fe8-8658-cdfbda7ceae4_000000065', 'amount_in_bucket': 100}}}},
        'mybackend-global': {'1,2': {'max_disk_safety': 2, 'current_disk_safety':
        {1: {'namespace': u'e88c88c9-632c-4975-b39f-e9993e352560', 'amount_in_bucket': 100}}}}}
        :param result_handler: logging object
        :type result_handler: ovs.extensions.healthcheck.result.HCResults
        :param backends_to_include: Backend(s) to check for
        :type backends_to_include: tuple[str]
        :param backends_to_skip: Backend(s) to skip checking for
        :type backends_to_skip: tuple[str]
        :param include_errored_as_dead: OSDs with errors are treated as dead ones during the calculation
        :type include_errored_as_dead: bool
        :return: Safety of every namespace in every backend
        :rtype: dict
        """
        disk_safety_overview = {}
        for alba_backend in BackendHelper.get_albabackends():
            if backends_to_skip and alba_backend.name in backends_to_skip:
                continue
            if backends_to_include and alba_backend.name not in backends_to_include:
                continue
            disk_safety_overview[alba_backend.name] = {}
            config = Configuration.get_configuration_path('ovs/arakoon/{0}-abm/config'.format(alba_backend.name))
            # Fetch alba info
            try:
                extra_params = []
                if include_errored_as_dead:
                    # @TODO Revisit once the https://github.com/openvstorage/alba/issues/441 has been resolved
                    extra_params.append('--include-errored-as-dead')
                namespaces = AlbaCLI.run(command='get-disk-safety', config=config, extra_params=extra_params)
                cache_eviction_prefix_preset_pairs = AlbaCLI.run(command='get-maintenance-config', config=config)['cache_eviction_prefix_preset_pairs']
                presets = AlbaCLI.run(command='list-presets', config=config)
            except AlbaException as ex:
                result_handler.exception('Could not fetch alba information for backend {0}. Message: {1}'.format(alba_backend.name, ex),
                                         code=ErrorCodes.alba_cmd_fail)
                # Do not execute further
                continue

            # collect in_use presets & their policies
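            # Note (assumption, based on the ALBA policy layout [k, m, c, x]): the key below is
            # '<k>,<m>' and 'max_disk_safety' equals m, i.e. the number of fragments that can be
            # lost before data in that bucket becomes unavailable.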
            for preset in presets:
                if not preset['in_use']:
                    continue
                for policy in preset['policies']:
                    disk_safety_overview[alba_backend.name]['{0},{1}'.format(str(policy[0]), str(policy[1]))] = {'current_disk_safety': {}, 'max_disk_safety': policy[1]}

            # collect namespaces
            ignorable_namespaces = [cls.BASE_NAMESPACE_KEY] + cache_eviction_prefix_preset_pairs.keys()
            test_worthy_namespaces = (item for item in namespaces if not item['namespace'].startswith(tuple(ignorable_namespaces)))
            for namespace in test_worthy_namespaces:
                # calc total objects in namespace
                total_count = 0
                for bucket_safety in namespace['bucket_safety']:
                    total_count += bucket_safety['count']

                for bucket_safety in namespace['bucket_safety']:
                    # calc safety bucket
                    calculated_disk_safety = bucket_safety['remaining_safety']
                    safety = '{0},{1}'.format(str(bucket_safety['bucket'][0]), str(bucket_safety['bucket'][1]))
                    current_disk_safety = disk_safety_overview[alba_backend.name][safety]['current_disk_safety']
                    to_be_added_namespace = {'namespace': namespace['namespace'],
                                             'amount_in_bucket': "%.5f" % (float(bucket_safety['count'])/float(total_count)*100)}
                    if calculated_disk_safety in current_disk_safety:
                        current_disk_safety[calculated_disk_safety].append(to_be_added_namespace)
                    else:
                        current_disk_safety[calculated_disk_safety] = [to_be_added_namespace]
        return disk_safety_overview
    def check_if_proxies_work(cls, result_handler):
        """
        Checks if all ALBA proxies on the local machine work: it creates a namespace and tries to put and fetch an object
        :param result_handler: logging object
        :type result_handler: ovs.extensions.healthcheck.result.HCResults
        :return: None
        :rtype: NoneType
        """
        namespace_params = {'bucket_count': (list, None),
                            'logical': (int, None),
                            'storage': (int, None),
                            'storage_per_osd': (list, None)}
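        # These (type, constraint) pairs are passed to ExtensionsToolbox.verify_required_params()
        # further down to assert that 'show-namespace' returns all expected fields; None means no
        # extra value constraint (assumption based on the verify_required_params convention).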

        result_handler.info('Checking the ALBA proxies.', add_to_result=False)

        amount_of_presets_not_working = []
        # ignore possible subprocess output
        fnull = open(os.devnull, 'w')
        # try put/get/verify on all available proxies on the local node
        local_proxies = ServiceHelper.get_local_proxy_services()
        if len(local_proxies) == 0:
            result_handler.info('Found no proxies.', add_to_result=False)
            return amount_of_presets_not_working
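        # Cache the Framework API preset responses per ALBA Backend, so the safety lookup in the
        # wait loop below is only performed once per backend instead of once per iteration.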
        api_cache = {}
        for service in local_proxies:
            try:
                result_handler.info('Checking ALBA proxy {0}.'.format(service.name), add_to_result=False)
                ip = service.alba_proxy.storagedriver.storage_ip
                # Encapsulating try to determine test output
                try:
                    # Determine to which backend the proxy is connected
                    proxy_client_cfg = AlbaCLI.run(command='proxy-client-cfg', named_params={'host': ip, 'port': service.ports[0]})
                except AlbaException:
                    result_handler.failure('Fetching proxy info has failed. Please verify if {0}:{1} is the correct address for proxy {2}.'.format(ip, service.ports[0], service.name),
                                           code=ErrorCodes.alba_cmd_fail)
                    continue
                # Fetch arakoon information
                abm_name = proxy_client_cfg.get('cluster_id')
                # Check if proxy config is correctly setup
                if abm_name is None:
                    raise ConfigNotMatchedException('Proxy config for proxy {0} does not have the correct format on node {1} with port {2}.'.format(service.name, ip, service.ports[0]))
                abm_config = Configuration.get_configuration_path('/ovs/vpools/{0}/proxies/{1}/config/abm'.format(service.alba_proxy.storagedriver.vpool.guid, service.alba_proxy.guid))

                # Determine presets / backend
                try:
                    presets = AlbaCLI.run(command='list-presets', config=abm_config)
                except AlbaException:
                    result_handler.failure('Listing the presets has failed. Please check the arakoon config path. We used {0}'.format(abm_config),
                                           code=ErrorCodes.alba_cmd_fail)
                    continue

                for preset in presets:
                    # If preset is not in use, test will fail so add a skip
                    if preset['in_use'] is False:
                        result_handler.skip('Preset {0} is not in use and will not be checked'.format(preset['name']))
                        continue
                    preset_name = preset['name']
                    # Encapsulation try for cleanup
                    try:
                        # Generate new namespace name using the preset
                        namespace_key_prefix = 'ovs-healthcheck-ns-{0}-{1}'.format(preset_name, AlbaHealthCheck.LOCAL_ID)
                        namespace_key = '{0}_{1}'.format(namespace_key_prefix, uuid.uuid4())
                        object_key = 'ovs-healthcheck-obj-{0}'.format(str(uuid.uuid4()))
                        # Create namespace
                        AlbaCLI.run(command='proxy-create-namespace',
                                    named_params={'host': ip, 'port': service.ports[0]},
                                    extra_params=[namespace_key, preset_name])
                        # Wait until fully created
                        namespace_start_time = time.time()
                        for index in xrange(2):
                            # Running twice because the first run could give a false positive: the OSDs alert the NSM
                            # and the NSM responds that it got messages, but those might not be the ones we are after
                            AlbaCLI.run(command='deliver-messages', config=abm_config)
                        while True:
                            if time.time() - namespace_start_time > AlbaHealthCheck.NAMESPACE_TIMEOUT:
                                raise AlbaTimeOutException('Creating namespace has timed out after {0}s'.format(time.time() - namespace_start_time), 'deliver-messages')
                            list_ns_osds_output = AlbaCLI.run(command='list-ns-osds', config=abm_config, extra_params=[namespace_key])
                            # Example output: [[0, [u'Active']], [3, [u'Active']]]
                            namespace_ready = True
                            for osd_info in list_ns_osds_output:
                                if osd_info[1][0] != 'Active':
                                    # If we found an OSD not Active, check if preset is satisfiable
                                    namespace_ready = False
                                    break
                            if namespace_ready is True:
                                break
                            else:
                                result_handler.info('Not all OSDs have responded to the creation message. Fetching the safety', add_to_result=False)
                                try:
                                    # Fetch the preset information on the Framework
                                    # This adds an extra delay for the messages to propagate too
                                    vpool = service.alba_proxy.storagedriver.vpool
                                    alba_backend_guid = vpool.metadata['backend']['backend_info']['alba_backend_guid']
                                    api_url = 'alba/backends/{0}'.format(alba_backend_guid)
                                    if api_url not in api_cache:
                                        connection_info = vpool.metadata['backend']['backend_info']['connection_info']
                                        api_client = OVSClient(connection_info['host'], connection_info['port'], (connection_info['client_id'], connection_info['client_secret']))
                                        start = time.time()
                                        _presets = api_client.get(api_url, params={'contents': 'presets'})['presets']
                                        api_cache[api_url] = _presets
                                        result_handler.info('Fetching the safety took {0} seconds'.format(time.time() - start))
                                    _presets = api_cache[api_url]
                                    _preset = filter(lambda p: p['name'] == preset_name, _presets)[0]
                                    if _preset['is_available'] is True:
                                        # Preset satisfiable, don't care about osds availability
                                        result_handler.info('Requested preset is available, no longer waiting on \'deliver_messages\'', add_to_result=False)
                                        break
                                    else:
                                        raise ValueError('Requested preset is marked as unavailable. Please check the disk safety')
                                except ValueError:
                                    raise
                                except Exception:
                                    msg = 'Could not query the preset data. Checking the preset might time out'
                                    result_handler.warning(msg)
                                    cls.logger.exception(msg)
                                    # Sleep for syncing purposes
                                    time.sleep(1)
                        result_handler.success('Namespace successfully created on proxy {0} with preset {1}!'.format(service.name, preset_name),
                                               code=ErrorCodes.proxy_namespace_create)
                        namespace_info = AlbaCLI.run(command='show-namespace', config=abm_config, extra_params=[namespace_key])
                        ExtensionsToolbox.verify_required_params(required_params=namespace_params, actual_params=namespace_info)
                        result_handler.success('Namespace successfully fetched on proxy {0} with preset {1}!'.format(service.name, preset_name),
                                               code=ErrorCodes.proxy_namespace_fetch)

                        # Put test object to given dir
                        with open(AlbaHealthCheck.TEMP_FILE_LOC, 'wb') as output_file:
                            output_file.write(os.urandom(AlbaHealthCheck.TEMP_FILE_SIZE))
                        AlbaCLI.run(command='proxy-upload-object',
                                    named_params={'host': ip, 'port': service.ports[0]},
                                    extra_params=[namespace_key, AlbaHealthCheck.TEMP_FILE_LOC, object_key])
                        result_handler.success('Successfully uploaded the object to namespace {0}'.format(namespace_key),
                                               code=ErrorCodes.proxy_upload_obj)
                        # download object
                        AlbaCLI.run(command='proxy-download-object',
                                    named_params={'host': ip, 'port': service.ports[0]},
                                    extra_params=[namespace_key, object_key, AlbaHealthCheck.TEMP_FILE_FETCHED_LOC])
                        result_handler.success('Successfully downloaded the object to namespace {0}'.format(namespace_key),
                                               code=ErrorCodes.proxy_download_obj)
                        # check if files exists - issue #57
                        if not(os.path.isfile(AlbaHealthCheck.TEMP_FILE_FETCHED_LOC) and os.path.isfile(AlbaHealthCheck.TEMP_FILE_LOC)):
                            # creation of object failed
                            raise ObjectNotFoundException(ValueError('Creation of object has failed'))
                        hash_original = hashlib.md5(open(AlbaHealthCheck.TEMP_FILE_LOC, 'rb').read()).hexdigest()
                        hash_fetched = hashlib.md5(open(AlbaHealthCheck.TEMP_FILE_FETCHED_LOC, 'rb').read()).hexdigest()

                        if hash_original == hash_fetched:
                            result_handler.success('Fetched object {0} from namespace {1} on proxy {2} with preset {3} matches the created object!'.format(object_key, namespace_key, service.name, preset_name),
                                                   code=ErrorCodes.proxy_verify_obj)
                        else:
                            result_handler.failure('Fetched object {0} from namespace {1} on proxy {2} with preset {3} does not match the created object!'.format(object_key, namespace_key, service.name, preset_name),
                                                   code=ErrorCodes.proxy_verify_obj_fail)

                    except ValueError:
                        result_handler.failure('The preset is not available for use')
                    except ObjectNotFoundException as ex:
                        amount_of_presets_not_working.append(preset_name)
                        result_handler.failure('Failed to put object on namespace {0} on proxy {1} with preset {2}. Error: {3}'.format(namespace_key, service.name, preset_name, ex))
                    except AlbaTimeOutException as ex:
                        result_handler.failure(str(ex))
                    except AlbaException as ex:
                        code = ErrorCodes.alba_cmd_fail
                        if ex.alba_command == 'proxy-create-namespace':
                            result_handler.failure('Create namespace has failed with {0} on namespace {1} with proxy {2} with preset {3}'.format(str(ex), namespace_key, service.name, preset_name),
                                                   code=code)
                        elif ex.alba_command == 'show-namespace':
                            result_handler.failure('Show namespace has failed with {0} on namespace {1} with proxy {2} with preset {3}'.format(str(ex), namespace_key, service.name, preset_name),
                                                   code=code)
                        elif ex.alba_command == 'proxy-upload-object':
                            result_handler.failure('Uploading the object has failed with {0} on namespace {1} with proxy {2} with preset {3}'.format(str(ex), namespace_key, service.name, preset_name),
                                                   code=code)
                        elif ex.alba_command == 'proxy-download-object':
                            result_handler.failure('Downloading the object has failed with {0} on namespace {1} with proxy {2} with preset {3}'.format(str(ex), namespace_key, service.name, preset_name),
                                                   code=code)
                    finally:
                        # Clean up the temporary files and any created namespaces
                        subprocess.call(['rm', str(AlbaHealthCheck.TEMP_FILE_LOC)], stdout=fnull, stderr=subprocess.STDOUT)
                        subprocess.call(['rm', str(AlbaHealthCheck.TEMP_FILE_FETCHED_LOC)], stdout=fnull, stderr=subprocess.STDOUT)
                        try:
                            namespaces = AlbaCLI.run(command='list-namespaces', config=abm_config)
                            namespaces_to_remove = []
                            proxy_named_params = {'host': ip, 'port': service.ports[0]}
                            for namespace in namespaces:
                                if namespace['name'].startswith(namespace_key_prefix):
                                    namespaces_to_remove.append(namespace['name'])
                            for namespace_name in namespaces_to_remove:
                                if namespace_name == namespace_key:
                                    result_handler.info('Deleting namespace {0}.'.format(namespace_name))
                                else:
                                    result_handler.warning('Deleting namespace {0} which was leftover from a previous run.'.format(namespace_name))

                                AlbaCLI.run(command='proxy-delete-namespace',
                                            named_params=proxy_named_params,
                                            extra_params=[namespace_name])

                                namespace_delete_start = time.time()
                                while True:
                                    try:
                                        AlbaCLI.run(command='show-namespace', config=abm_config, extra_params=[namespace_name])  # Will fail if the namespace does not exist
                                    except AlbaException:
                                        result_handler.success('Namespace {0} successfully removed.'.format(namespace_name))
                                        break
                                    if time.time() - namespace_delete_start > AlbaHealthCheck.NAMESPACE_TIMEOUT:
                                        raise AlbaTimeOutException('Delete namespace has timed out after {0}s'.format(time.time() - namespace_delete_start), 'show-namespace')

                                # be tidy, and make the proxy forget the namespace
                                try:
                                    AlbaCLI.run(command='proxy-statistics',
                                                named_params=proxy_named_params,
                                                extra_params=['--forget', namespace_name])
                                except Exception:
                                    result_handler.warning('Failed to make proxy forget namespace {0}.'.format(namespace_name))
                        except AlbaException as ex:
                            if ex.alba_command == 'list-namespaces':
                                result_handler.failure(
                                    'Listing namespaces has failed with {0} on namespace {1} with proxy {2} with preset {3}'.format(
                                        str(ex), namespace_key, service.name, preset_name))
                            elif ex.alba_command == 'proxy-delete-namespace':
                                result_handler.failure(
                                    'Delete namespace has failed with {0} on namespace {1} with proxy {2} with preset {3}'.format(
                                        str(ex), namespace_key, service.name, preset_name))

            except subprocess.CalledProcessError as ex:
                # this should stay for the deletion of the remaining files
                amount_of_presets_not_working.append(service.name)
                result_handler.failure('Proxy {0} has some problems. Got {1} as error'.format(service.name, ex),
                                       code=ErrorCodes.proxy_problems)

            except ConfigNotMatchedException as ex:
                amount_of_presets_not_working.append(service.name)
                result_handler.failure('Proxy {0} has some problems. Got {1} as error'.format(service.name, ex),
                                       code=ErrorCodes.proxy_problems)
    def get_disk_safety(cls, result_handler):
        """
        Fetch safety of every namespace in every backend
        - amount_in_bucket is in %
        - max_disk_safety is the max. key that should be available in current_disk_safety
        Output example: {'mybackend02': {'1,2': {'max_disk_safety': 2, 'current_disk_safety':
        {2: {'namespace': u'b4eef27e-ef54-4fe8-8658-cdfbda7ceae4_000000065', 'amount_in_bucket': 100}}}}, 'mybackend':
        {'1,2': {'max_disk_safety': 2, 'current_disk_safety':
        {2: {'namespace': u'b4eef27e-ef54-4fe8-8658-cdfbda7ceae4_000000065', 'amount_in_bucket': 100}}}},
        'mybackend-global': {'1,2': {'max_disk_safety': 2, 'current_disk_safety':
        {1: {'namespace': u'e88c88c9-632c-4975-b39f-e9993e352560', 'amount_in_bucket': 100}}}}}
        :param result_handler: logging object
        :type result_handler: ovs.extensions.healthcheck.result.HCResults
        :return: Safety of every namespace in every backend
        :rtype: dict
        """
        disk_safety_overview = {}
        for alba_backend in BackendHelper.get_albabackends():
            disk_safety_overview[alba_backend.name] = {}
            config = Configuration.get_configuration_path(
                'ovs/arakoon/{0}-abm/config'.format(alba_backend.name))
            # Fetch alba info
            try:
                # @TODO add this to extra_params to include corrupt asds. Currently there is a bug with it
                # Ticket: https://github.com/openvstorage/alba/issues/441
                # extra_params=['--include-errored-as-dead']
                namespaces = AlbaCLI.run(command='get-disk-safety',
                                         config=config)
                cache_eviction_prefix_preset_pairs = AlbaCLI.run(
                    command='get-maintenance-config',
                    config=config)['cache_eviction_prefix_preset_pairs']
                presets = AlbaCLI.run(command='list-presets', config=config)
            except AlbaException as ex:
                result_handler.exception('Could not fetch alba information for backend {0}. Message: {1}'.format(alba_backend.name, ex))
                # Do not execute further
                continue

            # collect in_use presets & their policies
            for preset in presets:
                if not preset['in_use']:
                    continue
                for policy in preset['policies']:
                    disk_safety_overview[alba_backend.name]['{0},{1}'.format(
                        str(policy[0]), str(policy[1]))] = {
                            'current_disk_safety': {},
                            'max_disk_safety': policy[1]
                        }

            # collect namespaces
            ignorable_namespaces = [
                cls.BASE_NAMESPACE_KEY
            ] + cache_eviction_prefix_preset_pairs.keys()
            test_worthy_namespaces = (item for item in namespaces
                                      if not item['namespace'].startswith(
                                          tuple(ignorable_namespaces)))
            for namespace in test_worthy_namespaces:
                # calc total objects in namespace
                total_count = 0
                for bucket_safety in namespace['bucket_safety']:
                    total_count += bucket_safety['count']

                for bucket_safety in namespace['bucket_safety']:
                    # calc safety bucket
                    calculated_disk_safety = bucket_safety['remaining_safety']
                    safety = '{0},{1}'.format(str(bucket_safety['bucket'][0]), str(bucket_safety['bucket'][1]))
                    current_disk_safety = disk_safety_overview[alba_backend.name][safety]['current_disk_safety']
                    to_be_added_namespace = {'namespace': namespace['namespace'],
                                             'amount_in_bucket': "%.5f" % (float(bucket_safety['count']) / float(total_count) * 100)}
                    if calculated_disk_safety in current_disk_safety:
                        current_disk_safety[calculated_disk_safety].append(
                            to_be_added_namespace)
                    else:
                        current_disk_safety[calculated_disk_safety] = [
                            to_be_added_namespace
                        ]
        return disk_safety_overview
    def check_model_consistency(result_handler):
        """
        Checks the model consistency of OVSDB vs. VOLUMEDRIVER and does a preliminary check on RABBITMQ
        :param result_handler: logging object
        :type result_handler: ovs.extensions.healthcheck.result.HCResults
        :return: None
        :rtype: NoneType
        """
        result_handler.info('Checking model consistency: ')

        # Checking consistency of volumedriver vs. ovsdb and backwards
        for vp in VPoolHelper.get_vpools():
            if vp.guid not in OpenvStorageHealthCheck.LOCAL_SR.vpools_guids:
                result_handler.skip(
                    'Skipping vPool {0} because it is not living here.'.format(
                        vp.name))
                continue
            result_handler.info(
                'Checking consistency of volumedriver vs. ovsdb for {0}: '.
                format(vp.name),
                add_to_result=False)
            missing_in_volumedriver = []
            missing_in_model = []
            config_file = Configuration.get_configuration_path(
                '/ovs/vpools/{0}/hosts/{1}/config'.format(
                    vp.guid, vp.storagedrivers[0].name))
            try:
                voldrv_client = src.LocalStorageRouterClient(config_file)
                # noinspection PyArgumentList
                voldrv_volume_list = voldrv_client.list_volumes()
            except (ClusterNotReachableException, RuntimeError) as ex:
                result_handler.warning(
                    'Seems like the volumedriver {0} is not running. Got {1}'.
                    format(vp.name, ex.message))
                continue

            vdisk_volume_ids = []
            # cross-reference model vs. volumedriver
            for vdisk in vp.vdisks:
                vdisk_volume_ids.append(vdisk.volume_id)
                if vdisk.volume_id not in voldrv_volume_list:
                    missing_in_volumedriver.append(vdisk.guid)
                else:
                    voldrv_volume_list.remove(vdisk.volume_id)
            # cross-reference volumedriver vs. model
            for voldrv_id in voldrv_volume_list:
                if voldrv_id not in vdisk_volume_ids:
                    missing_in_model.append(voldrv_id)

            # display discrepancies for vPool
            if len(missing_in_volumedriver) != 0:
                result_handler.warning(
                    'Detected volumes that are MISSING in volumedriver but are in ovsdb in vpool: {0} - vdisk guid(s):{1}.'
                    .format(vp.name, ' '.join(missing_in_volumedriver)))
            else:
                result_handler.success(
                    'No discrepancies found for ovsdb in vPool {0}'.format(
                        vp.name))

            if len(missing_in_model) != 0:
                result_handler.warning(
                    'Detected volumes that are AVAILABLE in volumedriver but are not in ovsdb in vpool: {0} - vdisk volume id(s):{1}'
                    .format(vp.name, ', '.join(missing_in_model)))
            else:
                result_handler.success(
                    'No discrepancies found for voldrv in vpool {0}'.format(
                        vp.name))
    def cluster_registry_checkup():
        """
        Verify whether changes have occurred in the cluster registry for each vPool
        :return: Information whether changes occurred
        :rtype: dict
        """
        changed_vpools = {}
        for vpool in VPoolList.get_vpools():
            changed_vpools[vpool.guid] = {'changes': False,
                                          'success': True}
            try:
                StorageDriverController._logger.info('Validating cluster registry settings for Vpool {0}'.format(vpool.guid))

                current_configs = vpool.clusterregistry_client.get_node_configs()
                changes = len(current_configs) == 0
                node_configs = []
                for sd in vpool.storagedrivers:
                    sd.invalidate_dynamics(['cluster_node_config'])
                    new_config = sd.cluster_node_config
                    node_configs.append(ClusterNodeConfig(**new_config))
                    if changes is False:
                        current_node_configs = [config for config in current_configs if config.vrouter_id == sd.storagedriver_id]
                        if len(current_node_configs) == 1:
                            current_node_config = current_node_configs[0]
                            for key in new_config:
                                if getattr(current_node_config, key) != new_config[key]:
                                    changes = True
                                    break
                changed_vpools[vpool.guid]['changes'] = changes

                if changes is True:
                    StorageDriverController._logger.info('Cluster registry settings for Vpool {0} needs to be updated'.format(vpool.guid))
                    available_storagedrivers = []
                    for sd in vpool.storagedrivers:
                        storagerouter = sd.storagerouter
                        try:
                            SSHClient(storagerouter, username='******')
                            with remote(storagerouter.ip, [LocalStorageRouterClient]) as rem:
                                sd_key = '/ovs/vpools/{0}/hosts/{1}/config'.format(vpool.guid, sd.storagedriver_id)
                                if Configuration.exists(sd_key) is True:
                                    path = Configuration.get_configuration_path(sd_key)
                                    lsrc = rem.LocalStorageRouterClient(path)
                                    lsrc.server_revision()  # 'Cheap' call to verify whether volumedriver is responsive
                                    available_storagedrivers.append(sd)
                        except UnableToConnectException:
                            StorageDriverController._logger.warning('StorageRouter {0} not available.'.format(storagerouter.name))
                        except Exception as ex:
                            if 'ClusterNotReachableException' in str(ex):
                                StorageDriverController._logger.warning('StorageDriver {0} on StorageRouter {1} not available.'.format(
                                    sd.guid, storagerouter.name
                                ))
                            else:
                                StorageDriverController._logger.exception('Got exception when validating StorageDriver {0} on StorageRouter {1}.'.format(
                                    sd.guid, storagerouter.name
                                ))

                    StorageDriverController._logger.info('Updating cluster node configs for VPool {0}'.format(vpool.guid))
                    vpool.clusterregistry_client.set_node_configs(node_configs)
                    for sd in available_storagedrivers:
                        StorageDriverController._logger.info('Trigger config reload for StorageDriver {0}'.format(sd.guid))
                        vpool.storagedriver_client.update_cluster_node_configs(str(sd.storagedriver_id), req_timeout_secs=10)
                    StorageDriverController._logger.info('Updating cluster node configs for Vpool {0} completed'.format(vpool.guid))
                else:
                    StorageDriverController._logger.info('Cluster registry settings for vPool {0} are up to date'.format(vpool.guid))
            except Exception as ex:
                StorageDriverController._logger.exception('Got exception when validating cluster registry settings for Vpool {0}.'.format(vpool.name))
                changed_vpools[vpool.guid]['success'] = False
                changed_vpools[vpool.guid]['error'] = ex.message
        return changed_vpools
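The returned dictionary maps each vPool guid to a small status record ('changes', 'success' and, on failure, 'error'). A minimal consumer sketch, illustrative only and assuming the method is exposed on StorageDriverController as the logger usage suggests:

# Illustrative sketch: reporting on the result of cluster_registry_checkup().
result = StorageDriverController.cluster_registry_checkup()
for vpool_guid, info in result.iteritems():
    if info['success'] is False:
        print('vPool {0}: checkup failed with error: {1}'.format(vpool_guid, info.get('error')))
    elif info['changes'] is True:
        print('vPool {0}: cluster registry has been updated'.format(vpool_guid))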
    def _presets(self):
        """
        Returns the preset information for this ALBA Backend, including per-policy availability and usage flags
        """
        if self.abm_cluster is None:
            return []  # No ABM cluster yet, so backend not fully installed yet

        osds = {}
        if self.scaling != AlbaBackend.SCALINGS.GLOBAL:
            for node_id, slots in self.local_stack.iteritems():
                osds[node_id] = 0
                for slot_id, slot_data in slots.iteritems():
                    for osd_id, osd_data in slot_data['osds'].iteritems():
                        if osd_data['status'] in [
                                AlbaNode.OSD_STATUSES.OK,
                                AlbaNode.OSD_STATUSES.WARNING
                        ] and osd_data.get('claimed_by') == self.guid:
                            osds[node_id] += 1
        config = Configuration.get_configuration_path(
            self.abm_cluster.config_location)
        presets = AlbaCLI.run(command='list-presets', config=config)
        preset_dict = {}
        for preset in presets:
            preset_dict[preset['name']] = preset
            if 'in_use' not in preset:
                preset['in_use'] = True
            if 'is_default' not in preset:
                preset['is_default'] = False
            preset['is_available'] = False
            preset['policies'] = [
                tuple(policy) for policy in preset['policies']
            ]
            preset['policy_metadata'] = {}
            active_policy = None
            for policy in preset['policies']:
                is_available = False
                available_disks = 0
                if self.scaling == AlbaBackend.SCALINGS.GLOBAL:
                    available_disks += sum(
                        self.local_summary['devices'].values())
                if self.scaling == AlbaBackend.SCALINGS.LOCAL:
                    available_disks += sum(
                        min(osds[node], policy[3]) for node in osds)
                if available_disks >= policy[2]:
                    if active_policy is None:
                        active_policy = policy
                    is_available = True
                preset['policy_metadata'][policy] = {
                    'is_active': False,
                    'in_use': False,
                    'is_available': is_available
                }
                preset['is_available'] |= is_available
            if active_policy is not None:
                preset['policy_metadata'][active_policy]['is_active'] = True
        for namespace in self.ns_data:
            if namespace['namespace']['state'] != 'active':
                continue
            policy_usage = namespace['statistics']['bucket_count']
            preset = preset_dict[namespace['namespace']['preset_name']]
            for usage in policy_usage:
                used_policy = tuple(usage[0])  # Policy as reported to be "in use"
                for configured_policy in preset['policies']:  # All configured policies
                    if used_policy[0] == configured_policy[0] and used_policy[1] == configured_policy[1] and used_policy[3] <= configured_policy[3]:
                        preset['policy_metadata'][configured_policy]['in_use'] = True
                        break
        for preset in presets:
            preset['policies'] = [str(policy) for policy in preset['policies']]
            for key in preset['policy_metadata'].keys():
                preset['policy_metadata'][str(key)] = preset['policy_metadata'][key]
                del preset['policy_metadata'][key]
        return presets
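Callers typically consume this structure through the backend's `presets` dynamic rather than by calling _presets directly. A minimal filtering sketch, illustrative only and assuming `alba_backend` is an AlbaBackend instance:

# Illustrative sketch: selecting presets that can currently satisfy at least one of their policies.
available_presets = [preset['name'] for preset in alba_backend.presets if preset['is_available'] is True]
default_preset = next((preset['name'] for preset in alba_backend.presets if preset['is_default'] is True), None)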
    def start_services(self):
        """
        Start all services related to the Storagedriver
        :return: None
        :rtype: NoneType
        """
        if self.sr_installer is None:
            raise RuntimeError('No StorageRouterInstaller instance found')

        vpool = self.vp_installer.vpool
        root_client = self.sr_installer.root_client
        storagerouter = self.sr_installer.storagerouter
        alba_pkg_name, alba_version_cmd = PackageFactory.get_package_and_version_cmd_for(component=PackageFactory.COMP_ALBA)
        voldrv_pkg_name, voldrv_version_cmd = PackageFactory.get_package_and_version_cmd_for(component=PackageFactory.COMP_SD)

        # Add/start watcher volumedriver service
        if not self.service_manager.has_service(name=ServiceFactory.SERVICE_WATCHER_VOLDRV, client=root_client):
            self.service_manager.add_service(name=ServiceFactory.SERVICE_WATCHER_VOLDRV, client=root_client)
            self.service_manager.start_service(name=ServiceFactory.SERVICE_WATCHER_VOLDRV, client=root_client)

        # Add/start DTL service
        self.service_manager.add_service(name=self.SERVICE_TEMPLATE_DTL,
                                         params={'DTL_PATH': self.storagedriver_partition_dtl.path,
                                                 'DTL_ADDRESS': self.storagedriver.storage_ip,
                                                 'DTL_PORT': str(self.storagedriver.ports['dtl']),
                                                 'DTL_TRANSPORT': StorageDriverClient.VPOOL_DTL_TRANSPORT_MAP[self.dtl_transport],
                                                 'LOG_SINK': Logger.get_sink_path('storagedriver-dtl_{0}'.format(self.storagedriver.storagedriver_id)),
                                                 'VOLDRV_PKG_NAME': voldrv_pkg_name,
                                                 'VOLDRV_VERSION_CMD': voldrv_version_cmd},
                                         client=root_client,
                                         target_name=self.dtl_service)
        self.service_manager.start_service(name=self.dtl_service, client=root_client)

        # Add/start ALBA proxy services
        for proxy in self.storagedriver.alba_proxies:
            alba_proxy_service = 'ovs-{0}'.format(proxy.service.name)
            self.service_manager.add_service(name=self.SERVICE_TEMPLATE_PROXY,
                                             params={'VPOOL_NAME': vpool.name,
                                                     'LOG_SINK': Logger.get_sink_path(proxy.service.name),
                                                     'CONFIG_PATH': Configuration.get_configuration_path('/ovs/vpools/{0}/proxies/{1}/config/main'.format(vpool.guid, proxy.guid)),
                                                     'ALBA_PKG_NAME': alba_pkg_name,
                                                     'ALBA_VERSION_CMD': alba_version_cmd},
                                             client=root_client,
                                             target_name=alba_proxy_service)
            self.service_manager.start_service(name=alba_proxy_service, client=root_client)

        # Add/start StorageDriver service
        self.service_manager.add_service(name=self.SERVICE_TEMPLATE_SD,
                                         params={'KILL_TIMEOUT': '30',
                                                 'VPOOL_NAME': vpool.name,
                                                 'VPOOL_MOUNTPOINT': self.storagedriver.mountpoint,
                                                 'CONFIG_PATH': StorageDriverConfiguration(vpool_guid=vpool.guid, storagedriver_id=self.storagedriver.storagedriver_id).remote_path,
                                                 'OVS_UID': root_client.run(['id', '-u', 'ovs']).strip(),
                                                 'OVS_GID': root_client.run(['id', '-g', 'ovs']).strip(),
                                                 'LOG_SINK': Logger.get_sink_path('storagedriver_{0}'.format(self.storagedriver.storagedriver_id)),
                                                 'VOLDRV_PKG_NAME': voldrv_pkg_name,
                                                 'VOLDRV_VERSION_CMD': voldrv_version_cmd,
                                                 'METADATASTORE_BITS': 5},
                                         client=root_client,
                                         target_name=self.sd_service)

        current_startup_counter = self.storagedriver.startup_counter
        self.service_manager.start_service(name=self.sd_service, client=root_client)

        tries = 60
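        # Wait for the startup event: the startup counter only changes once the volumedriver reports itself,
        # and the sleep time grows each attempt (1s, 2s, 3s, ...) as a simple linear back-off within the 60 tries.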
        while self.storagedriver.startup_counter == current_startup_counter and tries > 0:
            self._logger.debug('Waiting for the StorageDriver to start up for vPool {0} on StorageRouter {1} ...'.format(vpool.name, storagerouter.name))
            if self.service_manager.get_service_status(name=self.sd_service, client=root_client) != 'active':
                raise RuntimeError('StorageDriver service failed to start (service not running)')
            tries -= 1
            time.sleep(60 - tries)
            self.storagedriver.discard()
        if self.storagedriver.startup_counter == current_startup_counter:
            raise RuntimeError('StorageDriver service failed to start (got no event)')
        self._logger.debug('StorageDriver running')
    def stop_services(self):
        """
        Stop all services related to the Storagedriver
        :return: A boolean indicating whether something went wrong
        :rtype: bool
        """
        if self.sr_installer is None:
            raise RuntimeError('No StorageRouterInstaller instance found')

        root_client = self.sr_installer.root_client
        errors_found = False

        for service in [self.sd_service, self.dtl_service]:
            try:
                if self.service_manager.has_service(name=service, client=root_client):
                    self._logger.debug('StorageDriver {0} - Stopping service {1}'.format(self.storagedriver.guid, service))
                    self.service_manager.stop_service(name=service, client=root_client)
                    self._logger.debug('StorageDriver {0} - Removing service {1}'.format(self.storagedriver.guid, service))
                    self.service_manager.remove_service(name=service, client=root_client)
            except Exception:
                self._logger.exception('StorageDriver {0} - Disabling/stopping service {1} failed'.format(self.storagedriver.guid, service))
                errors_found = True

        sd_config_key = '/ovs/vpools/{0}/hosts/{1}/config'.format(self.vp_installer.vpool.guid, self.storagedriver.storagedriver_id)
        if self.vp_installer.storagedriver_amount <= 1 and Configuration.exists(sd_config_key):
            try:
                for proxy in self.storagedriver.alba_proxies:
                    if self.service_manager.has_service(name=proxy.service.name, client=root_client):
                        self._logger.debug('StorageDriver {0} - Starting proxy {1}'.format(self.storagedriver.guid, proxy.service.name))
                        self.service_manager.start_service(name=proxy.service.name, client=root_client)
                        tries = 10
                        running = False
                        port = proxy.service.ports[0]
                        while running is False and tries > 0:
                            self._logger.debug('StorageDriver {0} - Waiting for the proxy {1} to start up'.format(self.storagedriver.guid, proxy.service.name))
                            tries -= 1
                            time.sleep(10 - tries)
                            try:
                                root_client.run(['alba', 'proxy-statistics', '--host', self.storagedriver.storage_ip, '--port', str(port)])
                                running = True
                            except CalledProcessError as ex:
                                self._logger.error('StorageDriver {0} - Fetching alba proxy-statistics failed with error (but ignoring): {1}'.format(self.storagedriver.guid, ex))
                        if running is False:
                            raise RuntimeError('Alba proxy {0} failed to start'.format(proxy.service.name))
                        self._logger.debug('StorageDriver {0} - Alba proxy {1} running'.format(self.storagedriver.guid, proxy.service.name))

                self._logger.debug('StorageDriver {0} - Destroying filesystem and erasing node configs'.format(self.storagedriver.guid))
                with remote(root_client.ip, [LocalStorageRouterClient], username='******') as rem:
                    path = Configuration.get_configuration_path(sd_config_key)
                    storagedriver_client = rem.LocalStorageRouterClient(path)
                    try:
                        storagedriver_client.destroy_filesystem()
                    except RuntimeError as rte:
                        # If backend has already been deleted, we cannot delete the filesystem anymore --> storage leak!!!
                        if 'MasterLookupResult.Error' not in rte.message:
                            raise

                self.vp_installer.vpool.clusterregistry_client.erase_node_configs()
            except RuntimeError:
                self._logger.exception('StorageDriver {0} - Destroying filesystem and erasing node configs failed'.format(self.storagedriver.guid))
                errors_found = True

        for proxy in self.storagedriver.alba_proxies:
            service_name = proxy.service.name
            try:
                if self.service_manager.has_service(name=service_name, client=root_client):
                    self._logger.debug('StorageDriver {0} - Stopping service {1}'.format(self.storagedriver.guid, service_name))
                    self.service_manager.stop_service(name=service_name, client=root_client)
                    self._logger.debug('StorageDriver {0} - Removing service {1}'.format(self.storagedriver.guid, service_name))
                    self.service_manager.remove_service(name=service_name, client=root_client)
            except Exception:
                self._logger.exception('StorageDriver {0} - Disabling/stopping service {1} failed'.format(self.storagedriver.guid, service_name))
                errors_found = True

        return errors_found
    def get_stats_vdisks(cls):
        """
        Retrieve statistics about all vDisks on the system.
        Check the safety, storage amount on the Backend, fail-over status and others
        """
        if cls._config is None:
            cls.validate_and_retrieve_config()

        stats = []
        errors = False
        environment = cls._config['environment']
        alba_backend_info = {}
        for alba_backend in AlbaBackendList.get_albabackends():
            config_path = Configuration.get_configuration_path(
                alba_backend.abm_cluster.config_location)
            disk_safety = {}
            namespace_usage = {}

            # Retrieve namespace, preset and disk safety information
            try:
                preset_info = AlbaCLI.run(
                    command='list-presets', config=config_path
                )  # Not using alba_backend.presets, because it takes a whole lot longer to retrieve
                all_namespace_info = AlbaCLI.run(command='show-namespaces',
                                                 config=config_path,
                                                 extra_params=['--max=-1'])[1]
                all_disk_safety_info = AlbaCLI.run(command='get-disk-safety',
                                                   config=config_path)
            except Exception:
                errors = True
                cls._logger.exception(
                    'Retrieving information for ALBA Backend {0} failed'.
                    format(alba_backend.name))
                continue

            alba_backend_info[alba_backend.guid] = {
                'disk_safety': disk_safety,
                'namespace_usage': namespace_usage
            }

            # Parse namespace information
            for namespace_info in all_namespace_info:
                namespace_usage[namespace_info['name']] = float(
                    namespace_info['statistics']['storage'])

            # Parse preset information
            policies = []
            preset_name = None
            for preset in preset_info:
                if preset['in_use'] is not True:
                    continue
                preset_name = preset['name']
                policies.extend(preset['policies'])
            if preset_name is None:
                continue

            # Parse disk safety information
            total_objects = 0
            max_lost_disks = 0
            max_disk_safety = 0
            bucket_overview = {}
            disk_lost_overview = {}
            disk_safety_overview = {}
            for disk_safety_info in all_disk_safety_info:
                safety = disk_safety_info['safety']
                volume_id = disk_safety_info['namespace']
                disk_safety[volume_id] = float(
                    safety) if safety is not None else safety

                for bucket_safety in disk_safety_info['bucket_safety']:
                    bucket = bucket_safety['bucket']
                    objects = bucket_safety['count']
                    remaining_safety = bucket_safety['remaining_safety']

                    if bucket[1] > max_lost_disks:
                        max_lost_disks = bucket[1]
                    if remaining_safety > max_disk_safety:
                        max_disk_safety = remaining_safety

                    for policy in policies:
                        k = policy[0] == bucket[0]
                        m = policy[1] == bucket[1]
                        c = policy[2] <= bucket[2]
                        x = policy[3] >= bucket[3]
                        if k and m and c and x:
                            if preset_name not in bucket_overview:
                                bucket_overview[preset_name] = {
                                    'policy': str(policy),
                                    'presets': {}
                                }

                    bucket[2] -= bucket_safety['applicable_dead_osds']
                    if str(bucket) not in bucket_overview[preset_name]['presets']:
                        bucket_overview[preset_name]['presets'][str(bucket)] = {'objects': 0,
                                                                                'disk_safety': 0}

                    disk_lost = bucket[0] + bucket[1] - bucket[2]  # Data fragments + parity fragments - amount of fragments to write + dead osds
                    if disk_lost not in disk_lost_overview:
                        disk_lost_overview[disk_lost] = 0
                    if remaining_safety not in disk_safety_overview:
                        disk_safety_overview[remaining_safety] = 0

                    total_objects += objects
                    disk_lost_overview[disk_lost] += objects
                    disk_safety_overview[remaining_safety] += objects
                    bucket_overview[preset_name]['presets'][str(bucket)]['objects'] += objects
                    bucket_overview[preset_name]['presets'][str(bucket)]['disk_safety'] = remaining_safety

            # Create statistics regarding disk safety
            for disk_lost_number in xrange(max_lost_disks + 1):
                stats.append({
                    'tags': {
                        'disk_lost': disk_lost_number,
                        'environment': environment,
                        'backend_name': alba_backend.name
                    },
                    'fields': {
                        'objects': disk_lost_overview.get(disk_lost_number, 0),
                        'total_objects': total_objects
                    },
                    'measurement': 'disk_lost'
                })

            for disk_safety_number in xrange(max_disk_safety + 1):
                stats.append({
                    'tags': {
                        'disk_safety': disk_safety_number,
                        'environment': environment,
                        'backend_name': alba_backend.name
                    },
                    'fields': {
                        'objects':
                        disk_safety_overview.get(disk_safety_number, 0),
                        'total_objects': total_objects
                    },
                    'measurement': 'disk_safety'
                })

            for preset_name, result in bucket_overview.iteritems():
                for bucket_count, bucket_result in result['presets'].iteritems():
                    stats.append({
                        'tags': {
                            'bucket': bucket_count,
                            'policy': result['policy'],
                            'preset_name': preset_name,
                            'environment': environment,
                            'disk_safety': bucket_result['disk_safety'],
                            'backend_name': alba_backend.name
                        },
                        'fields': {
                            'objects': bucket_result['objects'],
                            'total_objects': total_objects
                        },
                        'measurement': 'bucket'
                    })

        # Integrate namespace and disk safety information in vPool stats
        for vpool in VPoolList.get_vpools():
            alba_backend_guid = vpool.metadata['backend']['backend_info'][
                'alba_backend_guid']
            for vdisk in vpool.vdisks:
                try:
                    metrics = cls._convert_to_float_values(
                        cls._pop_realtime_info(vdisk.statistics))
                    metrics['failover_mode'] = vdisk.dtl_status
                    metrics['frontend_size'] = float(vdisk.size)
                    metrics['failover_mode_status'] = cls._FAILOVER_MAP.get(
                        vdisk.dtl_status, 3)
                    if alba_backend_guid in alba_backend_info:
                        metrics['disk_safety'] = alba_backend_info[
                            alba_backend_guid]['disk_safety'].get(
                                vdisk.volume_id)
                        metrics['backend_stored'] = alba_backend_info[
                            alba_backend_guid]['namespace_usage'].get(
                                vdisk.volume_id)

                    stats.append({'tags': {'disk_name': vdisk.name,
                                           'volume_id': vdisk.volume_id,
                                           'vpool_name': vdisk.vpool.name,
                                           'environment': environment,
                                           'storagerouter_name': StorageRouter(vdisk.storagerouter_guid).name},
                                  'fields': metrics,
                                  'measurement': 'vdisk'})
                except Exception:
                    errors = True
                    cls._logger.exception(
                        'Retrieving statistics for vDisk {0} with guid {1} failed'
                        .format(vdisk.name, vdisk.guid))
        return errors, stats
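Each entry in `stats` uses the measurement/tags/fields layout understood by time-series clients such as influxdb-python. A minimal push sketch, illustrative only; the client settings and the enclosing class name are assumptions, not part of the original code:

# Illustrative sketch: writing the collected points to InfluxDB.
from influxdb import InfluxDBClient

client = InfluxDBClient(host='localhost', port=8086, database='openvstorage_stats')  # assumed settings
errors, stats = StatsController.get_stats_vdisks()  # hypothetical name for the enclosing class
if errors is False:
    client.write_points(stats)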
    def shrink_vpool(cls,
                     storagedriver_guid,
                     offline_storage_router_guids=list()):
        """
        Removes a StorageDriver (if it's the last StorageDriver for a vPool, the vPool is removed as well)
        :param storagedriver_guid: Guid of the StorageDriver to remove
        :type storagedriver_guid: str
        :param offline_storage_router_guids: Guids of StorageRouters which are offline and will be removed from the cluster.
                                             Whether the vPool will be deleted depends on this
        :type offline_storage_router_guids: list
        :return: None
        :rtype: NoneType
        """
        # TODO: Add logging
        # TODO: Unit test individual pieces of code
        # Validations
        storagedriver = StorageDriver(storagedriver_guid)
        storagerouter = storagedriver.storagerouter
        cls._logger.info(
            'StorageDriver {0} - Deleting StorageDriver {1}'.format(
                storagedriver.guid, storagedriver.name))

        vp_installer = VPoolInstaller(name=storagedriver.vpool.name)
        vp_installer.validate(storagedriver=storagedriver)

        sd_installer = StorageDriverInstaller(vp_installer=vp_installer,
                                              storagedriver=storagedriver)

        cls._logger.info(
            'StorageDriver {0} - Checking availability of related StorageRouters'
            .format(storagedriver.guid))
        sr_client_map = SSHClient.get_clients(endpoints=[sd.storagerouter for sd in vp_installer.vpool.storagedrivers],
                                              user_names=['root'])
        sr_installer = StorageRouterInstaller(root_client=sr_client_map.get(storagerouter, {}).get('root'),
                                              storagerouter=storagerouter,
                                              vp_installer=vp_installer,
                                              sd_installer=sd_installer)

        offline_srs = sr_client_map.pop('offline')
        if sorted([sr.guid for sr in offline_srs]) != sorted(offline_storage_router_guids):
            raise RuntimeError('Not all StorageRouters are reachable')

        if storagerouter not in offline_srs:
            mtpt_pids = sr_installer.root_client.run(
                "lsof -t +D '/mnt/{0}' || true".format(
                    vp_installer.name.replace(r"'", r"'\''")),
                allow_insecure=True).splitlines()
            if len(mtpt_pids) > 0:
                raise RuntimeError(
                    'vPool cannot be deleted. Following processes keep the vPool mount point occupied: {0}'
                    .format(', '.join(mtpt_pids)))

        # Retrieve reachable StorageDrivers
        reachable_storagedrivers = []
        for sd in vp_installer.vpool.storagedrivers:
            if sd.storagerouter not in sr_client_map:
                # StorageRouter is offline
                continue

            sd_key = '/ovs/vpools/{0}/hosts/{1}/config'.format(
                vp_installer.vpool.guid, sd.storagedriver_id)
            if Configuration.exists(sd_key) is True:
                path = Configuration.get_configuration_path(sd_key)
                with remote(sd.storagerouter.ip,
                            [LocalStorageRouterClient]) as rem:
                    try:
                        lsrc = rem.LocalStorageRouterClient(path)
                        lsrc.server_revision(
                        )  # 'Cheap' call to verify whether volumedriver is responsive
                        cls._logger.info(
                            'StorageDriver {0} - Responsive StorageDriver {1} on node with IP {2}'
                            .format(storagedriver.guid, sd.name,
                                    sd.storagerouter.ip))
                        reachable_storagedrivers.append(sd)
                    except Exception as exception:
                        if not is_connection_failure(exception):
                            raise

        if len(reachable_storagedrivers) == 0:
            raise RuntimeError(
                'Could not find any responsive node in the cluster')

        # Start removal
        if vp_installer.storagedriver_amount > 1:
            vp_installer.update_status(status=VPool.STATUSES.SHRINKING)
        else:
            vp_installer.update_status(status=VPool.STATUSES.DELETING)

        # Clean up stale vDisks
        cls._logger.info('StorageDriver {0} - Removing stale vDisks'.format(
            storagedriver.guid))
        VDiskController.remove_stale_vdisks(vpool=vp_installer.vpool)

        # Reconfigure the MDSes
        cls._logger.info('StorageDriver {0} - Reconfiguring MDSes'.format(
            storagedriver.guid))
        for vdisk_guid in storagerouter.vdisks_guids:
            try:
                MDSServiceController.ensure_safety(
                    vdisk_guid=vdisk_guid,
                    excluded_storagerouter_guids=[storagerouter.guid] +
                    offline_storage_router_guids)
            except Exception:
                cls._logger.exception(
                    'StorageDriver {0} - vDisk {1} - Ensuring MDS safety failed'
                    .format(storagedriver.guid, vdisk_guid))

        # Validate that all MDSes on current StorageRouter have been moved away
        # Ensure safety does not always throw an error, that's why we perform this check here instead of in the Exception clause of above code
        vdisks = []
        for mds in vp_installer.mds_services:
            for junction in mds.vdisks:
                vdisk = junction.vdisk
                if vdisk in vdisks:
                    continue
                vdisks.append(vdisk)
                cls._logger.critical(
                    'StorageDriver {0} - vDisk {1} {2} - MDS Services have not been migrated away'
                    .format(storagedriver.guid, vdisk.guid, vdisk.name))
        if len(vdisks) > 0:
            # Put back in RUNNING, so it can be used again. Errors keep on displaying in GUI now anyway
            vp_installer.update_status(status=VPool.STATUSES.RUNNING)
            raise RuntimeError(
                'Not all MDS Services have been successfully migrated away')

        # Start with actual removal
        # Accumulate error flags with |= so that a single failure is not lost
        errors_found = False
        if storagerouter not in offline_srs:
            errors_found |= sd_installer.stop_services()

        errors_found |= vp_installer.configure_cluster_registry(
            exclude=[storagedriver], apply_on=reachable_storagedrivers)
        errors_found |= vp_installer.update_node_distance_map()
        errors_found |= vp_installer.remove_mds_services()
        errors_found |= sd_installer.clean_config_management()
        errors_found |= sd_installer.clean_model()

        if storagerouter not in offline_srs:
            errors_found |= sd_installer.clean_directories(
                mountpoints=StorageRouterController.get_mountpoints(
                    client=sr_installer.root_client))

            try:
                DiskController.sync_with_reality(
                    storagerouter_guid=storagerouter.guid)
            except Exception:
                cls._logger.exception(
                    'StorageDriver {0} - Synchronizing disks with reality failed'
                    .format(storagedriver.guid))
                errors_found = True

        if vp_installer.storagedriver_amount > 1:
            # Update the vPool metadata and run DTL checkup
            vp_installer.vpool.metadata['caching_info'].pop(
                sr_installer.storagerouter.guid, None)
            vp_installer.vpool.save()

            try:
                VDiskController.dtl_checkup(vpool_guid=vp_installer.vpool.guid,
                                            ensure_single_timeout=600)
            except Exception:
                cls._logger.exception(
                    'StorageDriver {0} - DTL checkup failed for vPool {1} with guid {2}'
                    .format(storagedriver.guid, vp_installer.name,
                            vp_installer.vpool.guid))
        else:
            cls._logger.info(
                'StorageDriver {0} - Removing vPool from model'.format(
                    storagedriver.guid))
            # Clean up model
            try:
                vp_installer.vpool.delete()
            except Exception:
                errors_found = True
                cls._logger.exception(
                    'StorageDriver {0} - Cleaning up vPool from the model failed'
                    .format(storagedriver.guid))
            Configuration.delete('/ovs/vpools/{0}'.format(
                vp_installer.vpool.guid))

        cls._logger.info('StorageDriver {0} - Running MDS checkup'.format(
            storagedriver.guid))
        try:
            MDSServiceController.mds_checkup()
        except Exception:
            cls._logger.exception(
                'StorageDriver {0} - MDS checkup failed'.format(
                    storagedriver.guid))

        # Update vPool status
        if errors_found is True:
            if vp_installer.storagedriver_amount > 1:
                vp_installer.update_status(status=VPool.STATUSES.FAILURE)
            raise RuntimeError(
                '1 or more errors occurred while trying to remove the StorageDriver. Please check the logs for more information'
            )

        if vp_installer.storagedriver_amount > 1:
            vp_installer.update_status(status=VPool.STATUSES.RUNNING)
        cls._logger.info(
            'StorageDriver {0} - Deleted StorageDriver {1}'.format(
                storagedriver.guid, storagedriver.name))

        if len(VPoolList.get_vpools()) == 0:
            cluster_name = ArakoonInstaller.get_cluster_name('voldrv')
            if ArakoonInstaller.get_arakoon_metadata_by_cluster_name(
                    cluster_name=cluster_name)['internal'] is True:
                cls._logger.debug(
                    'StorageDriver {0} - Removing Arakoon cluster {1}'.format(
                        storagedriver.guid, cluster_name))
                try:
                    installer = ArakoonInstaller(cluster_name=cluster_name)
                    installer.load()
                    installer.delete_cluster()
                except Exception:
                    cls._logger.exception(
                        'StorageDriver {0} - Delete voldrv Arakoon cluster failed'
                        .format(storagedriver.guid))
                service_type = ServiceTypeList.get_by_name(
                    ServiceType.SERVICE_TYPES.ARAKOON)
                service_name = ArakoonInstaller.get_service_name_for_cluster(
                    cluster_name=cluster_name)
                for service in list(service_type.services):
                    if service.name == service_name:
                        service.delete()

        # Remove watcher volumedriver service if last StorageDriver on current StorageRouter
        if len(storagerouter.storagedrivers) == 0 and storagerouter not in offline_srs:  # ensure client is initialized for StorageRouter
            try:
                if cls._service_manager.has_service(
                        ServiceFactory.SERVICE_WATCHER_VOLDRV,
                        client=sr_installer.root_client):
                    cls._service_manager.stop_service(
                        ServiceFactory.SERVICE_WATCHER_VOLDRV,
                        client=sr_installer.root_client)
                    cls._service_manager.remove_service(
                        ServiceFactory.SERVICE_WATCHER_VOLDRV,
                        client=sr_installer.root_client)
            except Exception:
                cls._logger.exception(
                    'StorageDriver {0} - {1} service deletion failed'.format(
                        storagedriver.guid,
                        ServiceFactory.SERVICE_WATCHER_VOLDRV))
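A minimal call sketch; the enclosing controller class is not shown in this excerpt, so the class name and guids below are placeholders:

# Illustrative sketch only: removing one StorageDriver while a single StorageRouter is known to be offline.
SomeStorageDriverController.shrink_vpool(
    storagedriver_guid='guid-of-storagedriver-to-remove',
    offline_storage_router_guids=['guid-of-offline-storagerouter'])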
    def test_collapse():
        """
        Test the arakoon collapsing

        :return:
        """
        ArakoonCollapse.LOGGER.info("Starting validating arakoon collapse")
        node_ips = StoragerouterHelper.get_storagerouter_ips()
        node_ips.sort()
        for node_ip in node_ips:
            ArakoonCollapse.LOGGER.info(
                "Fetching arakoons on node `{0}`".format(node_ip))
            arakoon_clusters = []
            root_client = SSHClient(node_ip, username='******')

            # fetch arakoon clusters
            for service in ServiceList.get_services():
                if service.is_internal is True and service.storagerouter.ip == node_ip and \
                    service.type.name in (ServiceType.SERVICE_TYPES.ARAKOON,
                                          ServiceType.SERVICE_TYPES.NS_MGR,
                                          ServiceType.SERVICE_TYPES.ALBA_MGR):
                    arakoon_clusters.append(
                        service.name.replace('arakoon-', ''))

            # perform collapse
            ArakoonCollapse.LOGGER.info(
                "Starting arakoon collapse on node `{0}`".format(node_ip))
            for arakoon_cluster in arakoon_clusters:
                ArakoonCollapse.LOGGER.info(
                    "Fetching `{0}` arakoon on node `{1}`".format(
                        arakoon_cluster, node_ip))
                arakoon_config_path = Configuration.get_configuration_path(
                    '/ovs/arakoon/{0}/config'.format(arakoon_cluster))
                tlog_location = '/opt/OpenvStorage/db/arakoon/{0}/tlogs'.format(
                    arakoon_cluster)

                # read_tlog_dir
                with remote(node_ip, [Configuration]) as rem:
                    config_contents = rem.Configuration.get(
                        '/ovs/arakoon/{0}/config'.format(arakoon_cluster),
                        raw=True)
                for line in config_contents.splitlines():
                    if 'tlog_dir' in line:
                        tlog_location = line.split()[-1]

                nr_of_tlogs = ArakoonCollapse.get_nr_of_tlogs_in_folder(
                    root_client, tlog_location)
                old_headdb_timestamp = 0
                if root_client.file_exists('/'.join([tlog_location,
                                                     'head.db'])):
                    old_headdb_timestamp = root_client.run([
                        'stat', '--format=%Y',
                        '{0}/{1}'.format(tlog_location, 'head.db')
                    ])
                if nr_of_tlogs <= 2:
                    benchmark_command = [
                        'arakoon', '--benchmark', '-n_clients', '1', '-max_n',
                        '5_000', '-config', arakoon_config_path
                    ]
                    root_client.run(benchmark_command)

                ArakoonCollapse.LOGGER.info(
                    "Collapsing arakoon `{0}` on node `{1}` ...".format(
                        arakoon_cluster, node_ip))
                GenericController.collapse_arakoon()

                nr_of_tlogs = ArakoonCollapse.get_nr_of_tlogs_in_folder(
                    root_client, tlog_location)
                new_headdb_timestamp = root_client.run([
                    'stat', '--format=%Y',
                    '{0}/{1}'.format(tlog_location, 'head.db')
                ])

                # perform assertion
                assert nr_of_tlogs <= 2,\
                    'Arakoon collapse left {0} tlogs on the environment, expected at most 2 for `{1}` on node `{2}`'\
                    .format(nr_of_tlogs, arakoon_cluster, node_ip)
                assert old_headdb_timestamp != new_headdb_timestamp,\
                    'Timestamp of the head_db file was not changed ' \
                    'in the process of collapsing tlogs of arakoon `{0}` on node `{1}`'\
                    .format(arakoon_cluster, node_ip)

                ArakoonCollapse.LOGGER.info(
                    "Successfully collapsed arakoon `{0}` on node `{1}`".
                    format(arakoon_cluster, node_ip))

        ArakoonCollapse.LOGGER.info("Finished validating arakoon collapsing")
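get_nr_of_tlogs_in_folder is referenced above but not included in this excerpt; the sketch below is one plausible implementation for illustration only, reusing the SSHClient.run() call style already used in the test.

    @staticmethod
    def get_nr_of_tlogs_in_folder(root_client, tlog_location):
        # Illustrative sketch only; the real helper is not part of this excerpt.
        # Counts Arakoon transaction log files (.tlog and compressed .tlx) in the given directory.
        file_names = root_client.run(['ls', tlog_location]).splitlines()
        return len([name for name in file_names if name.endswith('.tlog') or name.endswith('.tlx')])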
    def check_backends(result_handler):
        """
        Checks Alba as a whole
        :param result_handler: logging object
        :type result_handler: ovs.extensions.healthcheck.result.HCResults
        :return: None
        :rtype: NoneType
        """
        result_handler.info('Checking available ALBA backends.',
                            add_to_result=False)
        try:
            alba_backends = AlbaHealthCheck._get_all_responding_backends(
                result_handler)
            if len(alba_backends) == 0:
                return result_handler.skip('No backends found.')

            result_handler.success('We found {0} backend(s)!'.format(
                len(alba_backends)))

            result_handler.info('Checking the ALBA ASDs.', add_to_result=False)
            for backend in alba_backends:
                backend_name = backend['name']
                # check disks of backend, ignore global backends
                if backend['type'] != 'LOCAL':
                    result_handler.skip(
                        'Alba backend {0} is a global backend.'.format(
                            backend_name),
                        add_to_result=False)
                    continue

                config = Configuration.get_configuration_path(
                    '/ovs/arakoon/{0}-abm/config'.format(backend_name))
                try:
                    result_disks = AlbaHealthCheck._check_backend_asds(
                        result_handler, backend['disks'], backend_name, config)
                except Exception:
                    result_handler.warning(
                        'Could not fetch the asd information for alba backend {0}'
                        .format(backend_name))
                    continue
                working_disks = result_disks['working']
                defective_disks = result_disks['broken']
                # check if backend is available for vPOOL attachment / use
                if backend['is_available_for_vpool']:
                    if len(defective_disks) == 0:
                        result_handler.success(
                            'Alba backend {0} should be available for VPool use. All asds are working fine!'
                            .format(backend_name))
                    else:
                        result_handler.warning(
                            'Alba backend {0} should be available for VPool use with {1} asds, but there are {2} defective asds: {3}'
                            .format(backend_name, len(working_disks),
                                    len(defective_disks),
                                    ', '.join(defective_disks)))
                else:
                    if len(working_disks) == 0 and len(defective_disks) == 0:
                        result_handler.skip(
                            'Alba backend {0} is not available for vPool use, there are no asds assigned to this backend!'
                            .format(backend_name))
                    else:
                        result_handler.failure(
                            'Alba backend {0} is not available for vPool use, preset requirements not satisfied! There are {1} working asds AND {2} '
                            'defective asds!'.format(backend_name,
                                                     len(working_disks),
                                                     len(defective_disks)))
        except NotFoundException as ex:
            result_handler.failure(
                'Failed to fetch the object with exception: {0}'.format(ex))
        except ConnectionFailedException as ex:
            result_handler.failure(
                'Failed to connect to configuration master with exception: {0}'
                .format(ex))
        except (ArakoonNotFound, ArakoonNoMaster, ArakoonNoMasterResult) as e:
            result_handler.failure(
                'It seems an Arakoon cluster has some problems: {0}'.format(e))
    def cluster_registry_checkup():
        """
        Verify whether changes have occurred in the cluster registry for each vPool
        :return: Information whether changes occurred
        :rtype: dict
        """
        changed_vpools = {}
        for vpool in VPoolList.get_vpools():
            changed_vpools[vpool.guid] = {'changes': False, 'success': True}
            try:
                StorageDriverController._logger.info(
                    'Validating cluster registry settings for Vpool {0}'.
                    format(vpool.guid))

                current_configs = vpool.clusterregistry_client.get_node_configs()
                changes = len(current_configs) == 0
                node_configs = []
                for sd in vpool.storagedrivers:
                    sd.invalidate_dynamics(['cluster_node_config'])
                    new_config = sd.cluster_node_config
                    node_configs.append(ClusterNodeConfig(**new_config))
                    if changes is False:
                        current_node_configs = [
                            config for config in current_configs
                            if config.vrouter_id == sd.storagedriver_id
                        ]
                        if len(current_node_configs) == 1:
                            current_node_config = current_node_configs[0]
                            for key in new_config:
                                if getattr(current_node_config,
                                           key) != new_config[key]:
                                    changes = True
                                    break
                changed_vpools[vpool.guid]['changes'] = changes

                if changes is True:
                    StorageDriverController._logger.info(
                        'Cluster registry settings for vPool {0} need to be updated'
                        .format(vpool.guid))
                    available_storagedrivers = []
                    for sd in vpool.storagedrivers:
                        storagerouter = sd.storagerouter
                        try:
                            SSHClient(storagerouter, username='******')
                        except UnableToConnectException:
                            StorageDriverController._logger.warning(
                                'StorageRouter {0} not available.'.format(
                                    storagerouter.name))
                            continue

                        with remote(storagerouter.ip,
                                    [LocalStorageRouterClient]) as rem:
                            sd_key = '/ovs/vpools/{0}/hosts/{1}/config'.format(
                                vpool.guid, sd.storagedriver_id)
                            if Configuration.exists(sd_key) is True:
                                path = Configuration.get_configuration_path(
                                    sd_key)
                                try:
                                    lsrc = rem.LocalStorageRouterClient(path)
                                    lsrc.server_revision()  # 'Cheap' call to verify whether volumedriver is responsive
                                    available_storagedrivers.append(sd)
                                except Exception as ex:
                                    if 'ClusterNotReachableException' in str(ex):
                                        StorageDriverController._logger.warning(
                                            'StorageDriver {0} on StorageRouter {1} not available.'
                                            .format(sd.guid,
                                                    storagerouter.name))
                                    else:
                                        StorageDriverController._logger.exception(
                                            'Got exception when validating StorageDriver {0} on StorageRouter {1}.'
                                            .format(sd.guid,
                                                    storagerouter.name))

                    StorageDriverController._logger.info(
                        'Updating cluster node configs for VPool {0}'.format(
                            vpool.guid))
                    vpool.clusterregistry_client.set_node_configs(node_configs)
                    for sd in available_storagedrivers:
                        StorageDriverController._logger.info(
                            'Trigger config reload for StorageDriver {0}'.
                            format(sd.guid))
                        vpool.storagedriver_client.update_cluster_node_configs(
                            str(sd.storagedriver_id), req_timeout_secs=10)
                    StorageDriverController._logger.info(
                        'Updating cluster node configs for Vpool {0} completed'
                        .format(vpool.guid))
                else:
                    StorageDriverController._logger.info(
                        'Cluster registry settings for vPool {0} are up to date'
                        .format(vpool.guid))
            except Exception as ex:
                StorageDriverController._logger.exception(
                    'Got exception when validating cluster registry settings for Vpool {0}.'
                    .format(vpool.name))
                changed_vpools[vpool.guid]['success'] = False
                changed_vpools[vpool.guid]['error'] = ex.message
        return changed_vpools
    def _local_stack(self):
        """
        Returns a live list of all disks known to this AlbaBackend
        """
        from ovs.dal.lists.albanodelist import AlbaNodeList
        from ovs.dal.lists.albabackendlist import AlbaBackendList

        if len(self.abm_services) == 0:
            return {}  # No ABM services yet, so backend not fully installed yet

        alba_backend_map = {}
        for alba_backend in AlbaBackendList.get_albabackends():
            alba_backend_map[alba_backend.alba_id] = alba_backend

        # Load information based on the model
        asd_map = {}
        storage_map = {}
        alba_nodes = AlbaNodeList.get_albanodes()
        for node in alba_nodes:
            node_id = node.node_id
            storage_map[node_id] = {}
            for disk in node.disks:
                disk_id = disk.aliases[0].split('/')[-1]
                storage_map[node_id][disk_id] = {'asds': {},
                                                 'name': disk_id,
                                                 'guid': disk.guid,
                                                 'status': 'error',
                                                 'aliases': disk.aliases,
                                                 'status_detail': 'unknown'}
                for osd in disk.osds:
                    osd_id = osd.osd_id
                    data = {'asd_id': osd_id,
                            'guid': osd.guid,
                            'status': 'error',
                            'status_detail': 'unknown',
                            'alba_backend_guid': osd.alba_backend_guid}
                    asd_map[osd_id] = data
                    storage_map[node_id][disk_id]['asds'][osd_id] = data

        # Load information from node
        def _load_live_info(_node, _node_data):
            _data = _node.storage_stack
            if _data['status'] != 'ok':
                for disk_entry in _node_data.values():
                    disk_entry['status_detail'] = _data['status']
                    for entry in disk_entry.get('asds', {}).values():
                        entry['status_detail'] = _data['status']
            else:
                for _disk_id, disk_asd_info in _data['stack'].iteritems():
                    if _disk_id not in _node_data:
                        _node_data[_disk_id] = {'asds': {}}
                    entry = _node_data[_disk_id]
                    disk_info = copy.deepcopy(disk_asd_info)
                    del disk_info['asds']
                    entry.update(disk_info)
                    asds_info = disk_asd_info['asds']
                    for _asd_id, asd_info in asds_info.iteritems():
                        if _asd_id not in _node_data[_disk_id]['asds']:
                            _node_data[_disk_id]['asds'][_asd_id] = asd_info
                        else:
                            _node_data[_disk_id]['asds'][_asd_id].update(asd_info)

        threads = []
        for node in alba_nodes:
            thread = Thread(target=_load_live_info, args=(node, storage_map[node.node_id]))
            thread.start()
            threads.append(thread)
        for thread in threads:
            thread.join()

        # Mix in usage information
        for asd_id, stats in self.asd_statistics.iteritems():
            if asd_id in asd_map:
                asd_map[asd_id]['usage'] = {'size': int(stats['capacity']),
                                            'used': int(stats['disk_usage']),
                                            'available': int(stats['capacity'] - stats['disk_usage'])}

        # Load information from alba
        backend_interval_key = '/ovs/alba/backends/{0}/gui_error_interval'.format(self.guid)
        if Configuration.exists(backend_interval_key):
            interval = Configuration.get(backend_interval_key)
        else:
            interval = Configuration.get('/ovs/alba/backends/global_gui_error_interval')
        config = Configuration.get_configuration_path('/ovs/arakoon/{0}-abm/config'.format(self.name))
        asds = {}
        for found_osd in AlbaCLI.run(command='list-all-osds', config=config):
            asds[found_osd['long_id']] = found_osd
        for node_data in storage_map.values():
            for _disk in node_data.values():
                for asd_id, asd_data in _disk['asds'].iteritems():
                    if asd_id not in asds:
                        continue
                    found_osd = asds[asd_id]
                    if 'state' not in asd_data:
                        continue
                    if found_osd.get('decommissioned') is True:
                        asd_data['status'] = 'unavailable'
                        asd_data['status_detail'] = 'decommissioned'
                        continue
                    state = asd_data['state']
                    if state == 'ok':
                        if found_osd['id'] is None:
                            alba_id = found_osd['alba_id']
                            if alba_id is None:
                                asd_data['status'] = 'available'
                            else:
                                asd_data['status'] = 'unavailable'
                                alba_backend = alba_backend_map.get(alba_id)
                                if alba_backend is not None:
                                    asd_data['alba_backend_guid'] = alba_backend.guid
                        else:
                            asd_data['alba_backend_guid'] = self.guid
                            asd_data['status'] = 'warning'
                            asd_data['status_detail'] = 'recenterrors'

                            read = found_osd['read'] or [0]
                            write = found_osd['write'] or [0]
                            errors = found_osd['errors']
                            if len(errors) == 0 or (len(read + write) > 0 and max(min(read), min(write)) > max(error[0] for error in errors) + interval):
                                asd_data['status'] = 'claimed'
                                asd_data['status_detail'] = ''
                    else:
                        asd_data['status'] = 'error'
                        asd_data['status_detail'] = asd_data.get('state_detail', '')
                        alba_backend = alba_backend_map.get(found_osd.get('alba_id'))
                        if alba_backend is not None:
                            asd_data['alba_backend_guid'] = alba_backend.guid
        return storage_map
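
# A minimal standalone sketch (not part of the original class) of the status decision used
# above for OSDs claimed by this backend: an OSD only moves from 'warning'/'recenterrors' to
# 'claimed' when it has no recorded errors, or when its read and write activity starts after
# the latest error plus the configured grace interval (mirroring the
# max(min(read), min(write)) comparison above). Function and argument names are illustrative.
def _resolve_claimed_osd_status(read, write, errors, interval):
    read = read or [0]
    write = write or [0]
    if len(errors) == 0:
        return 'claimed', ''
    if len(read + write) > 0 and max(min(read), min(write)) > max(error[0] for error in errors) + interval:
        return 'claimed', ''
    return 'warning', 'recenterrors'
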
    def add_preset(alba_backend_guid,
                   name,
                   compression,
                   policies,
                   encryption,
                   fragment_size=None):
        """
        Adds a preset to Alba
        :param alba_backend_guid: Guid of the ALBA backend
        :type alba_backend_guid: str
        :param name: Name of the preset
        :type name: str
        :param compression: Compression type for the preset (none | snappy | bz2)
        :type compression: str
        :param policies: Policies for the preset
        :type policies: list
        :param encryption: Encryption for the preset (none | aes-cbc-256 | aes-ctr-256)
        :type encryption: str
        :param fragment_size: Size of a fragment in bytes (e.g. 1048576)
        :type fragment_size: int
        :return: None
        """
        # VALIDATIONS
        if not re.match(Toolbox.regex_preset, name):
            raise ValueError('Invalid preset name specified')

        compression_options = ['snappy', 'bz2', 'none']
        if compression not in compression_options:
            raise ValueError(
                'Invalid compression format specified, please choose from: "{0}"'
                .format('", "'.join(compression_options)))

        encryption_options = ['aes-cbc-256', 'aes-ctr-256', 'none']
        if encryption not in encryption_options:
            raise ValueError(
                'Invalid encryption format specified, please choose from: "{0}"'
                .format('", "'.join(encryption_options)))

        if fragment_size is not None and (not isinstance(fragment_size, int) or
                                          not 16 <= fragment_size <= 1024**3):
            raise ValueError(
                'Fragment size should be an integer between 16 bytes and 1 GiB'
            )

        AlbaPresetController._validate_policies_param(policies=policies)

        alba_backend = AlbaBackend(alba_backend_guid)
        if name in [preset['name'] for preset in alba_backend.presets]:
            raise RuntimeError(
                'Preset with name {0} already exists'.format(name))

        # ADD PRESET
        preset = {'compression': compression,
                  'object_checksum': {'default': ['crc-32c'],
                                      'verify_upload': True,
                                      'allowed': [['none'], ['sha-1'], ['crc-32c']]},
                  'osds': ['all'],
                  'fragment_size': 16 * 1024 ** 2 if fragment_size is None else int(fragment_size),
                  'policies': policies,
                  'fragment_checksum': ['crc-32c'],
                  'fragment_encryption': ['none'],
                  'in_use': False,
                  'name': name}

        # Generate encryption key
        temp_key_file = None
        if encryption != 'none':
            encryption_key = ''.join(
                random.choice(chr(random.randint(32, 126))) for _ in range(32))
            temp_key_file = tempfile.mktemp()
            with open(temp_key_file, 'wb') as temp_file:
                temp_file.write(encryption_key)
                temp_file.flush()
            preset['fragment_encryption'] = [
                '{0}'.format(encryption), '{0}'.format(temp_key_file)
            ]

        # Dump preset content on filesystem
        config = Configuration.get_configuration_path(
            alba_backend.abm_cluster.config_location)
        temp_config_file = tempfile.mktemp()
        with open(temp_config_file, 'wb') as data_file:
            data_file.write(json.dumps(preset))
            data_file.flush()

        # Create preset
        AlbaPresetController._logger.debug(
            'Adding preset {0} with compression {1} and policies {2}'.format(
                name, compression, policies))
        AlbaCLI.run(command='create-preset',
                    config=config,
                    named_params={'input-url': temp_config_file},
                    extra_params=[name])

        # Cleanup
        alba_backend.invalidate_dynamics()
        for filename in [temp_key_file, temp_config_file]:
            if filename and os.path.exists(filename) and os.path.isfile(
                    filename):
                os.remove(filename)
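
# Hedged usage sketch for add_preset() above. The backend guid, preset name and the
# [k, m, c, x] policy layout are illustrative placeholders, not values taken from the
# original code:
#
# AlbaPresetController.add_preset(alba_backend_guid='<alba-backend-guid>',
#                                 name='my-preset',
#                                 compression='snappy',
#                                 policies=[[1, 2, 2, 3]],
#                                 encryption='none',
#                                 fragment_size=4 * 1024 ** 2)
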
    def check_if_proxies_work(result_handler):
        """
        Checks if all ALBA proxies on the local machine work; it creates a namespace and tries to put and get an object
        :param result_handler: logging object
        :type result_handler: ovs.extensions.healthcheck.result.HCResults
        :return: None
        :rtype: NoneType
        """
        namespace_params = {
            'bucket_count': (list, None),
            'logical': (int, None),
            'storage': (int, None),
            'storage_per_osd': (list, None)
        }

        result_handler.info('Checking the ALBA proxies.', add_to_result=False)

        amount_of_presets_not_working = []
        # ignore possible subprocess output
        fnull = open(os.devnull, 'w')
        # try put/get/verify on all available proxies on the local node
        local_proxies = ServiceHelper.get_local_proxy_services()
        if len(local_proxies) == 0:
            result_handler.info('Found no proxies.', add_to_result=False)
            return amount_of_presets_not_working
        for service in local_proxies:
            try:
                result_handler.info('Checking ALBA proxy {0}.'.format(
                    service.name),
                                    add_to_result=False)
                ip = service.alba_proxy.storagedriver.storage_ip
                # Encapsulating try to determine test output
                try:
                    # Determine to which backend the proxy is connected
                    proxy_client_cfg = AlbaCLI.run(command='proxy-client-cfg',
                                                   named_params={
                                                       'host': ip,
                                                       'port': service.ports[0]
                                                   })
                except AlbaException:
                    result_handler.failure(
                        'Fetching proxy info has failed. Please verify if {0}:{1} is the correct address for proxy {2}.'
                        .format(ip, service.ports[0], service.name))
                    continue
                # Fetch arakoon information
                abm_name = proxy_client_cfg.get('cluster_id')
                # Check if proxy config is correctly setup
                if abm_name is None:
                    raise ConfigNotMatchedException(
                        'Proxy config for proxy {0} does not have the correct format on node {1} with port {2}.'
                        .format(service.name, ip, service.ports[0]))
                abm_config = Configuration.get_configuration_path(
                    '/ovs/vpools/{0}/proxies/{1}/config/abm'.format(
                        service.alba_proxy.storagedriver.vpool.guid,
                        service.alba_proxy.guid))

                # Determine presets / backend
                try:
                    presets = AlbaCLI.run(command='list-presets',
                                          config=abm_config)
                except AlbaException:
                    result_handler.failure(
                        'Listing the presets has failed. Please check the arakoon config path. We used {0}'
                        .format(abm_config))
                    continue

                for preset in presets:
                    # If preset is not in use, test will fail so add a skip
                    if preset['in_use'] is False:
                        result_handler.skip(
                            'Preset {0} is not in use and will not be checked'.
                            format(preset['name']))
                        continue
                    preset_name = preset['name']
                    # Encapsulation try for cleanup
                    try:
                        # Generate new namespace name using the preset
                        namespace_key_prefix = 'ovs-healthcheck-ns-{0}-{1}'.format(
                            preset_name, AlbaHealthCheck.LOCAL_ID)
                        namespace_key = '{0}_{1}'.format(
                            namespace_key_prefix, uuid.uuid4())
                        object_key = 'ovs-healthcheck-obj-{0}'.format(
                            str(uuid.uuid4()))
                        # Create namespace
                        AlbaCLI.run(command='proxy-create-namespace',
                                    named_params={
                                        'host': ip,
                                        'port': service.ports[0]
                                    },
                                    extra_params=[namespace_key, preset_name])
                        # Wait until fully created
                        namespace_start_time = time.time()
                        for index in xrange(2):
                            # Run twice because the first run could give a false positive: the OSDs alert the NSM
                            # and the NSM responds that it received messages, but those are not the ones we are waiting for
                            AlbaCLI.run(command='deliver-messages',
                                        config=abm_config)
                        while True:
                            if time.time() - namespace_start_time > AlbaHealthCheck.NAMESPACE_TIMEOUT:
                                raise RuntimeError(
                                    'Namespace creation has timed out after {0}s'
                                    .format(time.time() -
                                            namespace_start_time))
                            list_ns_osds_output = AlbaCLI.run(
                                command='list-ns-osds',
                                config=abm_config,
                                extra_params=[namespace_key])
                            # Example output: [[0, [u'Active']], [3, [u'Active']]]
                            namespace_ready = True
                            for osd_info in list_ns_osds_output:  # If there are no osd_info records, uploading will fail so covered by HC
                                osd_state = osd_info[1][0]
                                if osd_state != 'Active':
                                    namespace_ready = False
                            if namespace_ready is True:
                                break
                        result_handler.success(
                            'Namespace successfully created on proxy {0} with preset {1}!'
                            .format(service.name, preset_name))
                        namespace_info = AlbaCLI.run(
                            command='show-namespace',
                            config=abm_config,
                            extra_params=[namespace_key])
                        Toolbox.verify_required_params(
                            required_params=namespace_params,
                            actual_params=namespace_info)
                        result_handler.success(
                            'Namespace successfully fetched on proxy {0} with preset {1}!'
                            .format(service.name, preset_name))

                        # Put test object to given dir
                        with open(AlbaHealthCheck.TEMP_FILE_LOC,
                                  'wb') as output_file:
                            output_file.write(
                                os.urandom(AlbaHealthCheck.TEMP_FILE_SIZE))
                        AlbaCLI.run(command='proxy-upload-object',
                                    named_params={
                                        'host': ip,
                                        'port': service.ports[0]
                                    },
                                    extra_params=[
                                        namespace_key,
                                        AlbaHealthCheck.TEMP_FILE_LOC,
                                        object_key
                                    ])
                        result_handler.success(
                            'Successfully uploaded the object to namespace {0}'
                            .format(namespace_key))
                        # download object
                        AlbaCLI.run(command='proxy-download-object',
                                    named_params={
                                        'host': ip,
                                        'port': service.ports[0]
                                    },
                                    extra_params=[
                                        namespace_key, object_key,
                                        AlbaHealthCheck.TEMP_FILE_FETCHED_LOC
                                    ])
                        result_handler.success(
                            'Successfully downloaded the object to namespace {0}'
                            .format(namespace_key))
                        # Check whether both files exist - issue #57
                        if not (os.path.isfile(
                                AlbaHealthCheck.TEMP_FILE_FETCHED_LOC) and
                                os.path.isfile(AlbaHealthCheck.TEMP_FILE_LOC)):
                            # creation of object failed
                            raise ObjectNotFoundException(
                                ValueError('Creation of object has failed'))
                        hash_original = hashlib.md5(
                            open(AlbaHealthCheck.TEMP_FILE_LOC,
                                 'rb').read()).hexdigest()
                        hash_fetched = hashlib.md5(
                            open(AlbaHealthCheck.TEMP_FILE_FETCHED_LOC,
                                 'rb').read()).hexdigest()

                        if hash_original == hash_fetched:
                            result_handler.success(
                                'Fetched object {0} from namespace {1} on proxy {2} with preset {3} matches the created object!'
                                .format(object_key, namespace_key,
                                        service.name, preset_name))
                        else:
                            result_handler.failure(
                                'Fetched object {0} from namespace {1} on proxy {2} with preset {3} does not match the created object!'
                                .format(object_key, namespace_key,
                                        service.name, preset_name))

                    except ObjectNotFoundException as ex:
                        amount_of_presets_not_working.append(preset_name)
                        result_handler.failure(
                            'Failed to put object on namespace {0} on proxy {1} with preset {2}. Got {3} as error'
                            .format(namespace_key, service.name, preset_name,
                                    ex))
                    except AlbaException as ex:
                        if ex.alba_command == 'proxy-create-namespace':
                            result_handler.failure(
                                'Create namespace has failed with {0} on namespace {1} with proxy {2} with preset {3}'
                                .format(str(ex), namespace_key, service.name,
                                        preset_name))
                        elif ex.alba_command == 'show-namespace':
                            result_handler.failure(
                                'Show namespace has failed with {0} on namespace {1} with proxy {2} with preset {3}'
                                .format(str(ex), namespace_key, service.name,
                                        preset_name))
                        elif ex.alba_command == 'proxy-upload-object':
                            result_handler.failure(
                                'Uploading the object has failed with {0} on namespace {1} with proxy {2} with preset {3}'
                                .format(str(ex), namespace_key, service.name,
                                        preset_name))
                        elif ex.alba_command == 'proxy-download-object':
                            result_handler.failure(
                                'Downloading the object has failed with {0} on namespace {1} with proxy {2} with preset {3}'
                                .format(str(ex), namespace_key, service.name,
                                        preset_name))
                    finally:
                        # Clean up the temporary files and any created namespaces
                        subprocess.call(
                            ['rm', str(AlbaHealthCheck.TEMP_FILE_LOC)],
                            stdout=fnull,
                            stderr=subprocess.STDOUT)
                        subprocess.call(
                            ['rm',
                             str(AlbaHealthCheck.TEMP_FILE_FETCHED_LOC)],
                            stdout=fnull,
                            stderr=subprocess.STDOUT)
                        namespaces = AlbaCLI.run(command='list-namespaces',
                                                 config=abm_config)
                        namespaces_to_remove = []
                        proxy_named_params = {
                            'host': ip,
                            'port': service.ports[0]
                        }
                        for namespace in namespaces:
                            if namespace['name'].startswith(
                                    namespace_key_prefix):
                                namespaces_to_remove.append(namespace['name'])
                        for namespace_name in namespaces_to_remove:
                            if namespace_name == namespace_key:
                                result_handler.info(
                                    'Deleting namespace {0}.'.format(
                                        namespace_name))
                            else:
                                result_handler.warning(
                                    'Deleting namespace {0} which was leftover from a previous run.'
                                    .format(namespace_name))

                            AlbaCLI.run(command='proxy-delete-namespace',
                                        named_params=proxy_named_params,
                                        extra_params=[namespace_name])

                            namespace_delete_start = time.time()
                            while True:
                                try:
                                    AlbaCLI.run(
                                        command='show-namespace',
                                        config=abm_config,
                                        extra_params=[namespace_name]
                                    )  # Will fail if the namespace does not exist
                                except AlbaException:
                                    result_handler.success(
                                        'Namespace {0} successfully removed.'.
                                        format(namespace_name))
                                    break
                                if time.time() - namespace_delete_start > AlbaHealthCheck.NAMESPACE_TIMEOUT:
                                    raise RuntimeError(
                                        'Namespace deletion has timed out after {0}s'
                                        .format(time.time() -
                                                namespace_delete_start))

                            # be tidy, and make the proxy forget the namespace
                            try:
                                AlbaCLI.run(
                                    command='proxy-statistics',
                                    named_params=proxy_named_params,
                                    extra_params=['--forget', namespace_name])
                            except Exception:
                                result_handler.warning(
                                    'Failed to make proxy forget namespace {0}.'
                                    .format(namespace_name))

            except subprocess.CalledProcessError as ex:
                # this should stay for the deletion of the remaining files
                amount_of_presets_not_working.append(service.name)
                result_handler.failure(
                    'Proxy {0} has some problems. Got {1} as error'.format(
                        service.name, ex))

            except ConfigNotMatchedException as ex:
                amount_of_presets_not_working.append(service.name)
                result_handler.failure(
                    'Proxy {0} has some problems. Got {1} as error'.format(
                        service.name, ex))
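
# Standalone sketch of the integrity verification pattern used in the proxy check above:
# write a random payload, run it through a round-trip (here simulated with a plain file
# copy instead of proxy-upload-object / proxy-download-object) and compare MD5 digests.
# All names and paths below are illustrative.
import hashlib
import os
import shutil
import tempfile


def roundtrip_matches(payload_size=1024 * 1024):
    source = tempfile.mktemp()
    fetched = tempfile.mktemp()
    try:
        with open(source, 'wb') as handle:
            handle.write(os.urandom(payload_size))
        shutil.copy(source, fetched)  # Stand-in for the proxy upload + download
        with open(source, 'rb') as original, open(fetched, 'rb') as copy:
            return hashlib.md5(original.read()).hexdigest() == hashlib.md5(copy.read()).hexdigest()
    finally:
        for path in (source, fetched):
            if os.path.isfile(path):
                os.remove(path)
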
    def add_preset(alba_backend_guid, name, compression, policies, encryption, fragment_size=None):
        """
        Adds a preset to Alba
        :param alba_backend_guid: Guid of the ALBA backend
        :type alba_backend_guid: str
        :param name: Name of the preset
        :type name: str
        :param compression: Compression type for the preset (none | snappy | bz2)
        :type compression: str
        :param policies: Policies for the preset
        :type policies: list
        :param encryption: Encryption for the preset (none | aes-cbc-256 | aes-ctr-256)
        :type encryption: str
        :param fragment_size: Size of a fragment in bytes (e.g. 1048576)
        :type fragment_size: int
        :return: None
        """
        # VALIDATIONS
        if not re.match(Toolbox.regex_preset, name):
            raise ValueError('Invalid preset name specified')

        compression_options = ['snappy', 'bz2', 'none']
        if compression not in compression_options:
            raise ValueError('Invalid compression format specified, please choose from: "{0}"'.format('", "'.join(compression_options)))

        encryption_options = ['aes-cbc-256', 'aes-ctr-256', 'none']
        if encryption not in encryption_options:
            raise ValueError('Invalid encryption format specified, please choose from: "{0}"'.format('", "'.join(encryption_options)))

        if fragment_size is not None and (not isinstance(fragment_size, int) or not 16 <= fragment_size <= 1024 ** 3):
            raise ValueError('Fragment size should be an integer between 16 bytes and 1 GiB')

        AlbaPresetController._validate_policies_param(policies=policies)

        alba_backend = AlbaBackend(alba_backend_guid)
        if name in [preset['name'] for preset in alba_backend.presets]:
            raise RuntimeError('Preset with name {0} already exists'.format(name))

        # ADD PRESET
        preset = {'compression': compression,
                  'object_checksum': {'default': ['crc-32c'],
                                      'verify_upload': True,
                                      'allowed': [['none'], ['sha-1'], ['crc-32c']]},
                  'osds': ['all'],
                  'fragment_size': 16 * 1024 ** 2 if fragment_size is None else int(fragment_size),
                  'policies': policies,
                  'fragment_checksum': ['crc-32c'],
                  'fragment_encryption': ['none'],
                  'in_use': False,
                  'name': name}

        # Generate encryption key
        temp_key_file = None
        if encryption != 'none':
            encryption_key = ''.join(random.choice(chr(random.randint(32, 126))) for _ in range(32))
            temp_key_file = tempfile.mktemp()
            with open(temp_key_file, 'wb') as temp_file:
                temp_file.write(encryption_key)
                temp_file.flush()
            preset['fragment_encryption'] = ['{0}'.format(encryption), '{0}'.format(temp_key_file)]

        # Dump preset content on filesystem
        config = Configuration.get_configuration_path(ArakoonInstaller.CONFIG_KEY.format(AlbaController.get_abm_cluster_name(alba_backend=alba_backend)))
        temp_config_file = tempfile.mktemp()
        with open(temp_config_file, 'wb') as data_file:
            data_file.write(json.dumps(preset))
            data_file.flush()

        # Create preset
        AlbaPresetController._logger.debug('Adding preset {0} with compression {1} and policies {2}'.format(name, compression, policies))
        AlbaCLI.run(command='create-preset', config=config, named_params={'input-url': temp_config_file}, extra_params=[name])

        # Cleanup
        alba_backend.invalidate_dynamics()
        for filename in [temp_key_file, temp_config_file]:
            if filename and os.path.exists(filename) and os.path.isfile(filename):
                os.remove(filename)
    def check_nsm_load(cls, result_handler, max_load=None, use_total_capacity=False, total_capacity_warning=None, total_capacity_error=None):
        """
        Checks all NSM services registered within the Framework and will report their load
        :param result_handler: logging object
        :type result_handler: ovs.extensions.healthcheck.result.HCResults
        :param max_load: Maximum load percentage before marking it as overloaded. Defaults to ovs/framework/plugins/alba/config|nsm.maxload
        :type max_load: float
        :param use_total_capacity: Base the NSM load on the total possible capacity (capacity of the NSMs before they are marked as overloaded)
        instead of checking the least filled NSM. Use the threshold arguments for tuning
        :type use_total_capacity: bool
        :param total_capacity_warning: Number of remaining namespaces threshold before throwing a warning. Defaults to 20% of the total namespaces
        :type total_capacity_warning: int
        :param total_capacity_error: Number of remaining namespaces threshold before throwing an error. Defaults to 5% of the total namespaces
        :type total_capacity_error: int
        :return: None
        :rtype: NoneType
        """
        max_nsm_load_config = Configuration.get('ovs/framework/plugins/alba/config|nsm.maxload')
        max_load = max_load or max_nsm_load_config
        for alba_backend in AlbaBackendList.get_albabackends():
            if alba_backend.abm_cluster is None:
                result_handler.failure('No ABM cluster found for ALBA Backend {0}'.format(alba_backend.name))
                continue
            if len(alba_backend.abm_cluster.abm_services) == 0:
                result_handler.failure('ALBA Backend {0} does not have any registered ABM services'.format(alba_backend.name))
                continue
            if len(alba_backend.nsm_clusters) == 0:
                result_handler.failure('ALBA Backend {0} does not have any registered NSM services'.format(alba_backend.name))
                continue
            internal = alba_backend.abm_cluster.abm_services[0].service.is_internal
            if use_total_capacity:
                maximum_capacity_before_overload = AlbaHealthCheck._get_nsm_max_capacity_before_overload(alba_backend, max_nsm_load_config)
                total_capacity_warning = total_capacity_warning or math.ceil(maximum_capacity_before_overload * 1.0 / 5)
                total_capacity_error = total_capacity_error or math.ceil(maximum_capacity_before_overload * 1.0 / 20)
                config = Configuration.get_configuration_path(key=alba_backend.abm_cluster.config_location)
                hosts_data = AlbaCLI.run(command='list-nsm-hosts', config=config)
                current_capacity = sum([host['namespaces_count'] for host in hosts_data if not host['lost']])
                remaining_capacity = maximum_capacity_before_overload - current_capacity
                if remaining_capacity > total_capacity_warning and remaining_capacity > total_capacity_error:  # Only error could be specified
                    result_handler.success('NSMs for backend {0} have enough capacity remaining ({1}/{2} used)'.format(alba_backend.name, current_capacity, maximum_capacity_before_overload),
                                           code=ErrorCodes.nsm_load_ok)
                elif total_capacity_warning >= remaining_capacity > total_capacity_error:
                    result_handler.warning('NSMs for backend {0} have reached the warning threshold '
                                           '({1} namespaces had to be remaining, {2}/{3} used)'.format(alba_backend.name, total_capacity_warning, current_capacity, maximum_capacity_before_overload),
                                           code=ErrorCodes.nsm_load_ok)
                else:
                    result_handler.failure('NSMs for backend {0} have reached the error threshold '
                                           '({1} namespaces had to be remaining, {2}/{3} used)'.format(alba_backend.name, total_capacity_error, current_capacity, maximum_capacity_before_overload),
                                           code=ErrorCodes.nsm_load_ok)
            else:
                nsm_loads = {}
                sorted_nsm_clusters = sorted(alba_backend.nsm_clusters, key=lambda k: k.number)
                for nsm_cluster in sorted_nsm_clusters:
                    nsm_loads[nsm_cluster.number] = AlbaController.get_load(nsm_cluster)
                overloaded = min(nsm_loads.values()) >= max_load
                if overloaded is False:
                    result_handler.success('NSMs for backend {0} are not overloaded'.format(alba_backend.name),
                                           code=ErrorCodes.nsm_load_ok)
                else:
                    if internal is True:
                        result_handler.warning('NSMs for backend {0} are overloaded. The NSM checkup will take care of this'.format(alba_backend.name),
                                               code=ErrorCodes.nsm_load_warn)
                    else:
                        result_handler.failure('NSMs for backend {0} are overloaded. Please add your own NSM clusters to the backend'.format(alba_backend.name),
                                               code=ErrorCodes.nsm_load_failure)
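
# Hedged standalone sketch of the two evaluations performed by check_nsm_load() above.
# Inputs are plain numbers here; in the real check they come from 'list-nsm-hosts' and
# AlbaController.get_load(). Function names are illustrative.
def classify_remaining_nsm_capacity(current, maximum, warning_threshold, error_threshold):
    # use_total_capacity=True branch: compare the remaining namespace capacity
    # against the warning and error thresholds.
    remaining = maximum - current
    if remaining > warning_threshold and remaining > error_threshold:
        return 'success'
    if warning_threshold >= remaining > error_threshold:
        return 'warning'
    return 'failure'


def nsms_overloaded(nsm_loads, max_load):
    # Default branch: the backend only counts as overloaded when even the least
    # loaded NSM cluster reaches the configured maximum load percentage.
    return min(nsm_loads.values()) >= max_load
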
    def test_arakoon_collapse(self):
        """
        Test the Arakoon collapse functionality
        """
        # Set up the test
        structure = DalHelper.build_dal_structure(
            structure={'storagerouters': [1, 2]})
        storagerouter_1 = structure['storagerouters'][1]
        storagerouter_2 = structure['storagerouters'][2]
        MockedSSHClient._run_returns[storagerouter_1.ip] = {}
        MockedSSHClient._run_returns[storagerouter_2.ip] = {}

        # Make sure we cover all Arakoon cluster types
        clusters_to_create = {
            ServiceType.ARAKOON_CLUSTER_TYPES.SD: [{
                'name': 'unittest-voldrv',
                'internal': True,
                'success': True
            }],
            ServiceType.ARAKOON_CLUSTER_TYPES.CFG: [{
                'name': 'unittest-cacc',
                'internal': True,
                'success': True
            }],
            ServiceType.ARAKOON_CLUSTER_TYPES.FWK: [{
                'name': 'unittest-ovsdb',
                'internal': True,
                'success': False
            }],
            ServiceType.ARAKOON_CLUSTER_TYPES.ABM: [{
                'name': 'unittest-cluster-1-abm',
                'internal': True,
                'success': False
            }, {
                'name': 'unittest-random-abm-name',
                'internal': False,
                'success': True
            }],
            ServiceType.ARAKOON_CLUSTER_TYPES.NSM: [{
                'name': 'unittest-cluster-1-nsm_0',
                'internal': True,
                'success': True
            }]
        }
        self.assertEqual(
            first=sorted(clusters_to_create.keys()),
            second=sorted(ServiceType.ARAKOON_CLUSTER_TYPES.keys()),
            msg='An Arakoon cluster type has been removed or added, please update this test accordingly')

        # Create all Arakoon clusters and related services
        failed_clusters = []
        external_clusters = []
        successful_clusters = []
        for cluster_type, cluster_infos in clusters_to_create.iteritems():
            filesystem = cluster_type == ServiceType.ARAKOON_CLUSTER_TYPES.CFG
            for cluster_info in cluster_infos:
                internal = cluster_info['internal']
                cluster_name = cluster_info['name']

                base_dir = DalHelper.CLUSTER_DIR.format(cluster_name)
                arakoon_installer = ArakoonInstaller(cluster_name=cluster_name)
                arakoon_installer.create_cluster(cluster_type=cluster_type,
                                                 ip=storagerouter_1.ip,
                                                 base_dir=base_dir,
                                                 internal=internal)
                arakoon_installer.start_cluster()
                arakoon_installer.extend_cluster(new_ip=storagerouter_2.ip,
                                                 base_dir=base_dir)

                service_name = ArakoonInstaller.get_service_name_for_cluster(
                    cluster_name=cluster_name)
                if cluster_type == ServiceType.ARAKOON_CLUSTER_TYPES.ABM:
                    service_type = ServiceTypeList.get_by_name(
                        ServiceType.SERVICE_TYPES.ALBA_MGR)
                elif cluster_type == ServiceType.ARAKOON_CLUSTER_TYPES.NSM:
                    service_type = ServiceTypeList.get_by_name(
                        ServiceType.SERVICE_TYPES.NS_MGR)
                else:
                    service_type = ServiceTypeList.get_by_name(
                        ServiceType.SERVICE_TYPES.ARAKOON)

                if internal is True:
                    DalHelper.create_service(
                        service_name=service_name,
                        service_type=service_type,
                        storagerouter=storagerouter_1,
                        ports=arakoon_installer.ports[storagerouter_1.ip])
                    DalHelper.create_service(
                        service_name=service_name,
                        service_type=service_type,
                        storagerouter=storagerouter_2,
                        ports=arakoon_installer.ports[storagerouter_2.ip])
                else:
                    DalHelper.create_service(service_name=service_name,
                                             service_type=service_type)

                    external_clusters.append(cluster_name)
                    continue

                if cluster_info['success'] is True:
                    if filesystem is True:
                        config_path = ArakoonClusterConfig.CONFIG_FILE.format(
                            cluster_name)
                    else:
                        config_path = Configuration.get_configuration_path(
                            ArakoonClusterConfig.CONFIG_KEY.format(
                                cluster_name))
                    MockedSSHClient._run_returns[storagerouter_1.ip][
                        'arakoon --collapse-local 1 2 -config {0}'.format(
                            config_path)] = None
                    MockedSSHClient._run_returns[storagerouter_2.ip][
                        'arakoon --collapse-local 2 2 -config {0}'.format(
                            config_path)] = None
                    successful_clusters.append(cluster_name)
                else:  # For successful False clusters we don't emulate the collapse, thus making it fail
                    failed_clusters.append(cluster_name)

        # Start collapse and make it fail for all clusters on StorageRouter 2
        SSHClient._raise_exceptions[storagerouter_2.ip] = {
            'users': ['ovs'],
            'exception': UnableToConnectException('No route to host')
        }
        GenericController.collapse_arakoon()

        # Verify all log messages for each type of cluster
        generic_logs = Logger._logs.get('lib', {})
        for cluster_name in successful_clusters + failed_clusters + external_clusters:
            collect_msg = (
                'DEBUG',
                'Collecting info for cluster {0}'.format(cluster_name))
            unreachable_msg = (
                'ERROR',
                'Could not collapse any cluster on {0} (not reachable)'.format(
                    storagerouter_2.name))
            end_collapse_msg = (
                'DEBUG', 'Collapsing cluster {0} on {1} completed'.format(
                    cluster_name, storagerouter_1.ip))
            start_collapse_msg = ('DEBUG',
                                  'Collapsing cluster {0} on {1}'.format(
                                      cluster_name, storagerouter_1.ip))
            failed_collapse_msg = (
                'ERROR', 'Collapsing cluster {0} on {1} failed'.format(
                    cluster_name, storagerouter_1.ip))
            messages_to_validate = []
            if cluster_name in successful_clusters:
                assert_function = self.assertIn
                messages_to_validate.append(collect_msg)
                messages_to_validate.append(unreachable_msg)
                messages_to_validate.append(start_collapse_msg)
                messages_to_validate.append(end_collapse_msg)
            elif cluster_name in failed_clusters:
                assert_function = self.assertIn
                messages_to_validate.append(collect_msg)
                messages_to_validate.append(unreachable_msg)
                messages_to_validate.append(start_collapse_msg)
                messages_to_validate.append(failed_collapse_msg)
            else:
                assert_function = self.assertNotIn
                messages_to_validate.append(collect_msg)
                messages_to_validate.append(start_collapse_msg)
                messages_to_validate.append(end_collapse_msg)

            for severity, message in messages_to_validate:
                if assert_function == self.assertIn:
                    assert_message = 'Expected to find log message: {0}'.format(
                        message)
                else:
                    assert_message = 'Did not expect to find log message: {0}'.format(
                        message)
                assert_function(member=message,
                                container=generic_logs,
                                msg=assert_message)
                if assert_function == self.assertIn:
                    self.assertEqual(
                        first=severity,
                        second=generic_logs[message],
                        msg='Log message {0} is of severity {1} expected {2}'.
                        format(message, generic_logs[message], severity))

        # Collapse should always have a 'finished' message since each cluster should be attempted to be collapsed
        for general_message in [
                'Arakoon collapse started', 'Arakoon collapse finished'
        ]:
            self.assertIn(member=general_message,
                          container=generic_logs,
                          msg='Expected to find log message: {0}'.format(
                              general_message))
    def execute_scrub_work(queue, vpool, scrub_info, error_messages):
        """
        Executes scrub work for a given vDisk queue and vPool, based on scrub_info
        :param queue: a Queue with vDisk guids that need to be scrubbed (they should only be member of a single vPool)
        :type queue: Queue
        :param vpool: the vPool object of the vDisks
        :type vpool: VPool
        :param scrub_info: A dict containing scrub information: `scrub_path` with the path where to scrub and `storage_router` with the StorageRouter
                           that needs to do the work
        :type scrub_info: dict
        :param error_messages: A list of error messages to be filled
        :type error_messages: list
        :return: a list of error messages
        :rtype: list
        """

        def _verify_mds_config(current_vdisk):
            current_vdisk.invalidate_dynamics('info')
            vdisk_configs = current_vdisk.info['metadata_backend_config']
            if len(vdisk_configs) == 0:
                raise RuntimeError('Could not load MDS configuration')
            return vdisk_configs

        client = None
        lock_time = 5 * 60
        storagerouter = scrub_info['storage_router']
        scrub_directory = '{0}/scrub_work_{1}_{2}'.format(scrub_info['scrub_path'], vpool.name, storagerouter.name)
        scrub_config_key = 'ovs/vpools/{0}/proxies/scrub/scrub_config_{1}'.format(vpool.guid, storagerouter.guid)
        backend_config_key = 'ovs/vpools/{0}/proxies/scrub/backend_config_{1}'.format(vpool.guid, storagerouter.guid)
        alba_proxy_service = 'ovs-albaproxy_{0}_{1}_scrub'.format(vpool.name, storagerouter.name)

        # Deploy a proxy
        try:
            with file_mutex(name='ovs_albaproxy_scrub', wait=lock_time):
                ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Deploying ALBA proxy {2}'.format(vpool.name, storagerouter.name, alba_proxy_service))
                client = SSHClient(storagerouter, 'root')
                client.dir_create(scrub_directory)
                client.dir_chmod(scrub_directory, 0777)  # Celery task executed by 'ovs' user and should be able to write in it
                if ServiceManager.has_service(name=alba_proxy_service, client=client) is True and ServiceManager.get_service_status(name=alba_proxy_service, client=client) is True:
                    ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Re-using existing proxy service {2}'.format(vpool.name, storagerouter.name, alba_proxy_service))
                    scrub_config = Configuration.get(scrub_config_key)
                else:
                    machine_id = System.get_my_machine_id(client)
                    port_range = Configuration.get('/ovs/framework/hosts/{0}/ports|storagedriver'.format(machine_id))
                    port = System.get_free_ports(selected_range=port_range, nr=1, client=client)[0]
                    # Scrub config
                    # {u'albamgr_cfg_url': u'arakoon://config/ovs/vpools/71e2f717-f270-4a41-bbb0-d4c8c084d43e/proxies/64759516-3471-4321-b912-fb424568fc5b/config/abm?ini=%2Fopt%2FOpenvStorage%2Fconfig%2Farakoon_cacc.ini',
                    #  u'fragment_cache': [u'none'],
                    #  u'ips': [u'127.0.0.1'],
                    #  u'log_level': u'info',
                    #  u'manifest_cache_size': 17179869184,
                    #  u'port': 0,
                    #  u'transport': u'tcp'}

                    # Backend config
                    # {u'alba_connection_host': u'10.100.193.155',
                    #  u'alba_connection_port': 26204,
                    #  u'alba_connection_preset': u'preset',
                    #  u'alba_connection_timeout': 15,
                    #  u'alba_connection_transport': u'TCP',
                    #  u'backend_interface_retries_on_error': 5,
                    #  u'backend_interface_retry_backoff_multiplier': 2.0,
                    #  u'backend_interface_retry_interval_secs': 1,
                    #  u'backend_type': u'ALBA'}
                    scrub_config = Configuration.get('ovs/vpools/{0}/proxies/scrub/generic_scrub'.format(vpool.guid))
                    scrub_config['port'] = port
                    scrub_config['transport'] = 'tcp'
                    Configuration.set(scrub_config_key, json.dumps(scrub_config, indent=4), raw=True)

                    params = {'VPOOL_NAME': vpool.name,
                              'LOG_SINK': LogHandler.get_sink_path('alba_proxy'),
                              'CONFIG_PATH': Configuration.get_configuration_path(scrub_config_key)}
                    ServiceManager.add_service(name='ovs-albaproxy', params=params, client=client, target_name=alba_proxy_service)
                    ServiceManager.start_service(name=alba_proxy_service, client=client)
                    ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Deployed ALBA proxy {2}'.format(vpool.name, storagerouter.name, alba_proxy_service))

                backend_config = Configuration.get('ovs/vpools/{0}/hosts/{1}/config'.format(vpool.guid, vpool.storagedrivers[0].storagedriver_id))['backend_connection_manager']
                backend_config['alba_connection_host'] = '127.0.0.1'
                backend_config['alba_connection_port'] = scrub_config['port']
                Configuration.set(backend_config_key, json.dumps({"backend_connection_manager": backend_config}, indent=4), raw=True)
        except Exception:
            message = 'Scrubber - vPool {0} - StorageRouter {1} - An error occurred deploying ALBA proxy {2}'.format(vpool.name, storagerouter.name, alba_proxy_service)
            error_messages.append(message)
            ScheduledTaskController._logger.exception(message)
            if client is not None and ServiceManager.has_service(name=alba_proxy_service, client=client) is True:
                if ServiceManager.get_service_status(name=alba_proxy_service, client=client) is True:
                    ServiceManager.stop_service(name=alba_proxy_service, client=client)
                ServiceManager.remove_service(name=alba_proxy_service, client=client)
            if Configuration.exists(scrub_config_key):
                Configuration.delete(scrub_config_key)

        try:
            # Empty the queue with vDisks to scrub
            with remote(storagerouter.ip, [VDisk]) as rem:
                while True:
                    vdisk = None
                    vdisk_guid = queue.get(False)
                    try:
                        # Check MDS master is local. Trigger MDS handover if necessary
                        vdisk = rem.VDisk(vdisk_guid)
                        ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - Started scrubbing at location {3}'.format(vpool.name, storagerouter.name, vdisk.name, scrub_directory))
                        configs = _verify_mds_config(current_vdisk=vdisk)
                        storagedriver = StorageDriverList.get_by_storagedriver_id(vdisk.storagedriver_id)
                        if configs[0].get('ip') != storagedriver.storagerouter.ip:
                            ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - MDS master is not local, trigger handover'.format(vpool.name, storagerouter.name, vdisk.name))
                            MDSServiceController.ensure_safety(VDisk(vdisk_guid))  # Do not use a remote VDisk instance here
                            configs = _verify_mds_config(current_vdisk=vdisk)
                            if configs[0].get('ip') != storagedriver.storagerouter.ip:
                                ScheduledTaskController._logger.warning('Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - Skipping because master MDS still not local'.format(vpool.name, storagerouter.name, vdisk.name))
                                continue

                        # Do the actual scrubbing
                        with vdisk.storagedriver_client.make_locked_client(str(vdisk.volume_id)) as locked_client:
                            ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - Retrieve and apply scrub work'.format(vpool.name, storagerouter.name, vdisk.name))
                            work_units = locked_client.get_scrubbing_workunits()
                            for work_unit in work_units:
                                res = locked_client.scrub(work_unit=work_unit,
                                                          scratch_dir=scrub_directory,
                                                          log_sinks=[LogHandler.get_sink_path('scrubber', allow_override=True)],
                                                          backend_config=Configuration.get_configuration_path(backend_config_key))
                                locked_client.apply_scrubbing_result(scrubbing_work_result=res)
                            if work_units:
                                ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - {3} work units successfully applied'.format(vpool.name, storagerouter.name, vdisk.name, len(work_units)))
                            else:
                                ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - No scrubbing required'.format(vpool.name, storagerouter.name, vdisk.name))
                    except Exception:
                        if vdisk is None:
                            message = 'Scrubber - vPool {0} - StorageRouter {1} - vDisk with guid {2} could not be found'.format(vpool.name, storagerouter.name, vdisk_guid)
                        else:
                            message = 'Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - Scrubbing failed'.format(vpool.name, storagerouter.name, vdisk.name)
                        error_messages.append(message)
                        ScheduledTaskController._logger.exception(message)

        except Empty:  # Raised when all items have been fetched from the queue
            ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Queue completely processed'.format(vpool.name, storagerouter.name))
        except Exception:
            message = 'Scrubber - vPool {0} - StorageRouter {1} - Scrubbing failed'.format(vpool.name, storagerouter.name)
            error_messages.append(message)
            ScheduledTaskController._logger.exception(message)

        # Delete the proxy again
        try:
            with file_mutex(name='ovs_albaproxy_scrub', wait=lock_time):
                ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Removing service {2}'.format(vpool.name, storagerouter.name, alba_proxy_service))
                client = SSHClient(storagerouter, 'root')
                client.dir_delete(scrub_directory)
                if ServiceManager.has_service(alba_proxy_service, client=client):
                    ServiceManager.stop_service(alba_proxy_service, client=client)
                    ServiceManager.remove_service(alba_proxy_service, client=client)
                if Configuration.exists(scrub_config_key):
                    Configuration.delete(scrub_config_key)
                ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Removed service {2}'.format(vpool.name, storagerouter.name, alba_proxy_service))
        except Exception:
            message = 'Scrubber - vPool {0} - StorageRouter {1} - Removing service {2} failed'.format(vpool.name, storagerouter.name, alba_proxy_service)
            error_messages.append(message)
            ScheduledTaskController._logger.exception(message)
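
# Hedged usage sketch for execute_scrub_work() above: the caller fills a Queue with the
# vDisk guids of a single vPool and passes scrub_info containing 'scrub_path' and
# 'storage_router'. The values below are placeholders.
#
# from Queue import Queue  # Python 2, matching the code above
# vdisk_queue = Queue()
# for vdisk in vpool.vdisks:
#     vdisk_queue.put(vdisk.guid)
# error_messages = []
# ScheduledTaskController.execute_scrub_work(queue=vdisk_queue,
#                                            vpool=vpool,
#                                            scrub_info={'scrub_path': '/mnt/hdd1/scrub',
#                                                        'storage_router': storagerouter},
#                                            error_messages=error_messages)
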
    def _deploy_stack_and_scrub(queue, vpool, scrub_info, error_messages):
        """
        Executes scrub work for a given vDisk queue and vPool, based on scrub_info
        :param queue: a Queue with vDisk guids that need to be scrubbed (they should only be member of a single vPool)
        :type queue: Queue
        :param vpool: the vPool object of the vDisks
        :type vpool: VPool
        :param scrub_info: A dict containing scrub information:
                           `scrub_path` with the path where to scrub
                           `storage_router` with the StorageRouter that needs to do the work
        :type scrub_info: dict
        :param error_messages: A list of error messages to be filled (by reference)
        :type error_messages: list
        :return: None
        :rtype: NoneType
        """
        if len(vpool.storagedrivers) == 0 or not vpool.storagedrivers[0].storagedriver_id:
            error_messages.append(
                'vPool {0} does not have any valid StorageDrivers configured'.
                format(vpool.name))
            return

        service_manager = ServiceFactory.get_manager()
        client = None
        lock_time = 5 * 60
        storagerouter = scrub_info['storage_router']
        partition_guid = scrub_info['partition_guid']
        alba_proxy_service = 'ovs-albaproxy_{0}_{1}_{2}_scrub'.format(
            vpool.name, storagerouter.name, partition_guid)
        scrub_directory = '{0}/scrub_work_{1}_{2}'.format(
            scrub_info['scrub_path'], vpool.name, partition_guid)
        scrub_config_key = 'ovs/vpools/{0}/proxies/scrub/scrub_config_{1}'.format(
            vpool.guid, partition_guid)
        backend_config_key = 'ovs/vpools/{0}/proxies/scrub/backend_config_{1}'.format(
            vpool.guid, partition_guid)

        # Deploy a proxy
        try:
            with file_mutex(name='ovs_albaproxy_scrub', wait=lock_time):
                GenericController._logger.info(
                    'Scrubber - vPool {0} - StorageRouter {1} - Deploying ALBA proxy {2}'
                    .format(vpool.name, storagerouter.name,
                            alba_proxy_service))
                client = SSHClient(storagerouter, 'root')
                client.dir_create(scrub_directory)
                client.dir_chmod(
                    scrub_directory, 0777
                )  # Celery task executed by 'ovs' user and should be able to write in it
                if service_manager.has_service(
                        name=alba_proxy_service, client=client
                ) is True and service_manager.get_service_status(
                        name=alba_proxy_service, client=client) == 'active':
                    GenericController._logger.info(
                        'Scrubber - vPool {0} - StorageRouter {1} - Re-using existing proxy service {2}'
                        .format(vpool.name, storagerouter.name,
                                alba_proxy_service))
                    scrub_config = Configuration.get(scrub_config_key)
                else:
                    machine_id = System.get_my_machine_id(client)
                    port_range = Configuration.get(
                        '/ovs/framework/hosts/{0}/ports|storagedriver'.format(
                            machine_id))
                    with volatile_mutex('deploy_proxy_for_scrub_{0}'.format(
                            storagerouter.guid),
                                        wait=30):
                        port = System.get_free_ports(selected_range=port_range,
                                                     nr=1,
                                                     client=client)[0]
                    scrub_config = Configuration.get(
                        'ovs/vpools/{0}/proxies/scrub/generic_scrub'.format(
                            vpool.guid))
                    scrub_config['port'] = port
                    scrub_config['transport'] = 'tcp'
                    Configuration.set(scrub_config_key,
                                      json.dumps(scrub_config, indent=4),
                                      raw=True)

                    params = {'VPOOL_NAME': vpool.name,
                              'LOG_SINK': LogHandler.get_sink_path(alba_proxy_service),
                              'CONFIG_PATH': Configuration.get_configuration_path(scrub_config_key)}
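                    # Create a partition-specific instance of the 'ovs-albaproxy' service
                    # template on the target StorageRouter and start it.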
                    service_manager.add_service(name='ovs-albaproxy',
                                                params=params,
                                                client=client,
                                                target_name=alba_proxy_service)
                    service_manager.start_service(name=alba_proxy_service,
                                                  client=client)
                    GenericController._logger.info(
                        'Scrubber - vPool {0} - StorageRouter {1} - Deployed ALBA proxy {2}'
                        .format(vpool.name, storagerouter.name,
                                alba_proxy_service))

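                # Point the copied backend_connection_manager configuration at the locally
                # deployed scrub proxy; for the 'MULTI' backend type the connection
                # settings live in nested per-proxy sections.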
                backend_config = Configuration.get('ovs/vpools/{0}/hosts/{1}/config'.format(vpool.guid, vpool.storagedrivers[0].storagedriver_id))['backend_connection_manager']
                if backend_config.get('backend_type') != 'MULTI':
                    backend_config['alba_connection_host'] = '127.0.0.1'
                    backend_config['alba_connection_port'] = scrub_config['port']
                else:
                    for value in backend_config.itervalues():
                        if isinstance(value, dict):
                            value['alba_connection_host'] = '127.0.0.1'
                            value['alba_connection_port'] = scrub_config['port']
                # Copy backend connection manager information in separate key
                Configuration.set(
                    backend_config_key,
                    json.dumps({"backend_connection_manager": backend_config},
                               indent=4),
                    raw=True)
        except Exception:
            message = 'Scrubber - vPool {0} - StorageRouter {1} - An error occurred deploying ALBA proxy {2}'.format(
                vpool.name, storagerouter.name, alba_proxy_service)
            error_messages.append(message)
            GenericController._logger.exception(message)
            if client is not None and service_manager.has_service(
                    name=alba_proxy_service, client=client) is True:
                if service_manager.get_service_status(
                        name=alba_proxy_service, client=client) == 'active':
                    service_manager.stop_service(name=alba_proxy_service,
                                                 client=client)
                service_manager.remove_service(name=alba_proxy_service,
                                               client=client)
            if Configuration.exists(scrub_config_key):
                Configuration.delete(scrub_config_key)
            # Without a working proxy there is nothing to scrub against, so stop here
            # instead of spawning scrub threads for this partition.
            return

        # Execute the actual scrubbing
        threads = []
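        # The number of scrub threads per proxy is configurable per host and defaults
        # to 2; it is clamped below to at least 1 and at most the queue size or 20.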
        threads_key = '/ovs/framework/hosts/{0}/config|scrub_stack_threads'.format(storagerouter.machine_id)
        amount_threads = Configuration.get(key=threads_key) if Configuration.exists(key=threads_key) else 2
        if not isinstance(amount_threads, int):
            error_messages.append(
                'Amount of threads to spawn must be an integer for StorageRouter with ID {0}'
                .format(storagerouter.machine_id))
            return

        amount_threads = max(amount_threads, 1)  # Use at least 1 thread
        amount_threads = min(amount_threads, queue.qsize(), 20)  # But never more than the amount of queued items or 20
        GenericController._logger.info(
            'Scrubber - vPool {0} - StorageRouter {1} - Spawning {2} threads for proxy service {3}'
            .format(vpool.name, storagerouter.name, amount_threads,
                    alba_proxy_service))
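        # All worker threads share the same work queue, scrub directory and
        # error_messages list passed to GenericController._execute_scrub.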
        for index in range(amount_threads):
            thread = Thread(name='execute_scrub_{0}_{1}_{2}'.format(vpool.guid, partition_guid, index),
                            target=GenericController._execute_scrub,
                            args=(queue, vpool, scrub_info, scrub_directory, error_messages))
            thread.start()
            threads.append(thread)
        for thread in threads:
            thread.join()

        # Delete the proxy again
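        # Teardown is serialised with deployments via the same 'ovs_albaproxy_scrub'
        # file mutex before the scrub directory, service and config key are removed.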
        try:
            with file_mutex(name='ovs_albaproxy_scrub', wait=lock_time):
                GenericController._logger.info(
                    'Scrubber - vPool {0} - StorageRouter {1} - Removing service {2}'
                    .format(vpool.name, storagerouter.name,
                            alba_proxy_service))
                client = SSHClient(storagerouter, 'root')
                client.dir_delete(scrub_directory)
                if service_manager.has_service(alba_proxy_service, client=client):
                    service_manager.stop_service(alba_proxy_service, client=client)
                    service_manager.remove_service(alba_proxy_service, client=client)
                if Configuration.exists(scrub_config_key):
                    Configuration.delete(scrub_config_key)
                GenericController._logger.info(
                    'Scrubber - vPool {0} - StorageRouter {1} - Removed service {2}'
                    .format(vpool.name, storagerouter.name,
                            alba_proxy_service))
        except Exception:
            message = 'Scrubber - vPool {0} - StorageRouter {1} - Removing service {2} failed'.format(
                vpool.name, storagerouter.name, alba_proxy_service)
            error_messages.append(message)
            GenericController._logger.exception(message)