コード例 #1
0
    def monitor_services(self):
        # type: () -> None
        """
        Continuously display the state of the local services matching the monitored prefixes
        The states are polled with a 1 second pause in between and the overview is only re-printed when it changed
        A KeyboardInterrupt (Ctrl+C) ends the monitoring silently
        :return: None
        :rtype: NoneType
        """
        def _sort_key(name):
            return ExtensionsToolbox.advanced_sort(name, '_')

        try:
            # Every prefix becomes its own egrep stage in the shell pipeline
            filters = ' | '.join('egrep "{0}"'.format(prefix) for prefix in self._monitor_prefixes)
            list_command = 'systemctl list-unit-files --full --type=service --no-legend --no-pager | {0} | tr -s " " | cut -d " " -f 1'.format(filters)
            last_printed = None
            while True:
                # Collect the current state of every matching unit
                active = {}
                inactive = {}
                name_width = 0
                for unit in check_output(list_command, shell=True).splitlines():
                    try:
                        state = check_output('systemctl is-active {0}'.format(unit), shell=True).strip()
                    except CalledProcessError as cpe:
                        # A failing 'is-active' call still carries the state in its output
                        state = cpe.output.strip()

                    name = unit.replace('.service', '')
                    if state != 'active':
                        inactive[name] = state
                    else:
                        pid = check_output('systemctl show {0} --property=MainPID'.format(name), shell=True).strip().split('=')[1]
                        active[name] = (state, pid)
                    name_width = max(name_width, len(name))

                # Build the overview, padding the names so the columns line up
                lines = ['Running processes',
                         '=================\n']
                for name in sorted(active, key=_sort_key):
                    state, pid = active[name]
                    lines.append('{0} {1} {2}  {3}'.format(name, ' ' * (name_width - len(name)), state, pid))

                lines += ['\n\nNon-running processes',
                          '=====================\n']
                for name in sorted(inactive, key=_sort_key):
                    lines.append('{0} {1} {2}'.format(name, ' ' * (name_width - len(name)), inactive[name]))

                # Clear the screen and print the overview, but only when something changed
                if lines != last_printed:
                    print('\x1b[2J\x1b[H')
                    for line in lines:
                        print(line)
                    last_printed = list(lines)
                time.sleep(1)
        except KeyboardInterrupt:
            pass
コード例 #2
0
 def check_arakoon_ports(cls, result_handler):
     """
     Verifies that the Arakoon clusters still respond to connections
     The connection results gathered by `_get_port_connections` are reported per cluster and per node:
     a success or failure for the connection test, or an exception entry when the test itself raised
     :param result_handler: logging object
     :type result_handler: ovs.extensions.healthcheck.result.HCResults
     :return: None
     :rtype: NoneType
     """
     arakoon_clusters = cls._get_arakoon_clusters(result_handler)
     result_handler.info('Starting Arakoon ports test.', add_to_result=False)
     # The progress messages used to mention 'collapsing statistics' (copied from the collapse test)
     result_handler.info('Retrieving all port connection statistics. This might take a while', add_to_result=False)
     start = time.time()
     arakoon_stats = cls._get_port_connections(result_handler, arakoon_clusters)
     result_handler.info('Retrieving all port connection statistics succeeded (duration: {0})'.format(time.time() - start), add_to_result=False)
     for cluster_type, clusters in arakoon_stats.iteritems():
         result_handler.info('Testing the ports of {0} Arakoons'.format(cluster_type), add_to_result=False)
         for cluster in clusters:
             cluster_name = cluster['cluster_name']
             # Report the nodes in a stable order, based on the advanced_sort key of their IP
             connection_result = OrderedDict(sorted(cluster['connection_result'].items(),
                                                    key=lambda item: ExtensionsToolbox.advanced_sort(item[0].ip, separator='.')))
             for node, stats in connection_result.iteritems():
                 identifier_log = 'Arakoon cluster {0} on node {1}'.format(cluster_name, node.ip)
                 if len(stats['errors']) > 0:
                     # Determine where issues were found
                     # NOTE(review): errors recorded for any step other than 'test_connection' are not reported here
                     for step, exception in stats['errors']:
                         if step == 'test_connection':
                             try:
                                 # Raise the thrown exception so it can be logged as the active exception
                                 raise exception
                             except Exception:
                                 message = 'Connection to {0} could not be established due to an unhandled exception.'.format(identifier_log)
                                 cls.logger.exception(message)
                                 result_handler.exception(message, code=ErrorCodes.unhandled_exception)
                     # A node with errors has no usable result, skip the success/failure reporting
                     continue
                 if stats['result'] is True:
                     result_handler.success('Connection established to {0}'.format(identifier_log),
                                            code=ErrorCodes.arakoon_connection_ok)
                 else:
                     result_handler.failure('Connection could not be established to {0}'.format(identifier_log),
                                            code=ErrorCodes.arakoon_connection_failure)
コード例 #3
0
ファイル: mdsservice.py プロジェクト: cynpna/framework
    def mds_checkup():
        """
        Validates the current MDS setup/configuration and takes actions where required
        Actions:
            * Verify which StorageRouters are available
            * Make mapping between vPools and its StorageRouters
            * For each vPool make sure every StorageRouter has at least 1 MDS service with capacity available
            * For each vPool retrieve the optimal configuration and store it for each StorageDriver
            * For each vPool run an ensure safety for all vDisks
        Failures while removing or creating MDS services and while retrieving the optimal configuration are logged
        and do not abort the checkup; ensure safety failures are collected and raised at the very end
        :raises RuntimeError: When ensure safety fails for any vDisk
        :return: None
        :rtype: NoneType
        """
        MDSServiceController._logger.info('Started')

        # Verify StorageRouter availability
        root_client_cache = {}
        storagerouters = StorageRouterList.get_storagerouters()
        storagerouters.sort(key=lambda _sr: ExtensionsToolbox.advanced_sort(element=_sr.ip, separator='.'))
        offline_nodes = []
        for storagerouter in storagerouters:
            try:
                root_client = SSHClient(endpoint=storagerouter, username='******')
                MDSServiceController._logger.debug('StorageRouter {0} - ONLINE'.format(storagerouter.name))
            except UnableToConnectException:
                # A client of None marks the StorageRouter as offline for the remainder of the checkup
                root_client = None
                offline_nodes.append(storagerouter)
                MDSServiceController._logger.error('StorageRouter {0} - OFFLINE'.format(storagerouter.name))
            root_client_cache[storagerouter] = root_client

        # Create mapping per vPool and its StorageRouters
        mds_dict = collections.OrderedDict()
        for vpool in sorted(VPoolList.get_vpools(), key=lambda k: k.name):
            MDSServiceController._logger.info('vPool {0}'.format(vpool.name))
            mds_dict[vpool] = {}

            # Loop all StorageDrivers and add StorageDriver to mapping
            for storagedriver in vpool.storagedrivers:
                storagerouter = storagedriver.storagerouter
                if storagerouter not in mds_dict[vpool]:
                    mds_dict[vpool][storagerouter] = {'client': root_client_cache.get(storagerouter),
                                                      'services': [],
                                                      'storagedriver': storagedriver}

            # Loop all MDS Services and append services to appropriate vPool / StorageRouter combo
            mds_services = vpool.mds_services
            mds_services.sort(key=lambda _mds_service: ExtensionsToolbox.advanced_sort(element=_mds_service.service.storagerouter.ip, separator='.'))
            for mds_service in mds_services:
                service = mds_service.service
                storagerouter = service.storagerouter
                if storagerouter not in mds_dict[vpool]:
                    # MDS service on a StorageRouter without a StorageDriver for this vPool
                    mds_dict[vpool][storagerouter] = {'client': root_client_cache.get(storagerouter),
                                                      'services': [],
                                                      'storagedriver': None}
                MDSServiceController._logger.debug('vPool {0} - StorageRouter {1} - Service on port {2}'.format(vpool.name, storagerouter.name, service.ports[0]))
                mds_dict[vpool][storagerouter]['services'].append(mds_service)

        failures = []
        for vpool, storagerouter_info in mds_dict.iteritems():
            # Make sure there's at least 1 MDS on every StorageRouter that's not overloaded
            # Remove all MDS Services which have been manually marked for removal (by setting its capacity to 0)
            max_load = Configuration.get('/ovs/vpools/{0}/mds_config|mds_maxload'.format(vpool.guid))
            for storagerouter in sorted(storagerouter_info, key=lambda k: k.ip):
                total_load = 0.0
                root_client = mds_dict[vpool][storagerouter]['client']
                mds_services = mds_dict[vpool][storagerouter]['services']

                # Iterate over a sorted copy, since services get removed from 'mds_services' inside the loop
                for mds_service in list(sorted(mds_services, key=lambda k: k.number)):
                    port = mds_service.service.ports[0]
                    number = mds_service.number
                    # Manual intervention required here in order for the MDS to be cleaned up
                    # @TODO: Remove this and make a dynamic calculation to check which MDSes to remove
                    if mds_service.capacity == 0 and len(mds_service.vdisks_guids) == 0:
                        MDSServiceController._logger.warning('vPool {0} - StorageRouter {1} - MDS Service {2} on port {3}: Removing'.format(vpool.name, storagerouter.name, number, port))
                        try:
                            MDSServiceController.remove_mds_service(mds_service=mds_service,
                                                                    reconfigure=True,
                                                                    allow_offline=root_client is None)
                        except Exception:
                            MDSServiceController._logger.exception('vPool {0} - StorageRouter {1} - MDS Service {2} on port {3}: Failed to remove'.format(vpool.name, storagerouter.name, number, port))
                        # The service is no longer taken into account, even when its removal failed
                        mds_services.remove(mds_service)
                    else:
                        _, next_load = MDSServiceController.get_mds_load(mds_service=mds_service)
                        if next_load == float('inf'):
                            total_load = sys.maxint * -1  # Cast to lowest possible value if any MDS service capacity is set to infinity
                        else:
                            total_load += next_load

                        if next_load < max_load:
                            MDSServiceController._logger.debug('vPool {0} - StorageRouter {1} - MDS Service {2} on port {3}: Capacity available - Load at {4}%'.format(vpool.name, storagerouter.name, number, port, next_load))
                        else:
                            MDSServiceController._logger.debug('vPool {0} - StorageRouter {1} - MDS Service {2} on port {3}: No capacity available - Load at {4}%'.format(vpool.name, storagerouter.name, number, port, next_load))

                service_count = len(mds_services)
                if total_load >= max_load * service_count:
                    mds_services_to_add = int(math.ceil((total_load - max_load * service_count) / max_load))
                    if service_count > 0:
                        average_load = total_load / service_count
                    else:
                        # No MDS service is (or remains) present on this StorageRouter
                        # Dividing by the service count used to raise a ZeroDivisionError here, aborting the entire checkup
                        average_load = 0.0
                        if mds_dict[vpool][storagerouter]['storagedriver'] is not None:
                            # Honor the 'at least 1 MDS service' guarantee for StorageRouters serving this vPool
                            mds_services_to_add = max(mds_services_to_add, 1)
                    MDSServiceController._logger.info('vPool {0} - StorageRouter {1} - Average load per service {2:.2f}% - Max load per service {3:.2f}% - {4} MDS service{5} will be added'.format(vpool.name, storagerouter.name, average_load, max_load, mds_services_to_add, '' if mds_services_to_add == 1 else 's'))

                    for _ in range(mds_services_to_add):
                        MDSServiceController._logger.info('vPool {0} - StorageRouter {1} - Adding new MDS Service'.format(vpool.name, storagerouter.name))
                        try:
                            mds_services.append(MDSServiceController.prepare_mds_service(storagerouter=storagerouter, vpool=vpool))
                        except Exception:
                            MDSServiceController._logger.exception('vPool {0} - StorageRouter {1} - Failed to create new MDS Service'.format(vpool.name, storagerouter.name))

            # After potentially having added new MDSes, retrieve the optimal configuration
            mds_config_set = {}
            try:
                mds_config_set = MDSServiceController.get_mds_storagedriver_config_set(vpool=vpool, offline_nodes=offline_nodes)
                MDSServiceController._logger.debug('vPool {0} - Optimal configuration {1}'.format(vpool.name, mds_config_set))
            except (NotFoundException, RuntimeError):
                # Continue with an empty config set, every StorageRouter is then reported below
                MDSServiceController._logger.exception('vPool {0} - Failed to retrieve the optimal configuration'.format(vpool.name))

            # Apply the optimal MDS configuration per StorageDriver
            for storagerouter in sorted(storagerouter_info, key=lambda k: k.ip):
                root_client = mds_dict[vpool][storagerouter]['client']
                storagedriver = mds_dict[vpool][storagerouter]['storagedriver']

                if storagedriver is None:
                    MDSServiceController._logger.critical('vPool {0} - StorageRouter {1} - No matching StorageDriver found'.format(vpool.name, storagerouter.name))
                    continue
                if storagerouter.guid not in mds_config_set:
                    MDSServiceController._logger.critical('vPool {0} - StorageRouter {1} - Not marked as offline, but could not retrieve an optimal MDS config'.format(vpool.name, storagerouter.name))
                    continue
                if root_client is None:
                    MDSServiceController._logger.debug('vPool {0} - StorageRouter {1} - Marked as offline, not setting optimal MDS configuration'.format(vpool.name, storagerouter.name))
                    continue

                storagedriver_config = StorageDriverConfiguration(vpool_guid=vpool.guid, storagedriver_id=storagedriver.storagedriver_id)
                if storagedriver_config.config_missing is False:
                    optimal_mds_config = mds_config_set[storagerouter.guid]
                    MDSServiceController._logger.debug('vPool {0} - StorageRouter {1} - Storing optimal MDS configuration: {2}'.format(vpool.name, storagerouter.name, optimal_mds_config))
                    # Filesystem section in StorageDriver configuration are all parameters used for vDisks created directly on the filesystem
                    # So when a vDisk gets created on the filesystem, these MDSes will be assigned to them
                    storagedriver_config.configure_filesystem(fs_metadata_backend_mds_nodes=optimal_mds_config)
                    storagedriver_config.save(root_client)

            # Execute a safety check, making sure the master/slave configuration is optimal.
            MDSServiceController._logger.info('vPool {0} - Ensuring safety for all vDisks'.format(vpool.name))
            for vdisk in vpool.vdisks:
                try:
                    MDSServiceController.ensure_safety(vdisk_guid=vdisk.guid)
                except Exception:
                    message = 'Ensure safety for vDisk {0} with guid {1} failed'.format(vdisk.name, vdisk.guid)
                    MDSServiceController._logger.exception(message)
                    failures.append(message)
        if len(failures) > 0:
            raise RuntimeError('\n - ' + '\n - '.join(failures))
        MDSServiceController._logger.info('Finished')
コード例 #4
0
    def refresh_package_information():
        """
        Gather the package and update information of all StorageRouters and plugins and store it in the model
        StorageRouters which cannot be reached over SSH are registered as a 'node_down' prerequisite
        :raises Exception: When errors were collected for any StorageRouter or plugin
        :return: None
        """
        GenericController._logger.info('Updating package information')

        storagerouters = StorageRouterList.get_storagerouters()
        storagerouters.sort(key=lambda sr: ExtensionsToolbox.advanced_sort(element=sr.ip, separator='.'))

        # Open a connection towards every StorageRouter, keeping track of the unreachable ones
        ssh_clients = {}
        node_prerequisites = []
        cluster_package_info = {}
        for storagerouter in storagerouters:
            cluster_package_info[storagerouter.ip] = {}
            try:
                # The clients are handed to Threads, hence cached = False
                ssh_clients[storagerouter] = SSHClient(endpoint=storagerouter, username='******', cached=False)
            except (NotAuthenticatedException, UnableToConnectException):
                GenericController._logger.warning('StorageRouter {0} is inaccessible'.format(storagerouter.ip))
                node_prerequisites.append(['node_down', storagerouter.name])
                cluster_package_info[storagerouter.ip]['errors'] = ['StorageRouter {0} is inaccessible'.format(storagerouter.name)]

        # Let the registered hooks fill in the installed and candidate package versions, 1 Thread per hook per client
        # This also validates whether all required packages have been installed
        GenericController._logger.debug('Retrieving package information for the cluster')
        package_threads = []
        for ssh_client in ssh_clients.itervalues():
            for hook in Toolbox.fetch_hooks(component='update', sub_component='get_package_update_info_cluster'):
                worker = Thread(target=hook, args=(ssh_client, cluster_package_info))
                worker.start()
                package_threads.append(worker)
        for worker in package_threads:
            worker.join()

        # Let the registered hooks derive the downtime / service restart information
        GenericController._logger.debug('Retrieving update information for the cluster')
        cluster_update_info = {}
        for storagerouter, ssh_client in ssh_clients.iteritems():
            sr_package_info = cluster_package_info[storagerouter.ip]
            cluster_update_info[storagerouter.ip] = {'errors': sr_package_info.get('errors', [])}
            for hook in Toolbox.fetch_hooks(component='update', sub_component='get_update_info_cluster'):
                hook(ssh_client, cluster_update_info, sr_package_info)

        # Let the registered hooks collect the update information for plugins (eg: ALBA, iSCSI)
        GenericController._logger.debug('Retrieving package and update information for the plugins')
        plugin_update_info = {}
        plugin_threads = []
        for hook in Toolbox.fetch_hooks('update', 'get_update_info_plugin'):
            worker = Thread(target=hook, args=(plugin_update_info, ))
            worker.start()
            plugin_threads.append(worker)
        for worker in plugin_threads:
            worker.join()

        # Attach the prerequisites to the framework component of every reachable StorageRouter
        if len(node_prerequisites) > 0:
            for component_info in cluster_update_info.itervalues():
                if PackageFactory.COMP_FWK in component_info:
                    component_info[PackageFactory.COMP_FWK]['prerequisites'].extend(node_prerequisites)

        # Persist the information per StorageRouter and gather the errors for the OVS cluster
        errors = set()
        for storagerouter in storagerouters:
            GenericController._logger.debug('Storing update information for StorageRouter {0}'.format(storagerouter.ip))
            sr_update_info = cluster_update_info.get(storagerouter.ip, {})

            # The errors are not part of the stored update information
            sr_errors = sr_update_info.pop('errors', [])
            if len(sr_errors) > 0:
                errors.update('{0}: {1}'.format(storagerouter.ip, error) for error in sr_errors)
                # If any error occurred, we store no update information for this StorageRouter
                sr_update_info = {}

            # Drop the components which have no packages to update
            obsolete_components = [name for name, info in sr_update_info.items() if len(info['packages']) == 0]
            for component in obsolete_components:
                sr_update_info.pop(component)

            storagerouter.package_information = sr_update_info
            storagerouter.save()

        # Gather the errors reported for the plugins
        for ip, plugin_errors in plugin_update_info.iteritems():
            if len(plugin_errors) > 0:
                errors.update('{0}: {1}'.format(ip, error) for error in plugin_errors)

        if len(errors) > 0:
            raise Exception('\n - {0}'.format('\n - '.join(errors)))
        GenericController._logger.info('Finished updating package information')
コード例 #5
0
    def _get_mds_information(vpools=None):
        # type: (Optional[List[VPool]]) -> Tuple[collections.OrderedDict, List[StorageRouter]]
        """
        Build an overview of the MDS layout per vPool and per StorageRouter
        :param vpools: VPools to build the overview for (all vPools when None)
        :type vpools: List[VPool]
        :return: - An ordered mapping with the vPools as keys; each value maps a StorageRouter to a dict with keys
                   'client' (SSHClient or None when unreachable), 'services' (MDS services) and 'storagedriver'
                 - All StorageRouters that were offline
        :rtype: Tuple[collection.OrderedDict, List[StorageRouter]]
        """
        logger = MDSServiceController._logger
        if vpools is None:
            vpools = VPoolList.get_vpools()

        # Verify StorageRouter availability by opening a client towards each of them
        all_storagerouters = StorageRouterList.get_storagerouters()
        all_storagerouters.sort(key=lambda _sr: ExtensionsToolbox.advanced_sort(element=_sr.ip, separator='.'))
        clients = {}
        unreachable = []
        for storagerouter in all_storagerouters:
            try:
                client = SSHClient(endpoint=storagerouter, username='******')
                logger.debug('StorageRouter {0} - ONLINE'.format(storagerouter.name))
            except UnableToConnectException:
                client = None
                unreachable.append(storagerouter)
                logger.error('StorageRouter {0} - OFFLINE'.format(storagerouter.name))
            clients[storagerouter] = client

        def _new_entry(_storagerouter, _storagedriver):
            # Fresh StorageRouter entry for a vPool, without any MDS services yet
            return {'client': clients.get(_storagerouter),
                    'services': [],
                    'storagedriver': _storagedriver}

        # Create the mapping per vPool and its StorageRouters
        overview = collections.OrderedDict()
        for vpool in sorted(vpools, key=lambda k: k.name):
            logger.info('vPool {0}'.format(vpool.name))
            layout = {}
            overview[vpool] = layout

            # Register every StorageRouter which hosts a StorageDriver of this vPool
            for storagedriver in vpool.storagedrivers:
                if storagedriver.storagerouter not in layout:
                    layout[storagedriver.storagerouter] = _new_entry(storagedriver.storagerouter, storagedriver)

            # Attach every MDS service to its vPool / StorageRouter combination
            vpool_mds_services = vpool.mds_services
            vpool_mds_services.sort(key=lambda _mds_service: ExtensionsToolbox.advanced_sort(element=_mds_service.service.storagerouter.ip, separator='.'))
            for mds_service in vpool_mds_services:
                service = mds_service.service
                storagerouter = service.storagerouter
                if storagerouter not in layout:
                    # MDS service on a StorageRouter without StorageDriver for this vPool
                    layout[storagerouter] = _new_entry(storagerouter, None)
                logger.debug('vPool {0} - StorageRouter {1} - Service on port {2}'.format(vpool.name, storagerouter.name, service.ports[0]))
                layout[storagerouter]['services'].append(mds_service)
        return overview, unreachable
コード例 #6
0
    def check_arakoon_fd(cls, result_handler, fd_limit=30, passed_connections=None):
        """
        Checks all current open tcp file descriptors for all Arakoon clusters in the OVS cluster
        Will raise warnings when these reach a certain threshold (80% and 95% of the limit)
        :param result_handler: Logging object
        :type result_handler: ovs.extensions.healthcheck.result.HCResults
        :param fd_limit: Threshold for the number of tcp connections for which to start logging warnings
        :type fd_limit: int
        :param passed_connections: Connection states to take into account (defaults to ESTABLISHED and TIME_WAIT)
        :type passed_connections: list
        :return: None
        :rtype: NoneType
        """
        if passed_connections is None:
            passed_connections = ['ESTABLISHED', 'TIME_WAIT']
        warning_threshold = fd_limit * 80 / 100
        error_threshold = fd_limit * 95 / 100

        result_handler.info('Starting Arakoon integrity test', add_to_result=False)
        arakoon_clusters = cls._get_arakoon_clusters(result_handler)
        start = time.time()
        arakoon_fd_results = cls._get_filedescriptors(result_handler, arakoon_clusters)
        result_handler.info('Retrieving all file descriptor information succeeded (duration: {0})'.format(time.time() - start), add_to_result=False)
        for cluster_type, clusters in arakoon_fd_results.iteritems():
            result_handler.info('Checking the file descriptors of {0} Arakoons'.format(cluster_type), add_to_result=False)
            for cluster in clusters:
                cluster_name = cluster['cluster_name']
                fd_result = cluster['fd_result']
                # Report the nodes in a stable order, based on the advanced_sort key of their IP
                fd_result = OrderedDict(sorted(fd_result.items(), key=lambda item: ExtensionsToolbox.advanced_sort(item[0].ip, separator='.')))
                for node, stats in fd_result.iteritems():
                    identifier_log = 'Arakoon cluster {0} on node {1}'.format(cluster_name, node.ip)
                    if len(stats['errors']) > 0:
                        # Determine where issues were found
                        for step, exception in stats['errors']:
                            if step == 'build_client':
                                try:
                                    # Raise the thrown exception so it can be dispatched on its type
                                    raise exception
                                except TimeOutException:
                                    result_handler.warning('Connection to {0} has timed out'.format(identifier_log), code=ErrorCodes.ssh_connection_time)
                                except (socket.error, UnableToConnectException):
                                    result_handler.failure('Connection to {0} could not be established'.format(identifier_log), code=ErrorCodes.ssh_connection_fail)
                                except NotAuthenticatedException:
                                    result_handler.skip('Connection to {0} could not be authenticated. This node has no access to the Arakoon node.'.format(identifier_log),
                                                        code=ErrorCodes.ssh_connection_authentication)
                                except Exception:
                                    message = 'Connection to {0} could not be established due to an unhandled exception.'.format(identifier_log)
                                    cls.logger.exception(message)
                                    result_handler.exception(message, code=ErrorCodes.unhandled_exception)
                            elif step == 'lsof':
                                try:
                                    raise exception
                                except Exception:
                                    message = 'Unable to list the file descriptors for {0}'.format(identifier_log)
                                    cls.logger.exception(message)
                                    # Pass the code by keyword, like every other result_handler call
                                    # A positional argument is not guaranteed to end up in the 'code' parameter
                                    result_handler.exception(message, code=ErrorCodes.unhandled_exception)
                        # A node with errors has no usable result
                        continue
                    fds = stats['result']['fds']
                    # The last whitespace separated field of an entry, stripped of its parentheses, is matched against the passed states
                    # Blank entries are skipped, since they have no last field to inspect
                    filtered_fds = [i for i in fds if i.split() and i.split()[-1].strip('(').strip(')') in passed_connections]
                    if len(filtered_fds) >= warning_threshold:
                        if len(filtered_fds) >= error_threshold:
                            result_handler.warning('Number of TCP connections exceeded the 95% warning threshold for {0}, ({1}/{2})'.format(identifier_log, len(filtered_fds), fd_limit),
                                                   code=ErrorCodes.arakoon_fd_95)
                        else:
                            result_handler.warning('Number of TCP connections exceeded the 80% warning threshold for {0}, ({1}/{2})'.format(identifier_log, len(filtered_fds), fd_limit),
                                                   code=ErrorCodes.arakoon_fd_80)
                    else:
                        result_handler.success('Number of TCP connections for {0} is healthy ({1}/{2})'.format(identifier_log, len(filtered_fds), fd_limit),
                                               code=ErrorCodes.arakoon_fd_ok)
コード例 #7
0
    def check_collapse(cls, result_handler, max_collapse_age=3, min_tlx_amount=10):
        """
        Verifies collapsing has occurred for all Arakoons
        Per node, the errors gathered while retrieving the statistics are reported first. When there are none,
        the spare space for a local collapse is compared with the head.db size and the time between the oldest tlx
        and the youngest tlog is compared with max_collapse_age
        :param result_handler: logging object
        :type result_handler: ovs.extensions.healthcheck.result.HCResults
        :param max_collapse_age: tlx files may not be older than x days
        :type max_collapse_age: int
        :param min_tlx_amount: Minimum amount of tlxes before making collapsing mandatory (defaults to 10)
        :type min_tlx_amount: int
        :return: None
        :rtype: NoneType
        """
        arakoon_clusters = cls._get_arakoon_clusters(result_handler)
        result_handler.info('Starting Arakoon collapse test', add_to_result=False)
        max_age_seconds = timedelta(days=max_collapse_age).total_seconds()
        result_handler.info('Retrieving all collapsing statistics. This might take a while', add_to_result=False)
        start = time.time()
        arakoon_stats = cls._retrieve_stats(result_handler, arakoon_clusters)
        result_handler.info('Retrieving all collapsing statistics succeeded (duration: {0})'.format(time.time() - start), add_to_result=False)
        for cluster_type, clusters in arakoon_stats.iteritems():
            result_handler.info('Testing the collapse of {0} Arakoons'.format(cluster_type), add_to_result=False)
            for cluster in clusters:
                cluster_name = cluster['cluster_name']
                collapse_result = cluster['collapse_result']
                # Report the nodes in a stable order, sorted on their IP
                collapse_result = OrderedDict(sorted(collapse_result.items(), key=lambda item: ExtensionsToolbox.advanced_sort(item[0].ip, separator='.')))
                for node, stats in collapse_result.iteritems():
                    identifier_log = 'Arakoon cluster {0} on node {1}'.format(cluster_name, node.ip)
                    if len(stats['errors']) > 0:
                        # Determine where issues were found
                        for step, exception in stats['errors']:
                            if step == 'build_client':
                                try:
                                    # Raise the stored exception again so it can be dispatched on its type and so the
                                    # logger.exception call below runs inside an active exception handler
                                    raise exception
                                except TimeOutException:
                                    result_handler.warning('Connection to {0} has timed out'.format(identifier_log), code=ErrorCodes.ssh_connection_time)
                                except (socket.error, UnableToConnectException):
                                    result_handler.failure(
                                        'Connection to {0} could not be established'.format(identifier_log), code=ErrorCodes.ssh_connection_fail)
                                except NotAuthenticatedException:
                                    result_handler.skip('Connection to {0} could not be authenticated. This node has no access to the Arakoon node.'.format(identifier_log),
                                                        code=ErrorCodes.ssh_connection_authentication)
                                except Exception:
                                    message = 'Connection to {0} could not be established due to an unhandled exception.'.format(identifier_log)
                                    cls.logger.exception(message)
                                    result_handler.exception(message, code=ErrorCodes.unhandled_exception)
                            elif step == 'stat_dir':
                                try:
                                    raise exception
                                except Exception:
                                    message = 'Unable to list the contents of the tlog directory ({0}) for {1}'.format(node.tlog_dir, identifier_log)
                                    cls.logger.exception(message)
                                    result_handler.exception(message, code=ErrorCodes.unhandled_exception)
                        # No usable statistics for this node, go to the next one
                        continue
                    tlx_files = stats['result']['tlx']
                    tlog_files = stats['result']['tlog']
                    headdb_files = stats['result']['headDB']
                    avail_size = stats['result']['avail_size']

                    if any(item is None for item in [tlx_files, tlog_files, avail_size]):
                        # Exception occurred but no errors were logged
                        result_handler.exception('Either the tlx or tlog files or available size could not be found in/of the tlog directory ({0}) for {1}'.format(node.tlog_dir, identifier_log),
                                                 code=ErrorCodes.tlx_tlog_not_found)
                        continue
                    # headDB is not part of the None check above, so a missing value is treated like 'no head.db'
                    if headdb_files:
                        headdb_size = sum([int(i[2]) for i in headdb_files])
                        collapse_size_msg = 'Spare space for local collapse is'
                        if avail_size >= headdb_size * 4:
                            result_handler.success('{0} sufficient (n > 4x head.db size)'.format(collapse_size_msg))
                        elif avail_size >= headdb_size * 3:
                            result_handler.warning('{0} running short (n > 3x head.db size)'.format(collapse_size_msg))
                        elif avail_size >= headdb_size * 2:
                            result_handler.failure('{0} just enough (n > 2x head.db size)'.format(collapse_size_msg))
                        else:
                            result_handler.failure('{0} insufficient (n < 2x head.db size)'.format(collapse_size_msg))

                    if len(tlog_files) == 0:
                        # A tlog should always be present
                        result_handler.failure('{0} has no open tlog'.format(identifier_log), code=ErrorCodes.tlog_not_found)
                        continue
                    # The explicit emptiness test keeps the indexing below safe when min_tlx_amount is 0 or negative
                    if len(tlx_files) == 0 or len(tlx_files) < min_tlx_amount:
                        result_handler.skip('{0} only has {1} tlx, not worth collapsing (required: {2})'.format(identifier_log, len(tlx_files), min_tlx_amount))
                        continue
                    # Compare youngest tlog and oldest tlx timestamp
                    # NOTE(review): assumes both lists are ordered oldest first with the timestamp at index 0 - confirm against _retrieve_stats
                    seconds_difference = int(tlog_files[-1][0]) - int(tlx_files[0][0])
                    if max_age_seconds > seconds_difference:
                        result_handler.success('{0} should not be collapsed. The oldest tlx is at least {1} days younger than the youngest tlog (actual age: {2})'.format(identifier_log, max_collapse_age, str(timedelta(seconds=seconds_difference))),
                                               code=ErrorCodes.collapse_ok)
                    else:
                        result_handler.failure('{0} should be collapsed. The oldest tlx is currently {1} old'.format(identifier_log, str(timedelta(seconds=seconds_difference))), code=ErrorCodes.collapse_not_ok)
コード例 #8
0
 def check_arakoon_ports(cls, result_handler):
     """
     Verifies that the Arakoon clusters still respond to connections
     Per node, either the errors gathered while testing the connection are reported, or a success/failure
     depending on whether the connection test returned True
     :param result_handler: logging object
     :type result_handler: ovs.extensions.healthcheck.result.HCResults
     :return: None
     :rtype: NoneType
     """
     arakoon_clusters = cls._get_arakoon_clusters(result_handler)
     result_handler.info('Starting Arakoon ports test.', add_to_result=False)
     result_handler.info('Retrieving all connection statistics. This might take a while', add_to_result=False)
     start = time.time()
     arakoon_stats = cls._get_port_connections(result_handler, arakoon_clusters)
     result_handler.info('Retrieving all connection statistics succeeded (duration: {0})'.format(time.time() - start), add_to_result=False)
     for cluster_type, clusters in arakoon_stats.iteritems():
         result_handler.info('Testing the connections of {0} Arakoons'.format(cluster_type), add_to_result=False)
         for cluster in clusters:
             cluster_name = cluster['cluster_name']
             connection_result = cluster['connection_result']
             # Report the nodes in a stable order, sorted on their IP
             connection_result = OrderedDict(sorted(connection_result.items(), key=lambda item: ExtensionsToolbox.advanced_sort(item[0].ip, separator='.')))
             for node, stats in connection_result.iteritems():
                 identifier_log = 'Arakoon cluster {0} on node {1}'.format(cluster_name, node.ip)
                 if len(stats['errors']) > 0:
                     # Determine where issues were found
                     for step, exception in stats['errors']:
                         if step == 'test_connection':
                             try:
                                 # Raise the stored exception again so the logger.exception call below
                                 # runs inside an active exception handler
                                 raise exception
                             except Exception:
                                 message = 'Connection to {0} could not be established due to an unhandled exception.'.format(identifier_log)
                                 cls.logger.exception(message)
                                 result_handler.exception(message, code=ErrorCodes.unhandled_exception)
                     # Errors were recorded for this node, so there is no connection result to judge
                     continue
                 # Only an explicit True counts as an established connection
                 if stats['result'] is True:
                     result_handler.success('Connection established to {0}'.format(identifier_log),
                                            code=ErrorCodes.arakoon_connection_ok)
                 else:
                     result_handler.failure('Connection could not be established to {0}'.format(identifier_log),
                                            code=ErrorCodes.arakoon_connection_failure)
コード例 #9
0
    def check_arakoon_fd(cls,
                         result_handler,
                         fd_limit=30,
                         passed_connections=None):
        """
        Checks all current open tcp file descriptors for all Arakoon clusters in the OVS cluster
        Will raise warnings when these reach a certain threshold (80% and 95% of fd_limit)
        :param result_handler: Logging object
        :type result_handler: ovs.extensions.healthcheck.result.HCResults
        :param fd_limit: Threshold for the number of tcp connections for which to start logging warnings
        :type fd_limit: int
        :param passed_connections: checked TCP connections (defaults to ESTABLISHED and TIME_WAIT)
        :type passed_connections: list
        :return: None
        :rtype: NoneType
        """
        if passed_connections is None:
            passed_connections = ['ESTABLISHED', 'TIME_WAIT']
        warning_threshold = fd_limit * 80 / 100
        error_threshold = fd_limit * 95 / 100

        result_handler.info('Starting Arakoon integrity test',
                            add_to_result=False)
        arakoon_clusters = cls._get_arakoon_clusters(result_handler)
        start = time.time()
        arakoon_fd_results = cls._get_filedescriptors(result_handler,
                                                      arakoon_clusters)
        result_handler.info(
            'Retrieving all file descriptor information succeeded (duration: {0})'
            .format(time.time() - start),
            add_to_result=False)
        for cluster_type, clusters in arakoon_fd_results.iteritems():
            result_handler.info(
                'Checking the file descriptors of {0} Arakoons'.format(
                    cluster_type),
                add_to_result=False)
            for cluster in clusters:
                cluster_name = cluster['cluster_name']
                fd_result = cluster['fd_result']
                # Report the nodes in a stable order, sorted on their IP
                fd_result = OrderedDict(
                    sorted(fd_result.items(),
                           key=lambda item: ExtensionsToolbox.advanced_sort(
                               item[0].ip, separator='.')))
                for node, stats in fd_result.iteritems():
                    identifier_log = 'Arakoon cluster {0} on node {1}'.format(
                        cluster_name, node.ip)
                    if len(stats['errors']) > 0:
                        # Determine where issues were found
                        for step, exception in stats['errors']:
                            if step == 'build_client':
                                try:
                                    # Raise the stored exception again so it can be dispatched on its type and so the
                                    # logger.exception call below runs inside an active exception handler
                                    raise exception
                                except TimeOutException:
                                    result_handler.warning(
                                        'Connection to {0} has timed out'.
                                        format(identifier_log),
                                        code=ErrorCodes.ssh_connection_time)
                                except (socket.error,
                                        UnableToConnectException):
                                    result_handler.failure(
                                        'Connection to {0} could not be established'
                                        .format(identifier_log),
                                        code=ErrorCodes.ssh_connection_fail)
                                except NotAuthenticatedException:
                                    result_handler.skip(
                                        'Connection to {0} could not be authenticated. This node has no access to the Arakoon node.'
                                        .format(identifier_log),
                                        code=ErrorCodes.
                                        ssh_connection_authentication)
                                except Exception:
                                    message = 'Connection to {0} could not be established due to an unhandled exception.'.format(
                                        identifier_log)
                                    cls.logger.exception(message)
                                    result_handler.exception(
                                        message,
                                        code=ErrorCodes.unhandled_exception)
                            elif step == 'lsof':
                                try:
                                    raise exception
                                except Exception:
                                    message = 'Unable to list the file descriptors for {0}'.format(
                                        identifier_log)
                                    cls.logger.exception(message)
                                    # Pass the code by keyword, like every other reporting call does
                                    result_handler.exception(
                                        message,
                                        code=ErrorCodes.unhandled_exception)
                        # No usable file descriptor listing for this node, go to the next one
                        continue
                    fds = stats['result']['fds']
                    # Keep the entries whose last column, stripped of parentheses, is one of the checked states
                    # NOTE(review): presumably every entry is a line of lsof output - confirm against _get_filedescriptors
                    filtered_fds = []
                    for fd_entry in fds:
                        columns = fd_entry.split()
                        if not columns:
                            # A blank entry has no state column; indexing it would raise an IndexError
                            continue
                        if columns[-1].strip('(').strip(')') in passed_connections:
                            filtered_fds.append(fd_entry)
                    if len(filtered_fds) >= warning_threshold:
                        if len(filtered_fds) >= error_threshold:
                            result_handler.warning(
                                'Number of TCP connections exceeded the 95% warning threshold for {0}, ({1}/{2})'
                                .format(identifier_log, len(filtered_fds),
                                        fd_limit),
                                code=ErrorCodes.arakoon_fd_95)
                        else:
                            result_handler.warning(
                                'Number of TCP connections exceeded the 80% warning threshold for {0}, ({1}/{2})'
                                .format(identifier_log, len(filtered_fds),
                                        fd_limit),
                                code=ErrorCodes.arakoon_fd_80)
                    else:
                        result_handler.success(
                            'Number of TCP connections for {0} is healthy ({1}/{2})'
                            .format(identifier_log, len(filtered_fds),
                                    fd_limit),
                            code=ErrorCodes.arakoon_fd_ok)
コード例 #10
0
    def check_collapse(cls,
                       result_handler,
                       max_collapse_age=3,
                       min_tlx_amount=10):
        """
        Verifies collapsing has occurred for all Arakoons
        Per node, the errors gathered while retrieving the statistics are reported first. When there are none,
        the spare space for a local collapse is compared with the head.db size and the time between the oldest tlx
        and the youngest tlog is compared with max_collapse_age
        :param result_handler: logging object
        :type result_handler: ovs.extensions.healthcheck.result.HCResults
        :param max_collapse_age: tlx files may not be older than x days
        :type max_collapse_age: int
        :param min_tlx_amount: Minimum amount of tlxes before making collapsing mandatory (defaults to 10)
        :type min_tlx_amount: int
        :return: None
        :rtype: NoneType
        """
        arakoon_clusters = cls._get_arakoon_clusters(result_handler)
        result_handler.info('Starting Arakoon collapse test',
                            add_to_result=False)
        max_age_seconds = timedelta(days=max_collapse_age).total_seconds()
        result_handler.info(
            'Retrieving all collapsing statistics. This might take a while',
            add_to_result=False)
        start = time.time()
        arakoon_stats = cls._retrieve_stats(result_handler, arakoon_clusters)
        result_handler.info(
            'Retrieving all collapsing statistics succeeded (duration: {0})'.
            format(time.time() - start),
            add_to_result=False)
        for cluster_type, clusters in arakoon_stats.iteritems():
            result_handler.info(
                'Testing the collapse of {0} Arakoons'.format(cluster_type),
                add_to_result=False)
            for cluster in clusters:
                cluster_name = cluster['cluster_name']
                collapse_result = cluster['collapse_result']
                # Report the nodes in a stable order, sorted on their IP
                collapse_result = OrderedDict(
                    sorted(collapse_result.items(),
                           key=lambda item: ExtensionsToolbox.advanced_sort(
                               item[0].ip, separator='.')))
                for node, stats in collapse_result.iteritems():
                    identifier_log = 'Arakoon cluster {0} on node {1}'.format(
                        cluster_name, node.ip)
                    if len(stats['errors']) > 0:
                        # Determine where issues were found
                        for step, exception in stats['errors']:
                            if step == 'build_client':
                                try:
                                    # Raise the stored exception again so it can be dispatched on its type and so the
                                    # logger.exception call below runs inside an active exception handler
                                    raise exception
                                except TimeOutException:
                                    result_handler.warning(
                                        'Connection to {0} has timed out'.
                                        format(identifier_log),
                                        code=ErrorCodes.ssh_connection_time)
                                except (socket.error,
                                        UnableToConnectException):
                                    result_handler.failure(
                                        'Connection to {0} could not be established'
                                        .format(identifier_log),
                                        code=ErrorCodes.ssh_connection_fail)
                                except NotAuthenticatedException:
                                    result_handler.skip(
                                        'Connection to {0} could not be authenticated. This node has no access to the Arakoon node.'
                                        .format(identifier_log),
                                        code=ErrorCodes.
                                        ssh_connection_authentication)
                                except Exception:
                                    message = 'Connection to {0} could not be established due to an unhandled exception.'.format(
                                        identifier_log)
                                    cls.logger.exception(message)
                                    result_handler.exception(
                                        message,
                                        code=ErrorCodes.unhandled_exception)
                            elif step == 'stat_dir':
                                try:
                                    raise exception
                                except Exception:
                                    message = 'Unable to list the contents of the tlog directory ({0}) for {1}'.format(
                                        node.tlog_dir, identifier_log)
                                    cls.logger.exception(message)
                                    result_handler.exception(
                                        message,
                                        code=ErrorCodes.unhandled_exception)
                        # No usable statistics for this node, go to the next one
                        continue
                    tlx_files = stats['result']['tlx']
                    tlog_files = stats['result']['tlog']
                    headdb_files = stats['result']['headDB']
                    avail_size = stats['result']['avail_size']

                    if any(item is None
                           for item in [tlx_files, tlog_files, avail_size]):
                        # Exception occurred but no errors were logged
                        result_handler.exception(
                            'Either the tlx or tlog files or available size could not be found in/of the tlog directory ({0}) for {1}'
                            .format(node.tlog_dir, identifier_log),
                            code=ErrorCodes.tlx_tlog_not_found)
                        continue
                    # headDB is not part of the None check above, so a missing value is treated like 'no head.db'
                    if headdb_files:
                        headdb_size = sum([int(i[2]) for i in headdb_files])
                        collapse_size_msg = 'Spare space for local collapse is'
                        if avail_size >= headdb_size * 4:
                            result_handler.success(
                                '{0} sufficient (n > 4x head.db size)'.format(
                                    collapse_size_msg))
                        elif avail_size >= headdb_size * 3:
                            result_handler.warning(
                                '{0} running short (n > 3x head.db size)'.
                                format(collapse_size_msg))
                        elif avail_size >= headdb_size * 2:
                            result_handler.failure(
                                '{0} just enough (n > 2x head.db size)'.format(
                                    collapse_size_msg))
                        else:
                            result_handler.failure(
                                '{0} insufficient (n < 2x head.db size)'.format(
                                    collapse_size_msg))

                    if len(tlog_files) == 0:
                        # A tlog should always be present
                        result_handler.failure(
                            '{0} has no open tlog'.format(identifier_log),
                            code=ErrorCodes.tlog_not_found)
                        continue
                    # The explicit emptiness test keeps the indexing below safe when min_tlx_amount is 0 or negative
                    if len(tlx_files) == 0 or len(tlx_files) < min_tlx_amount:
                        result_handler.skip(
                            '{0} only has {1} tlx, not worth collapsing (required: {2})'
                            .format(identifier_log, len(tlx_files),
                                    min_tlx_amount))
                        continue
                    # Compare youngest tlog and oldest tlx timestamp
                    # NOTE(review): assumes both lists are ordered oldest first with the timestamp at index 0 - confirm against _retrieve_stats
                    seconds_difference = int(tlog_files[-1][0]) - int(
                        tlx_files[0][0])
                    if max_age_seconds > seconds_difference:
                        result_handler.success(
                            '{0} should not be collapsed. The oldest tlx is at least {1} days younger than the youngest tlog (actual age: {2})'
                            .format(
                                identifier_log, max_collapse_age,
                                str(timedelta(seconds=seconds_difference))),
                            code=ErrorCodes.collapse_ok)
                    else:
                        result_handler.failure(
                            '{0} should be collapsed. The oldest tlx is currently {1} old'
                            .format(
                                identifier_log,
                                str(timedelta(seconds=seconds_difference))),
                            code=ErrorCodes.collapse_not_ok)
コード例 #11
0
    def get_primary_and_secondary_storagerouters(self):
        # type: () -> Tuple[List[StorageRouter], List[StorageRouter]]
        """
        Determine which StorageRouters are eligible for MDS deployment for this vDisk
        The candidates come from the domains of the StorageRouter hosting the vDisk, minus the excluded
        StorageRouters, limited to the StorageRouters serving the vDisk's vPool. Both lists are sorted on IP
        :raises RuntimeError: When the StorageRouter hosting the vDisk is not among the primary StorageRouters
        :return: Both primary and secondary storagerouters
        :rtype: Tuple[List[StorageRouter], List[StorageRouter]]
        """
        vdisk = self.vdisk
        host = StorageRouter(vdisk.storagerouter_guid)

        def _domains_with_backup_flag(flag):
            # Domains linked to the hosting StorageRouter whose junction has exactly this backup flag
            return [link.domain for link in host.domains if link.backup is flag]

        def _storagerouters_of(domains):
            # Union of the StorageRouters having one of the given domains as primary domain
            pool = set()
            for domain in domains:
                pool.update(StorageRouterList.get_primary_storagerouters_for_domain(domain))
            return pool

        regular_domains = _domains_with_backup_flag(False)
        backup_domains = _domains_with_backup_flag(True)
        primaries = _storagerouters_of(regular_domains)
        secondaries = _storagerouters_of(backup_domains)

        # Without any configured domain, every StorageRouter is a primary candidate
        if not primaries:
            primaries = set(StorageRouterList.get_storagerouters())

        # Excluded StorageRouters never qualify
        primaries = primaries.difference(self.excluded_storagerouters)

        # A secondary may neither be a primary nor be excluded
        secondaries = secondaries.difference(primaries)
        secondaries = secondaries.difference(self.excluded_storagerouters)

        # Only the StorageRouters serving the vPool of this vDisk are relevant
        vpool_storagerouters = [sd.storagerouter for sd in vdisk.vpool.storagedrivers if sd.storagerouter is not None]
        primaries = list(primaries.intersection(vpool_storagerouters))
        secondaries = list(secondaries.intersection(vpool_storagerouters))

        if host not in primaries:
            raise RuntimeError('Host of vDisk {0} ({1}) should be part of the primary domains'.format(vdisk.name, host.name))

        def _by_ip(storagerouter):
            return ExtensionsToolbox.advanced_sort(element=storagerouter.ip, separator='.')

        primaries.sort(key=_by_ip)
        secondaries.sort(key=_by_ip)

        # Log every group with its own message template
        reports = (('vDisk {0} - Primary StorageRouter {1} with IP {2}', primaries),
                   ('vDisk {0} - Secondary StorageRouter {1} with IP {2}', secondaries),
                   ('vDisk {0} - Excluded StorageRouter {1} with IP {2}', self.excluded_storagerouters))
        for template, storagerouters in reports:
            for storagerouter in storagerouters:
                self._logger.debug(template.format(vdisk.guid, storagerouter.name, storagerouter.ip))

        return primaries, secondaries
コード例 #12
0
    def _post_update_alba_plugin_alba(cls, components):
        """
        Execute some functionality after the ALBA plugin packages have been updated for the ASD manager nodes
        Runs the ALBA migrations, restarts the services listed under 'services_post_update' on every ASD node
        and schedules a checkup of the maintenance agents. Does nothing when the ALBA component was not updated
        :param components: Update components which have been executed
        :type components: list
        :return: None
        :rtype: NoneType
        """
        if PackageFactory.COMP_ALBA not in components:
            return

        # First run post-update migrations to update services, config mgmt, ... and restart services afterwards
        # The import happens once, so a missing module is reported once and an ImportError raised inside a
        # migration is logged as a migration failure instead of as a failed import
        try:
            # noinspection PyUnresolvedReferences
            from ovs.lib.albamigration import AlbaMigrationController
        except ImportError:
            AlbaMigrationController = None
            cls._logger.error('Could not import AlbaMigrationController')
        if AlbaMigrationController is not None:
            for migration_name in ['migrate', 'migrate_sdm']:
                try:
                    cls._logger.debug(
                        'Executing migration code: AlbaMigrationController.{0}()'.
                        format(migration_name))
                    getattr(AlbaMigrationController, migration_name)()
                except Exception:
                    # A failing migration must not prevent the service restarts below
                    cls._logger.exception(
                        'Migration code for the ALBA plugin failed to be executed')

        # Update ALBA nodes
        method_name = inspect.currentframe().f_code.co_name
        cls._logger.info('Executing hook {0}'.format(method_name))
        alba_nodes = sorted(
            AlbaNodeList.get_albanodes_by_type(AlbaNode.NODE_TYPES.ASD),
            key=lambda an: ExtensionsToolbox.advanced_sort(element=an.ip,
                                                           separator='.'))
        for alba_node in alba_nodes:
            services_to_restart = []
            for component in components:
                if component not in alba_node.package_information:
                    continue

                component_info = alba_node.package_information[component]
                if 'services_post_update' not in component_info:
                    # Package_information still has the old format, so refresh update information
                    # This can occur when updating from earlier than 2.11.0 to 2.11.0 and older
                    try:
                        GenericController.refresh_package_information()
                    except Exception:
                        # Best effort: log and carry on, but let KeyboardInterrupt / SystemExit propagate
                        cls._logger.exception(
                            '{0}: Refreshing package information failed'.
                            format(alba_node.ip))
                    # Reload the node so the refreshed package information is read
                    alba_node.discard()
                    component_info = alba_node.package_information.get(
                        component, {})

                # The restart order keys are converted to int so they sort numerically
                services_post_update = dict(
                    (int(key), value) for key, value in component_info.get(
                        'services_post_update', {}).iteritems())
                for restart_order in sorted(services_post_update):
                    for service_name in sorted(
                            services_post_update[restart_order]):
                        # Keep the first occurrence only, a service is restarted once per node
                        if service_name not in services_to_restart:
                            services_to_restart.append(service_name)

            if len(services_to_restart) > 0:
                alba_node.client.restart_services(
                    service_names=services_to_restart)

        # Renew maintenance services
        cls._logger.info('Checkup maintenance agents')
        AlbaController.checkup_maintenance_agents.delay()

        cls._logger.info('Executed hook {0}'.format(method_name))
コード例 #13
0
    def _package_install_plugin_alba(cls, components=None):
        """
        Install the candidate package versions on every ALBA node of type ASD
        The nodes are processed in the order given by ExtensionsToolbox.advanced_sort on their IP
        and, per component, the OVS extensions package is always handled before the other packages
        :param components: Components which have been selected for update (only the ALBA component when None)
        :type components: list
        :return: True when updating at least one package failed, False otherwise
        :rtype: bool
        """
        def _ip_sort_key(node):
            return ExtensionsToolbox.advanced_sort(element=node.ip, separator='.')

        cls._logger.info('Updating packages for ALBA plugin')
        if components is None:
            components = [PackageFactory.COMP_ALBA]

        abort = False
        asd_nodes = AlbaNodeList.get_albanodes_by_type(AlbaNode.NODE_TYPES.ASD)
        for alba_node in sorted(asd_nodes, key=_ip_sort_key):
            cls._logger.debug(
                'ALBA Node {0}: Verifying packages'.format(alba_node.ip))
            for component in components:
                component_info = alba_node.package_information.get(component, {})
                packages = component_info.get('packages', {})

                # Alphabetical order, except that the extensions package moves to the front
                package_names = sorted(packages)
                if PackageFactory.PKG_OVS_EXTENSIONS in package_names:
                    extensions_index = package_names.index(PackageFactory.PKG_OVS_EXTENSIONS)
                    package_names.insert(0, package_names.pop(extensions_index))

                if package_names:
                    cls._logger.debug(
                        'ALBA Node {0}: Packages for component {1}: {2}'.format(
                            alba_node.ip, component, package_names))

                for package_name in package_names:
                    try:
                        versions = packages[package_name]
                        installed = versions['installed']
                        candidate = versions['candidate']

                        current = alba_node.client.update_installed_version_package(
                            package_name=package_name)
                        if candidate == current:
                            # Package has already been installed by another hook
                            continue

                        cls._logger.debug(
                            'ALBA Node {0}: Updating package {1} ({2} --> {3})'.format(
                                alba_node.ip, package_name, installed, candidate))
                        alba_node.client.execute_update(package_name)
                        cls._logger.debug(
                            'ALBA Node {0}: Updated package {1}'.format(
                                alba_node.ip, package_name))
                    except Exception as error:
                        # An aborted connection is not treated as a failure: this error is thrown due the
                        # post-update code of the SDM package which restarts the asd-manager service
                        if isinstance(error, requests.ConnectionError) and 'Connection aborted.' in error.message:
                            continue
                        cls._logger.exception(
                            'ALBA Node {0}: Failed to update package {1}'.format(
                                alba_node.ip, package_name))
                        abort = True

        if not abort:
            cls._logger.info('Updated packages for ALBA plugin')
        return abort
コード例 #14
0
    def monitor_services(cls):
        """
        Continuously show the state of the local OVS services
        Every second the output of 'initctl list' is parsed for entries starting with 'ovs-';
        the overview is only redrawn when it differs from the one printed last.
        A KeyboardInterrupt ends the monitoring silently
        :return: None
        :rtype: NoneType
        """
        def _sort_key(name):
            return ExtensionsToolbox.advanced_sort(name, '_')

        def _format_rows(states, width):
            # One row per service: name, padding up to the longest name, state
            return ['{0} {1} {2}'.format(name, ' ' * (width - len(name)), states[name])
                    for name in sorted(states, key=_sort_key)]

        try:
            last_printed = None
            while True:
                # Collect the state of every OVS service, split in running / non-running
                active = {}
                inactive = {}
                for entry in check_output('initctl list', shell=True).splitlines():
                    if not entry.startswith('ovs-'):
                        continue
                    # Only the part before the first comma is used: '<name> <state>'
                    fields = entry.split(',')[0].split()
                    name, state = fields[0], fields[1]
                    if state == "start/running":
                        active[name] = state
                    else:
                        inactive[name] = state

                # Column width is the longest service name of both groups (0 when there are none)
                width = max([0] + [len(name) for name in active] + [len(name) for name in inactive])

                # Build the complete overview
                screen = ['OVS running processes', '=====================\n']
                screen.extend(_format_rows(active, width))
                screen.extend(['\n\nOVS non-running processes',
                               '=========================\n'])
                screen.extend(_format_rows(inactive, width))

                # Clear the terminal and print the overview (only if changes)
                if screen != last_printed:
                    print('\x1b[2J\x1b[H')
                    for row in screen:
                        print(row)
                    last_printed = screen
                time.sleep(1)
        except KeyboardInterrupt:
            pass