Example #1
    def _proxy_summary(self):
        """
        Returns a summary of the proxies of this StorageDriver
        :return: summary of the proxies
        :rtype: dict
        """
        proxy_info = {'red': 0, 'orange': 0, 'green': 0}
        summary = {'proxies': proxy_info}

        try:
            service_manager = ServiceFactory.get_manager()
            client = SSHClient(self.storagerouter)
        except Exception:
            self._logger.exception('Unable to retrieve necessary clients')
        else:
            for alba_proxy in self.alba_proxies:
                try:
                    service_status = service_manager.get_service_status(
                        alba_proxy.service.name, client)
                except Exception:
                    # A ValueError can occur when the services are still being deployed (the model will be updated before the actual deployment)
                    self._logger.exception(
                        'Unable to retrieve the service status for service {0} of StorageDriver {1}'
                        .format(alba_proxy.service.name, self.guid))
                    proxy_info['red'] += 1
                    continue
                if service_status == 'active':
                    proxy_info['green'] += 1
                elif service_status == 'inactive':
                    proxy_info['orange'] += 1
                else:
                    proxy_info['red'] += 1
        finally:
            return summary
Example #2
 def wait_for_service(client, name, status, logger):
     """
     Wait for service to enter status
     :param client: SSHClient to run commands
     :type client: ovs_extensions.generic.sshclient.SSHClient
     :param name: Name of service
     :type name: str
     :param status: 'active' if running, 'inactive' if halted
     :type status: str
     :param logger: Logging object
     :type logger: ovs.log.log_handler.LogHandler
     :return: None
     :rtype: NoneType
     """
     tries = 10
     service_manager = ServiceFactory.get_manager()
     service_status = service_manager.get_service_status(name, client)
     while tries > 0:
         if service_status == status:
             return
         logger.debug('... waiting for service {0}'.format(name))
         tries -= 1
         time.sleep(10 - tries)
         service_status = service_manager.get_service_status(name, client)
     raise RuntimeError(
         'Service {0} does not have expected status: Expected: {1} - Actual: {2}'
         .format(name, status, service_status))
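A minimal usage sketch for the helper above; the IP address is a placeholder and `logger` is assumed to be an existing LogHandler instance:

 # Hypothetical usage: block until the framework watcher reports 'active'
 client = SSHClient('10.100.1.11', username='root')  # placeholder IP
 wait_for_service(client, 'ovs-watcher-framework', 'active', logger)  # 'logger' is an assumed, pre-existing LogHandler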
Example #3
    def check_rabbitmq_and_enable_ha_mode(client, logger):
        """
        Verify RabbitMQ is running properly and enable HA mode
        :param client: Client on which to check RabbitMQ
        :type client: ovs_extensions.generic.sshclient.SSHClient
        :param logger: Logger object used for logging
        :type logger: ovs.extensions.generic.logger.Logger
        :return: None
        """
        service_manager = ServiceFactory.get_manager()
        if not service_manager.has_service('rabbitmq-server', client):
            raise RuntimeError(
                'Service rabbitmq-server has not been added on node {0}'.
                format(client.ip))
        rabbitmq_running, same_process = service_manager.is_rabbitmq_running(
            client=client)
        if rabbitmq_running is False or same_process is False:
            ServiceFactory.change_service_state(client, 'rabbitmq-server',
                                                'restart', logger)

        time.sleep(5)
        client.run([
            'rabbitmqctl', 'set_policy', 'ha-all', '^(volumerouter|ovs_.*)$',
            '{"ha-mode":"all"}'
        ])
Example #4
 def check_ovs_processes(logger):
     """
     Checks the availability of processes for Open vStorage
     :param logger: logging object
     :type logger: ovs.extensions.healthcheck.result.HCResults
     :return: None
     :rtype: NoneType
     """
     logger.info('Checking local ovs services.')
     client = SSHClient(OpenvStorageHealthCheck.LOCAL_SR)
     service_manager = ServiceFactory.get_manager()
     services = [
         service for service in service_manager.list_services(client=client)
         if service.startswith(OpenvStorageHealthCheck.MODULE)
     ]
     if len(services) == 0:
         logger.warning('Found no local ovs services.')
     for service_name in services:
         if service_manager.get_service_status(service_name,
                                               client) == 'active':
             logger.success('Service {0} is running!'.format(service_name))
         else:
             logger.failure(
                 'Service {0} is not running, please check this.'.format(
                     service_name))
Example #5
    def __init__(self):
        """
        Initializes the client
        """
        # Safe calls
        self._node_id = System.get_my_machine_id().replace(r"'", r"'\''")
        # Alba is currently always installed but the Alba version/package info is located in the SDM section
        self._package_manager = PackageFactory.get_manager()
        self._service_manager = ServiceFactory.get_manager()

        self._service_type = ServiceFactory.get_service_type()
        if self._service_type != 'systemd':
            raise NotImplementedError('Only Systemd is supported')

        # Potential failing calls
        self._cluster_id = self.get_config_key(
            self.LOCATION_CLUSTER_ID,
            fallback=[CONFIG_STORE_LOCATION, 'cluster_id'])
        self.interval = self.get_config_key(
            self.LOCATION_INTERVAL,
            fallback=[self.FALLBACK_CONFIG, self.KEY_INTERVAL],
            default=self.DEFAULT_INTERVAL)
        self._openvpn_service_name = 'openvpn@ovs_{0}-{1}'.format(
            self._cluster_id, self._node_id)

        # Calls to look out for. These could still be None when using them
        self._storagerouter = None
        self._client = None
        self._set_storagerouter()
        self._set_client()

        # Safe call, start caching
        self.caching = SupportAgentCache(self)
Example #6
    def override_scheduletasks(configuration):
        """
        Override the scheduled tasks crontab with your own configuration
        :param configuration: configuration to override scheduled tasks
        :type configuration: dict
        :return: True when the configuration was applied and all services were restarted, False otherwise
        :rtype: bool
        """
        service_name = 'ovs-watcher-framework'
        Configuration.set(CelerySetup.SCHEDULED_TASK_CFG, configuration)
        fetched_cfg = Configuration.get(CelerySetup.SCHEDULED_TASK_CFG,
                                        configuration)
        if cmp(fetched_cfg, configuration) == 0:
            # restart ovs-watcher-framework on all nodes
            for sr_ip in StoragerouterHelper.get_storagerouter_ips():
                client = SSHClient(sr_ip, username='******')
                service_manager = ServiceFactory.get_manager()
                try:
                    service_manager.restart_service(service_name, client)
                except Exception:
                    return False
            CelerySetup.LOGGER.info(
                "Successfully restarted all `{0}` services!".format(
                    service_name))
            return True
        else:
            CelerySetup.LOGGER.warning(
                "`{0}` config is `{1}` but should be `{2}`".format(
                    CelerySetup.SCHEDULED_TASK_CFG, fetched_cfg,
                    configuration))
            return False
Example #7
 def check_alba_processes(result_handler):
     """
     Checks the availability of processes for Alba
     :param result_handler: logging object
     :type result_handler: ovs.extensions.healthcheck.result.HCResults
     :return: None
     :rtype: NoneType
     """
     result_handler.info('Checking LOCAL ALBA services: ',
                         add_to_result=False)
     client = SSHClient(AlbaHealthCheck.LOCAL_SR)
     service_manager = ServiceFactory.get_manager()
     services = [
         service for service in service_manager.list_services(client=client)
         if service.startswith(AlbaHealthCheck.MODULE)
     ]
     if len(services) == 0:
         result_handler.skip('Found no LOCAL ALBA services.')
         return
     for service_name in services:
         if service_manager.get_service_status(service_name,
                                               client) == 'active':
             result_handler.success(
                 'Service {0} is running!'.format(service_name))
         else:
             result_handler.failure(
                 'Service {0} is NOT running! '.format(service_name))
Example #8
 def idle_till_ovs_is_up(ip, username, password=None, connection_timeout=300, service_timeout=60, logger=LOGGER):
     """
     Wait until a node is back up and all OVS-related services are running (or potentially stuck)
     :param ip: ip of the node
     :param username: username to login with
     :param password: password to login with
     :param connection_timeout: raise when not online after these seconds
     :param service_timeout: poll for x seconds when checking services
     :param logger: logging instance
     :raise RuntimeError: when the timeout has been reached
     :return: dict with services mapped by their state
     """
     # neutral_states = ['inactive', 'deactivating']
     failed_states = ['failed', 'error']
     active_states = ['active', 'reloading']
     activating_state = 'activating'
     start_time = time.time()
     client = None
     while client is None:
         delta = time.time() - start_time
         if delta > connection_timeout:
             raise RuntimeError('Idling has timed out after {0}s'.format(delta))
         try:
             client = SSHClient(ip, username=username, password=password)
         except Exception:
             logger.debug('Could not establish a connection yet to {0} after {1}s'.format(ip, delta))
         time.sleep(1)
     service_manager = ServiceFactory.get_manager()
     ovs_services = [service for service in service_manager.list_services(client) if service.startswith('ovs-')]
     active_services = []
     failed_service = []
     activating_services = []
     # Initially classify these services
     for service in ovs_services:
         logger.debug('Initially classifying {0}'.format(service))
         service_state = service_manager.get_service_status(service, client)
         logger.debug('Service {0} - State {1}'.format(service, service_state))
         if service_state in failed_states:
             failed_service.append(service)
         elif service_state in active_states:
             active_services.append(service)
         elif service_state == activating_state:
             activating_services.append(service)
         else:
             logger.error('Unable to process service state {0}'.format(service_state))
     start_time = time.time()
     while len(activating_services) > 0:
         if time.time() - start_time > service_timeout:
             break
         service = activating_services.pop()
         service_state = service_manager.get_service_status(service, client)
         if service_state in failed_states:
             failed_service.append(service)
         elif service_state in active_states:
             active_services.append(service)
         elif service_state == activating_state:
             activating_services.append(service)
     return {'active': active_services, 'failed': failed_service, 'activating': activating_services}
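A hedged sketch of how the returned mapping might be consumed after rebooting a node (the IP and credentials are placeholders):

 # Hypothetical usage: wait for the node and report services that are not healthy
 states = idle_till_ovs_is_up('10.100.1.12', username='root', password=None, logger=LOGGER)
 if states['failed'] or states['activating']:
     LOGGER.warning('Some OVS services did not come up cleanly: {0}'.format(states))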
Example #9
 def _get_filedescriptors(cls,
                          result_handler,
                          arakoon_clusters,
                          batch_size=10):
     """
     Retrieve tlog/tlx stat information for an Arakoon cluster concurrently
     Note: this will mutate the given arakoon_clusters dict
     :param result_handler: logging object
     :type result_handler: ovs.extensions.healthcheck.result.HCResults
     :param arakoon_clusters: Information about all Arakoon clusters, sorted by type and given config
     :type arakoon_clusters: dict
     :param batch_size: Amount of workers to collect the Arakoon information.
     Every worker means a connection towards a different node
     :return: Dict with file descriptors contents for every node config
     :rtype: dict
     """
     queue = Queue.Queue()
     clients = {}
     # Prep work
     for cluster_type, clusters in arakoon_clusters.iteritems():
         for cluster in clusters:
             cluster_name = cluster['cluster_name']
             arakoon_config = cluster['config']
             cluster['fd_result'] = {}
             for node_config in arakoon_config.nodes:
                 result = {'errors': [], 'result': {'fds': []}}
                 # Build SSHClients outside the threads to avoid GIL
                 try:
                     client = clients.get(node_config.ip)
                     if client is None:
                         client = SSHClient(node_config.ip, timeout=5)
                         clients[node_config.ip] = client
                 except Exception as ex:
                     result['errors'].append(('build_client', ex))
                     continue
                 cluster['fd_result'][node_config] = result
                 queue.put((cluster_name, node_config, result))
     service_manager = ServiceFactory.get_manager()
     # Limit to one session for every node.
     # Every process will fork from this one, creating a new session instead of using the already existing channel
     # There might be an issue if an SSH session takes too long, causing all workers to connect to that one node
     # and therefore hitting the MaxSessions again (theory)
     for _ in xrange(min(len(clients.keys()), batch_size)):
         thread = Thread(target=cls._fd_worker,
                         args=(queue, clients, result_handler,
                               service_manager))
         thread.setDaemon(
             True
         )  # Setting threads as "daemon" allows main program to exit eventually even if these don't finish correctly.
         thread.start()
     # Wait for all results
     queue.join()
     return arakoon_clusters
Example #10
 def _roll_out_dtl_services(vpool, storagerouters):
     """
     Deploy and start the DTL service on all given StorageRouters
     :param vpool: vPool for which to deploy the DTL service
     :param storagerouters: StorageRouters to deploy and start a DTL service on
     :return: None
     """
     service_manager = ServiceFactory.get_manager()
     service_name = 'dtl_{0}'.format(vpool.name)
     for sr in storagerouters.values():
         client = SSHClient(sr, 'root')
         service_manager.add_service(name=service_name, client=client)
         service_manager.start_service(name=service_name, client=client)
Example #11
    def change_config(storagedriver, config):
        """
        Change the config of the volumedriver and reload the config.
        A restart will be triggered if no vDisks are running on the volumedriver.
        :param storagedriver: StorageDriver object
        :type storagedriver: StorageDriver
        :param config: Volumedriver config
        :type config: dict
        :return: None
        """
        service_manager = ServiceFactory.get_manager()
        config_key = '/ovs/vpools/{0}/hosts/{1}/config'.format(
            storagedriver.vpool.guid, storagedriver.name)
        current_config = Configuration.get(config_key)

        if 'volume_manager' in config:
            volume_manager = current_config['volume_manager']
            for key, value in config['volume_manager'].iteritems():
                volume_manager[key] = value

        if 'backend_connection_manager' in config:
            backend_connection_manager = current_config[
                'backend_connection_manager']
            for key, value in config['backend_connection_manager'].iteritems():
                if key == 'proxy':
                    for current_config_key, current_config_value in backend_connection_manager.iteritems():
                        if current_config_key.isdigit():
                            for proxy_key, proxy_config in config['backend_connection_manager']['proxy'].iteritems():
                                current_config_value[proxy_key] = proxy_config

                else:
                    backend_connection_manager[key] = value
        StoragedriverHelper.LOGGER.info("New config: {0}".format(
            json.dumps(current_config, indent=4)))
        Configuration.set(config_key,
                          json.dumps(current_config, indent=4),
                          raw=True)
        client = SSHClient(storagedriver.storagerouter, 'root')
        service_name = 'ovs-volumedriver_{0}'.format(storagedriver.vpool.name)

        if len(storagedriver.vdisks_guids) == 0:
            StoragedriverHelper.LOGGER.info(
                "Restarting service: {0}".format(service_name))
            service_manager.restart_service(service_name, client)
        else:
            StoragedriverHelper.LOGGER.info(
                "Not restarting service: {0}, amount of vdisks: {1}".format(
                    service_name, len(storagedriver.vdisks_guids)))
Example #12
 def get_non_running_ovs_services(client):
     """
     Get all non-running OVS services
     :param client: SSHClient instance
     :return: list of non running ovs services
     :rtype: list
     """
     non_running_ovs_services = []
     service_manager = ServiceFactory.get_manager()
     for service in service_manager.list_services(client):
         if not service.startswith('ovs-'):
             continue
         if service_manager.get_service_status(service, client) != 'active':
             non_running_ovs_services.append(service)
     return non_running_ovs_services
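A small usage sketch combining the helper with the SSHClient seen in the other examples (placeholder IP):

 # Hypothetical usage: list every ovs-* service that is not 'active' on a node
 client = SSHClient('10.100.1.13', username='root')  # placeholder IP
 for service in get_non_running_ovs_services(client):
     print 'Service {0} is not running'.format(service)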
Example #13
    def install_plugins():
        """
        (Re)load plugins
        """
        manager = ServiceFactory.get_manager()
        if manager.has_service('ovs-watcher-framework',
                               SSHClient('127.0.0.1', username='******')):
            # If the watcher is running, 'ovs setup' was executed and we need to restart everything to load
            # the plugin. In the other case, the plugin will be loaded once 'ovs setup' is executed
            print 'Installing plugin into Open vStorage'
            from ovs.dal.lists.storagerouterlist import StorageRouterList
            clients = {}
            masters = StorageRouterList.get_masters()
            slaves = StorageRouterList.get_slaves()
            try:
                for sr in masters + slaves:
                    clients[sr] = SSHClient(sr, username='******')
            except UnableToConnectException:
                raise RuntimeError('Not all StorageRouters are reachable')
            memcached = 'memcached'
            watcher = 'watcher-framework'
            for sr in masters + slaves:
                if manager.has_service(watcher, clients[sr]):
                    print '- Stopping watcher on {0} ({1})'.format(
                        sr.name, sr.ip)
                    manager.stop_service(watcher, clients[sr])
            for sr in masters:
                print '- Restarting memcached on {0} ({1})'.format(
                    sr.name, sr.ip)
                manager.restart_service(memcached, clients[sr])
            for sr in masters + slaves:
                if manager.has_service(watcher, clients[sr]):
                    print '- Starting watcher on {0} ({1})'.format(
                        sr.name, sr.ip)
                    manager.start_service(watcher, clients[sr])

            print '- Execute model migrations'
            from ovs.dal.helpers import Migration
            Migration.migrate()

            from ovs.lib.helpers.toolbox import Toolbox
            ip = System.get_my_storagerouter().ip
            functions = Toolbox.fetch_hooks('plugin', 'postinstall')
            if len(functions) > 0:
                print '- Execute post installation scripts'
            for fct in functions:
                fct(ip=ip)
            print 'Installing plugin into Open vStorage: Completed'
Example #14
 def _get_filedescriptors(cls, result_handler, arakoon_clusters, batch_size=10):
     """
     Retrieve tlog/tlx stat information for an Arakoon cluster concurrently
     Note: this will mutate the given arakoon_clusters dict
     :param result_handler: logging object
     :type result_handler: ovs.extensions.healthcheck.result.HCResults
     :param arakoon_clusters: Information about all Arakoon clusters, sorted by type and given config
     :type arakoon_clusters: dict
     :param batch_size: Amount of workers to collect the Arakoon information.
     Every worker means a connection towards a different node
     :return: Dict with file descriptors contents for every node config
     :rtype: dict
     """
     queue = Queue.Queue()
     clients = {}
     # Prep work
     for cluster_type, clusters in arakoon_clusters.iteritems():
         for cluster in clusters:
             cluster_name = cluster['cluster_name']
             arakoon_config = cluster['config']
             cluster['fd_result'] = {}
             for node_config in arakoon_config.nodes:
                 result = {'errors': [],
                           'result': {'fds': []}}
                 # Build SSHClients outside the threads to avoid GIL
                 try:
                     client = clients.get(node_config.ip)
                     if client is None:
                         client = SSHClient(node_config.ip, timeout=5)
                         clients[node_config.ip] = client
                 except Exception as ex:
                     result['errors'].append(('build_client', ex))
                     continue
                 cluster['fd_result'][node_config] = result
                 queue.put((cluster_name, node_config, result))
     service_manager = ServiceFactory.get_manager()
     # Limit to one session for every node.
     # Every process will fork from this one, creating a new session instead of using the already existing channel
     # There might be an issue if an SSH session takes too long, causing all workers to connect to that one node
     # and therefore hitting the MaxSessions again (theory)
     for _ in xrange(min(len(clients.keys()), batch_size)):
         thread = Thread(target=cls._fd_worker, args=(queue, clients, result_handler, service_manager))
         thread.setDaemon(True)  # Setting threads as "daemon" allows main program to exit eventually even if these don't finish correctly.
         thread.start()
     # Wait for all results
     queue.join()
     return arakoon_clusters
Example #15
    def __init__(self, ip):
        """
        Create RabbitMQ object

        :param ip: IP of the server
        :type ip: str
        """
        # check if rabbitmq is available on the ip
        if not RabbitMQ._check_rabbitmq_ip(ip):
            raise ValueError('RabbitMQ on {0} could not be found.'.format(ip))
        self._service_manager = ServiceFactory.get_manager()
        self.ip = ip
        if RabbitMQ.INTERNAL:
            self._storagerouter = StorageRouterList.get_by_ip(ip)
            self._client = SSHClient(ip, username='******')

        if not self.check_management_plugin():
            self.enable_management_plugin()
Example #16
    def __init__(self, ip):
        """
        Create RabbitMQ object

        :param ip: IP of the server
        :type ip: str
        """
        # check if rabbitmq is available on the ip
        if not RabbitMQ._check_rabbitmq_ip(ip):
            raise ValueError('RabbitMQ on {0} could not be found.'.format(ip))
        self._service_manager = ServiceFactory.get_manager()
        self.ip = ip
        if RabbitMQ.INTERNAL:
            self._storagerouter = StorageRouterList.get_by_ip(ip)
            self._client = SSHClient(ip, username='******')

        if not self.check_management_plugin():
            self.enable_management_plugin()
Example #17
    def add_services(client, node_type, logger):
        """
        Add the services required by the OVS cluster
        :param client: Client on which to add the services
        :type client: ovs_extensions.generic.sshclient.SSHClient
        :param node_type: Type of node ('master' or 'extra')
        :type node_type: str
        :param logger: Logger object used for logging
        :type logger: ovs.extensions.generic.logger.Logger
        :return: None
        """
        Toolbox.log(logger=logger, messages='Adding services')
        service_manager = ServiceFactory.get_manager()
        services = {}
        worker_queue = System.get_my_machine_id(client=client)
        if node_type == 'master':
            worker_queue += ',ovs_masters'
            services.update({
                'memcached': {
                    'MEMCACHE_NODE_IP': client.ip,
                    'WORKER_QUEUE': worker_queue
                },
                'rabbitmq-server': {
                    'MEMCACHE_NODE_IP': client.ip,
                    'WORKER_QUEUE': worker_queue
                },
                'scheduled-tasks': {},
                'webapp-api': {},
                'volumerouter-consumer': {}
            })
        services.update({
            'workers': {
                'WORKER_QUEUE': worker_queue
            },
            'watcher-framework': {}
        })

        for service_name, params in services.iteritems():
            if not service_manager.has_service(service_name, client):
                Toolbox.log(logger=logger,
                            messages='Adding service {0}'.format(service_name))
                service_manager.add_service(name=service_name,
                                            params=params,
                                            client=client)
Example #18
 def configure_proxy(backend_name, proxy_configuration):
     """
     Change the configuration of every ALBA proxy that uses the given backend and restart those proxies
     :param backend_name: Name of the backend whose proxies must be reconfigured
     :type backend_name: str
     :param proxy_configuration: Configuration keys/values to apply on top of the current proxy configuration
     :type proxy_configuration: dict
     :return: None
     """
     faulty_keys = [
         key for key in proxy_configuration.keys()
         if key not in ProxySetup.PARAMS
     ]
     if len(faulty_keys) > 0:
         raise ValueError(
             '{0} are unsupported keys for proxy configuration.'.format(
                 ', '.join(faulty_keys)))
     ExtensionsToolbox.verify_required_params(ProxySetup.PARAMS,
                                              proxy_configuration)
     vpools = VPoolList.get_vpools()
     service_manager = ServiceFactory.get_manager()
     with open('/root/old_proxies', 'w') as backup_file:
         for vpool in vpools:
              if vpool.metadata['backend']['backend_info']['name'] != backend_name:
                 continue
             for storagedriver in vpool.storagedrivers:
                 for proxy in storagedriver.alba_proxies:
                     config_loc = 'ovs/vpools/{0}/proxies/{1}/config/main'.format(
                         vpool.guid, proxy.guid)
                     proxy_service = Service(proxy.service_guid)
                     proxy_config = Configuration.get(config_loc)
                     old_proxy_config = dict(proxy_config)
                     backup_file.write('{} -- {}\n'.format(
                         config_loc, old_proxy_config))
                     proxy_config.update(proxy_configuration)
                     ProxySetup.LOGGER.info(
                         "Changed {0} to {1} for proxy {2}".format(
                             old_proxy_config, proxy_config, config_loc))
                     ProxySetup.LOGGER.info("Changed items {0}".format([
                         (key, value)
                         for key, value in proxy_config.iteritems()
                         if key not in old_proxy_config.keys()
                     ]))
                     Configuration.set(config_loc,
                                       json.dumps(proxy_config, indent=4),
                                       raw=True)
                     client = SSHClient(storagedriver.storage_ip,
                                        username='******')
                     service_manager.restart_service(proxy_service.name,
                                                     client=client)
Example #19
 def check_ovs_processes(result_handler):
     """
     Checks the availability of processes for Open vStorage
     :param result_handler: logging object
     :type result_handler: ovs.extensions.healthcheck.result.HCResults
     :return: None
     :rtype: NoneType
     """
     result_handler.info('Checking local ovs services.')
     client = SSHClient(System.get_my_storagerouter())
     service_manager = ServiceFactory.get_manager()
     services = [service for service in service_manager.list_services(client=client) if service.startswith(OpenvStorageHealthCheck.MODULE)]
     if len(services) == 0:
         result_handler.warning('Found no local ovs services.')
     for service_name in services:
         if service_manager.get_service_status(service_name, client) == 'active':
             result_handler.success('Service {0} is running!'.format(service_name), code=ErrorCodes.process_fwk)
         else:
             result_handler.failure('Service {0} is not running, please check this.'.format(service_name), code=ErrorCodes.process_fwk)
Example #20
    def restart_framework_and_memcache_services(clients,
                                                logger,
                                                offline_node_ips=None):
        """
        Restart framework and Memcached services
        :param clients: Clients on which to restart these services
        :type clients: dict
        :param logger: Logger object used for logging
        :type logger: ovs.extensions.generic.logger.Logger
        :param offline_node_ips: IP addresses of offline nodes in the cluster
        :type offline_node_ips: list
        :return: None
        """
        from ovs.dal.lists.storagerouterlist import StorageRouterList

        service_manager = ServiceFactory.get_manager()
        master_ips = [sr.ip for sr in StorageRouterList.get_masters()]
        slave_ips = [sr.ip for sr in StorageRouterList.get_slaves()]
        if offline_node_ips is None:
            offline_node_ips = []
        memcached = 'memcached'
        watcher = 'watcher-framework'
        support_agent = 'support-agent'
        for ip in master_ips + slave_ips:
            if ip not in offline_node_ips:
                if service_manager.has_service(watcher, clients[ip]):
                    ServiceFactory.change_service_state(
                        clients[ip], watcher, 'stop', logger)
        for ip in master_ips:
            if ip not in offline_node_ips:
                ServiceFactory.change_service_state(clients[ip], memcached,
                                                    'restart', logger)
        for ip in master_ips + slave_ips:
            if ip not in offline_node_ips:
                if service_manager.has_service(watcher, clients[ip]):
                    ServiceFactory.change_service_state(
                        clients[ip], watcher, 'start', logger)
                if service_manager.has_service(support_agent, clients[ip]):
                    ServiceFactory.change_service_state(
                        clients[ip], support_agent, 'restart', logger)
        VolatileFactory.store = None
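The `clients` argument maps an IP address to an SSHClient for that node; below is a hedged sketch of how that mapping could be built before the call (the direct call and the `logger` instance are assumptions, since the enclosing class is not shown in this snippet; StorageRouterList is imported as above):

    # Hypothetical preparation of the 'clients' mapping expected above
    clients = {}
    for sr in StorageRouterList.get_masters() + StorageRouterList.get_slaves():
        clients[sr.ip] = SSHClient(sr.ip, username='root')
    restart_framework_and_memcache_services(clients=clients, logger=logger)  # 'logger' is an assumed Logger instance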
Example #21
    def _on_remove(cluster_ip, complete_removal):
        """
        Handles the StorageDriver removal part of a node
        :param cluster_ip: IP of the node which is being removed from the cluster
        :type cluster_ip: str
        :param complete_removal: Unused for StorageDriver, used for AlbaController
        :type complete_removal: bool
        :return: None
        """
        _ = complete_removal

        service_manager = ServiceFactory.get_manager()
        service_name = 'watcher-volumedriver'
        try:
            client = SSHClient(endpoint=cluster_ip, username='******')
            if service_manager.has_service(name=service_name, client=client):
                service_manager.stop_service(name=service_name, client=client)
                service_manager.remove_service(name=service_name,
                                               client=client)
        except (UnableToConnectException, NotAuthenticatedException):
            pass
Example #22
 def check_alba_processes(result_handler):
     """
     Checks the availability of processes for Alba
     :param result_handler: logging object
     :type result_handler: ovs.extensions.healthcheck.result.HCResults
     :return: None
     :rtype: NoneType
     """
     result_handler.info('Checking LOCAL ALBA services: ', add_to_result=False)
     client = SSHClient(System.get_my_storagerouter())
     service_manager = ServiceFactory.get_manager()
     services = [service for service in service_manager.list_services(client=client) if service.startswith(AlbaHealthCheck.MODULE)]
     if len(services) == 0:
         result_handler.skip('Found no LOCAL ALBA services.')
         return
     for service_name in services:
         if service_manager.get_service_status(service_name, client) == 'active':
             result_handler.success('Service {0} is running!'.format(service_name),
                                    code=ErrorCodes.alba_service_running)
         else:
             result_handler.failure('Service {0} is NOT running! '.format(service_name),
                                    code=ErrorCodes.alba_service_down)
Example #23
    def change_service_state(client, name, state, logger=None):
        """
        Starts/stops/restarts a service
        :param client: SSHClient on which to connect and change service state
        :param name: Name of the service
        :param state: State to put the service in
        :param logger: LogHandler Object
        """
        service_manager = ServiceFactory.get_manager()
        action = None
        status = service_manager.get_service_status(name, client=client)
        if status != 'active' and state in ['start', 'restart']:
            if logger is not None:
                logger.debug('{0}: Starting service {1}'.format(
                    client.ip, name))
            service_manager.start_service(name, client=client)
            action = 'Started'
        elif status == 'active' and state == 'stop':
            if logger is not None:
                logger.debug('{0}: Stopping service {1}'.format(
                    client.ip, name))
            service_manager.stop_service(name, client=client)
            action = 'Stopped'
        elif status == 'active' and state == 'restart':
            if logger is not None:
                logger.debug('{0}: Restarting service {1}'.format(
                    client.ip, name))
            service_manager.restart_service(name, client=client)
            action = 'Restarted'

        if action is None:
            print '  [{0}] {1} already {2}'.format(
                client.ip, name, 'running' if status == 'active' else 'halted')
        else:
            if logger is not None:
                logger.debug('{0}: {1} service {2}'.format(
                    client.ip, action, name))
            print '  [{0}] {1} {2}'.format(client.ip, name, action.lower())
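A minimal sketch of calling the helper through ServiceFactory, as the other examples do (placeholder IP; the logger argument is omitted because it defaults to None):

    # Hypothetical usage: restart the framework watcher on a single node
    client = SSHClient('10.100.1.14', username='root')  # placeholder IP
    ServiceFactory.change_service_state(client, 'watcher-framework', 'restart')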
Example #24
    def __init__(self, vdisk_guid):
        # type: (str) -> None
        """
        Initializes a new MDSCatchUp
        An instance populates some caches. These caches are cleared once the instance is garbage collected.
        When running MDSCatchUp in bulk, add the instances to a list to speed up the process
        :param vdisk_guid: Guid of the vDisk to catch up for
        :type vdisk_guid: str
        """
        self.id = str(uuid.uuid4())
        self.vdisk = VDisk(vdisk_guid)
        self.mds_key = self._CATCH_UP_VDISK_KEY.format(self.vdisk.guid)
        self.tlog_threshold = Configuration.get(
            'ovs/volumedriver/mds|tlogs_behind', default=100)
        self.volumedriver_service_name = 'ovs-volumedriver_{0}'.format(
            self.vdisk.vpool.name)
        self.mds_client_timeout = Configuration.get(
            'ovs/vpools/{0}/mds_config|mds_client_connection_timeout'.format(
                self.vdisk.vpool_guid),
            default=120)
        self.mds_clients = {}
        self.dry_run = False
        self.catch_up_threads = []
        self.errors = []

        self._service_manager = ServiceFactory.get_manager()
        self._persistent = PersistentFactory.get_client()
        self._log = 'MDS catchup {0} - vDisk {1} (volume id: {2})'.format(
            self.id, self.vdisk.guid, self.vdisk.volume_id)

        self._clients = self.build_clients()
        self._volumedriver_contexts = self.get_volumedriver_contexts()
        self._worker_contexts = self.get_worker_contexts()
        self._worker_context = self._worker_contexts[
            System.get_my_storagerouter()]
        self._relevant_contexts = self._get_all_relevant_contexts()  # All possible contexts (by mixing volumedriver ones with workers)
Example #25
    def remove_services(client, node_type, logger):
        """
        Remove all services managed by OVS
        :param client: Client on which to remove the services
        :type client: ovs_extensions.generic.sshclient.SSHClient
        :param node_type: Type of node, can be 'master' or 'extra'
        :type node_type: str
        :param logger: Logger object used for logging
        :type logger: ovs.extensions.generic.logger.Logger
        :return: None
        """
        Toolbox.log(logger=logger, messages='Removing services')
        service_manager = ServiceFactory.get_manager()
        stop_only = ['rabbitmq-server', 'memcached']
        services = ['workers', 'support-agent', 'watcher-framework']
        if node_type == 'master':
            services += [
                'scheduled-tasks', 'webapp-api', 'volumerouter-consumer'
            ]
            if Toolbox.is_service_internally_managed(
                    service='rabbitmq') is True:
                services.append('rabbitmq-server')
            if Toolbox.is_service_internally_managed(
                    service='memcached') is True:
                services.append('memcached')

        for service in services:
            if service_manager.has_service(service, client=client):
                Toolbox.log(
                    logger=logger,
                    messages='{0} service {1}'.format(
                        'Removing' if service not in stop_only else 'Stopping',
                        service))
                service_manager.stop_service(service, client=client)
                if service not in stop_only:
                    service_manager.remove_service(service, client=client)
Example #26
 def change_scheduled_task(task_name, state, disabled=False, cron=None):
     """
     Add, disable or remove a scheduled task in the celery configuration and restart the scheduled-tasks services
     :param task_name: Name of the scheduled task
     :param state: 'present' to add or update the task, anything else to remove it
     :param disabled: True to disable the task
     :param cron: Crontab-style settings for the task
     :type cron: dict
     :return: Description of the applied change
     :rtype: str
     """
     if not Configuration.exists(celery_key):
         Configuration.set(celery_key, {})
     jobs = Configuration.get(celery_key)
     if state == 'present':
         if disabled:
             jobs[task_name] = None
             output = 'task {0}: disabled'.format(task_name)
         else:
             jobs[task_name] = cron
             settings = ''
             for key, value in cron.iteritems():
                 settings += "{0}: {1} ".format(key, value)
             output = 'task {0}: cron settings {1}'.format(task_name, settings)
     else:
         jobs.pop(task_name, None)
         output = 'task {0}: removed, default settings will be applied.'.format(task_name)
     Configuration.set(celery_key, jobs)
     service_name = 'scheduled-tasks'
     service_manager = ServiceFactory.get_manager()
     for storagerouter in StorageRouterList.get_masters():
         client = SSHClient(storagerouter, username='******')
         service_manager.restart_service(service_name, client=client)
     return output
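A hedged example of the expected arguments; the task name is purely illustrative and the cron dict uses crontab-style fields, as suggested by the loop over `cron.iteritems()` above:

 # Hypothetical usage: schedule an illustrative task daily at 03:00
 output = change_scheduled_task('ovs.generic.dummy_task', 'present', cron={'minute': '0', 'hour': '3'})
 print output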
Example #27
    def migrate():
        """
        Executes async migrations. It doesn't matter too much when they are executed, as long as they are eventually
        executed. This code will typically contain:
        * "dangerous" migration code (it needs certain running services)
        * Migration code depending on a cluster-wide state
        * ...
        * Successfully finishing a piece of migration code should create an entry in /ovs/framework/migration in case it should not be executed again
        *     Eg: /ovs/framework/migration|stats_monkey_integration: True
        """
        MigrationController._logger.info('Preparing out of band migrations...')

        from ovs.dal.lists.servicetypelist import ServiceTypeList
        from ovs.dal.lists.storagedriverlist import StorageDriverList
        from ovs.dal.lists.storagerouterlist import StorageRouterList
        from ovs.dal.lists.vpoollist import VPoolList
        from ovs.extensions.db.arakooninstaller import ArakoonInstaller
        from ovs.extensions.generic.configuration import Configuration
        from ovs.extensions.generic.sshclient import SSHClient
        from ovs_extensions.generic.toolbox import ExtensionsToolbox
        from ovs.extensions.migration.migration.ovsmigrator import ExtensionMigrator
        from ovs.extensions.packages.packagefactory import PackageFactory
        from ovs_extensions.services.interfaces.systemd import Systemd
        from ovs.extensions.services.servicefactory import ServiceFactory
        from ovs.extensions.storageserver.storagedriver import StorageDriverConfiguration
        from ovs.lib.helpers.storagedriver.installer import StorageDriverInstaller

        MigrationController._logger.info('Start out of band migrations...')
        service_manager = ServiceFactory.get_manager()

        sr_client_map = {}
        for storagerouter in StorageRouterList.get_storagerouters():
            sr_client_map[storagerouter.guid] = SSHClient(endpoint=storagerouter.ip,  # Is triggered during post-update code too during which the ovs-watcher-framework service is still down and thus not refreshing the heartbeat --> use IP i/o StorageRouter
                                                          username='******')

        #########################################################
        # Addition of 'ExecReload' for AlbaProxy SystemD services
        if ServiceFactory.get_service_type() == 'systemd':
            changed_clients = set()
            for storagedriver in StorageDriverList.get_storagedrivers():
                root_client = sr_client_map[storagedriver.storagerouter_guid]
                for alba_proxy in storagedriver.alba_proxies:
                    service = alba_proxy.service
                    service_name = 'ovs-{0}'.format(service.name)
                    if not service_manager.has_service(name=service_name, client=root_client):
                        continue
                    if 'ExecReload=' in root_client.file_read(filename='/lib/systemd/system/{0}.service'.format(service_name)):
                        continue

                    try:
                        service_manager.regenerate_service(name=StorageDriverInstaller.SERVICE_TEMPLATE_PROXY, client=root_client, target_name=service_name)
                        changed_clients.add(root_client)
                    except Exception:
                        MigrationController._logger.exception('Error rebuilding service {0}'.format(service_name))
            for root_client in changed_clients:
                root_client.run(['systemctl', 'daemon-reload'])

        ##################################################################
        # Adjustment of open file descriptors for Arakoon services to 8192
        changed_clients = set()
        for storagerouter in StorageRouterList.get_storagerouters():
            root_client = sr_client_map[storagerouter.guid]
            for service_name in service_manager.list_services(client=root_client):
                if not service_name.startswith('ovs-arakoon-'):
                    continue

                if ServiceFactory.get_service_type() == 'systemd':
                    path = '/lib/systemd/system/{0}.service'.format(service_name)
                    check = 'LimitNOFILE=8192'
                else:
                    path = '/etc/init/{0}.conf'.format(service_name)
                    check = 'limit nofile 8192 8192'

                if not root_client.file_exists(path):
                    continue
                if check in root_client.file_read(path):
                    continue

                try:
                    service_manager.regenerate_service(name='ovs-arakoon', client=root_client, target_name=service_name)
                    changed_clients.add(root_client)
                    ExtensionsToolbox.edit_version_file(client=root_client,
                                                        package_name='arakoon',
                                                        old_run_file='{0}/{1}.version'.format(ServiceFactory.RUN_FILE_DIR, service_name))
                except Exception:
                    MigrationController._logger.exception('Error rebuilding service {0}'.format(service_name))
        for root_client in changed_clients:
            root_client.run(['systemctl', 'daemon-reload'])

        #############################
        # Migrate to multiple proxies
        for storagedriver in StorageDriverList.get_storagedrivers():
            vpool = storagedriver.vpool
            root_client = sr_client_map[storagedriver.storagerouter_guid]
            for alba_proxy in storagedriver.alba_proxies:
                # Rename alba_proxy service in model
                service = alba_proxy.service
                old_service_name = 'albaproxy_{0}'.format(vpool.name)
                new_service_name = 'albaproxy_{0}_0'.format(vpool.name)
                if old_service_name != service.name:
                    continue
                service.name = new_service_name
                service.save()

                if not service_manager.has_service(name=old_service_name, client=root_client):
                    continue
                old_configuration_key = '/ovs/framework/hosts/{0}/services/{1}'.format(storagedriver.storagerouter.machine_id, old_service_name)
                if not Configuration.exists(key=old_configuration_key):
                    continue

                # Add '-reboot' to alba_proxy services (because of newly created services and removal of old service)
                ExtensionsToolbox.edit_version_file(client=root_client,
                                                    package_name='alba',
                                                    old_run_file='{0}/{1}.version'.format(ServiceFactory.RUN_FILE_DIR, old_service_name),
                                                    new_run_file='{0}/{1}.version'.format(ServiceFactory.RUN_FILE_DIR, new_service_name))

                # Register new service and remove old service
                service_manager.add_service(name=StorageDriverInstaller.SERVICE_TEMPLATE_PROXY,
                                            client=root_client,
                                            params=Configuration.get(old_configuration_key),
                                            target_name='ovs-{0}'.format(new_service_name))

                # Update scrub proxy config
                proxy_config_key = '/ovs/vpools/{0}/proxies/{1}/config/main'.format(vpool.guid, alba_proxy.guid)
                proxy_config = None if Configuration.exists(key=proxy_config_key) is False else Configuration.get(proxy_config_key)
                if proxy_config is not None:
                    fragment_cache = proxy_config.get(StorageDriverConfiguration.CACHE_FRAGMENT, ['none', {}])
                    if fragment_cache[0] == 'alba' and fragment_cache[1].get('cache_on_write') is True:  # Accelerated ALBA configured
                        fragment_cache_scrub_info = copy.deepcopy(fragment_cache)
                        fragment_cache_scrub_info[1]['cache_on_read'] = False
                        proxy_scrub_config_key = '/ovs/vpools/{0}/proxies/scrub/generic_scrub'.format(vpool.guid)
                        proxy_scrub_config = None if Configuration.exists(key=proxy_scrub_config_key) is False else Configuration.get(proxy_scrub_config_key)
                        if proxy_scrub_config is not None and proxy_scrub_config[StorageDriverConfiguration.CACHE_FRAGMENT] == ['none']:
                            proxy_scrub_config[StorageDriverConfiguration.CACHE_FRAGMENT] = fragment_cache_scrub_info
                            Configuration.set(key=proxy_scrub_config_key, value=proxy_scrub_config)

            # Update 'backend_connection_manager' section
            changes = False
            storagedriver_config = StorageDriverConfiguration(vpool.guid, storagedriver.storagedriver_id)
            if 'backend_connection_manager' not in storagedriver_config.configuration:
                continue

            current_config = storagedriver_config.configuration['backend_connection_manager']
            if current_config.get('backend_type') != 'MULTI':
                changes = True
                backend_connection_manager = {'backend_type': 'MULTI'}
                for index, proxy in enumerate(sorted(storagedriver.alba_proxies, key=lambda pr: pr.service.ports[0])):
                    backend_connection_manager[str(index)] = copy.deepcopy(current_config)
                    # noinspection PyUnresolvedReferences
                    backend_connection_manager[str(index)]['alba_connection_use_rora'] = True
                    # noinspection PyUnresolvedReferences
                    backend_connection_manager[str(index)]['alba_connection_rora_manifest_cache_capacity'] = 5000
                    # noinspection PyUnresolvedReferences
                    for key, value in backend_connection_manager[str(index)].items():
                        if key.startswith('backend_interface'):
                            backend_connection_manager[key] = value
                            # noinspection PyUnresolvedReferences
                            del backend_connection_manager[str(index)][key]
                for key, value in {'backend_interface_retries_on_error': 5,
                                   'backend_interface_retry_interval_secs': 1,
                                   'backend_interface_retry_backoff_multiplier': 2.0}.iteritems():
                    if key not in backend_connection_manager:
                        backend_connection_manager[key] = value
            else:
                backend_connection_manager = current_config
                for value in backend_connection_manager.values():
                    if isinstance(value, dict):
                        for key, val in value.items():
                            if key.startswith('backend_interface'):
                                backend_connection_manager[key] = val
                                changes = True
                                del value[key]
                for key, value in {'backend_interface_retries_on_error': 5,
                                   'backend_interface_retry_interval_secs': 1,
                                   'backend_interface_retry_backoff_multiplier': 2.0}.iteritems():
                    if key not in backend_connection_manager:
                        changes = True
                        backend_connection_manager[key] = value

            if changes is True:
                storagedriver_config.clear_backend_connection_manager()
                storagedriver_config.configure_backend_connection_manager(**backend_connection_manager)
                storagedriver_config.save(root_client)

                # Add '-reboot' to volumedriver services (because of updated 'backend_connection_manager' section)
                ExtensionsToolbox.edit_version_file(client=root_client,
                                                    package_name='volumedriver',
                                                    old_run_file='{0}/{1}.version'.format(ServiceFactory.RUN_FILE_DIR, 'volumedriver_{0}'.format(vpool.name)))
                if service_manager.__class__ == Systemd:
                    root_client.run(['systemctl', 'daemon-reload'])

        ########################################
        # Update metadata_store_bits information
        vpools = VPoolList.get_vpools()
        for vpool in vpools:
            bits = None
            for storagedriver in vpool.storagedrivers:
                key = '/ovs/framework/hosts/{0}/services/volumedriver_{1}'.format(storagedriver.storagerouter.machine_id, vpool.name)
                if Configuration.exists(key=key) and 'METADATASTORE_BITS' not in Configuration.get(key=key):
                    if bits is None:
                        entries = service_manager.extract_from_service_file(name='ovs-volumedriver_{0}'.format(vpool.name),
                                                                            client=sr_client_map[storagedriver.storagerouter_guid],
                                                                            entries=['METADATASTORE_BITS='])
                        if len(entries) == 1:
                            bits = entries[0].split('=')[-1]
                            bits = int(bits) if bits.isdigit() else 5
                    if bits is not None:
                        try:
                            content = Configuration.get(key=key)
                            content['METADATASTORE_BITS'] = bits
                            Configuration.set(key=key, value=content)
                        except Exception:
                            MigrationController._logger.exception('Error updating volumedriver info for vPool {0} on StorageRouter {1}'.format(vpool.name, storagedriver.storagerouter.name))

            if bits is not None:
                vpool.metadata_store_bits = bits
                vpool.save()

        #####################################
        # Update the vPool metadata structure
        def _update_metadata_structure(metadata):
            metadata = copy.deepcopy(metadata)
            cache_structure = {'read': False,
                               'write': False,
                               'is_backend': False,
                               'quota': None,
                               'backend_info': {'name': None,  # Will be filled in when is_backend is true
                                                'backend_guid': None,
                                                'alba_backend_guid': None,
                                                'policies': None,
                                                'preset': None,
                                                'arakoon_config': None,
                                                'connection_info': {'client_id': None,
                                                                    'client_secret': None,
                                                                    'host': None,
                                                                    'port': None,
                                                                    'local': None}}
                               }
            structure_map = {StorageDriverConfiguration.CACHE_BLOCK: {'read': 'block_cache_on_read',
                                                                      'write': 'block_cache_on_write',
                                                                      'quota': 'quota_bc',
                                                                      'backend_prefix': 'backend_bc_{0}'},
                             StorageDriverConfiguration.CACHE_FRAGMENT: {'read': 'fragment_cache_on_read',
                                                                         'write': 'fragment_cache_on_write',
                                                                         'quota': 'quota_fc',
                                                                         'backend_prefix': 'backend_aa_{0}'}}
            if 'arakoon_config' in metadata['backend']:  # Arakoon config should be placed under the backend info
                metadata['backend']['backend_info']['arakoon_config'] = metadata['backend'].pop('arakoon_config')
            if 'connection_info' in metadata['backend']:  # Connection info should be placed under the backend info
                metadata['backend']['backend_info']['connection_info'] = metadata['backend'].pop('connection_info')
            if 'caching_info' not in metadata:  # Caching info is the new key
                would_be_caching_info = {}
                metadata['caching_info'] = would_be_caching_info
                # Extract all caching data for every storagerouter
                current_caching_info = metadata['backend'].pop('caching_info')  # Pop to mutate metadata
                for storagerouter_guid in current_caching_info.iterkeys():
                    current_cache_data = current_caching_info[storagerouter_guid]
                    storagerouter_caching_info = {}
                    would_be_caching_info[storagerouter_guid] = storagerouter_caching_info
                    for cache_type, cache_type_mapping in structure_map.iteritems():
                        new_cache_structure = copy.deepcopy(cache_structure)
                        storagerouter_caching_info[cache_type] = new_cache_structure
                        for new_structure_key, old_structure_key in cache_type_mapping.iteritems():
                            if new_structure_key == 'backend_prefix':
                                # Get possible backend related info
                                metadata_key = old_structure_key.format(storagerouter_guid)
                                if metadata_key not in metadata:
                                    continue
                                backend_data = metadata.pop(metadata_key)  # Pop to mutate metadata
                                new_cache_structure['is_backend'] = True
                                # Copy over the old data
                                new_cache_structure['backend_info']['arakoon_config'] = backend_data['arakoon_config']
                                new_cache_structure['backend_info'].update(backend_data['backend_info'])
                                new_cache_structure['backend_info']['connection_info'].update(backend_data['connection_info'])
                            else:
                                new_cache_structure[new_structure_key] = current_cache_data.get(old_structure_key)
            return metadata

        vpools = VPoolList.get_vpools()
        for vpool in vpools:
            try:
                new_metadata = _update_metadata_structure(vpool.metadata)
                vpool.metadata = new_metadata
                vpool.save()
            except KeyError:
                MigrationController._logger.exception('An exception occurred while updating the metadata for vPool {0}'.format(vpool.name))

        ##############################################
        # Always use indent=4 during Configuration set
        def _resave_all_config_entries(config_path='/ovs'):
            """
            Recursive function which checks for every config management key whether it is a directory or not.
            If it is not a directory, the config is retrieved and saved again so the new indentation logic is applied
            """
            for item in Configuration.list(config_path):
                new_path = config_path + '/' + item
                print new_path
                if Configuration.dir_exists(new_path) is True:
                    _resave_all_config_entries(config_path=new_path)
                else:
                    try:
                        _config = Configuration.get(new_path)
                        Configuration.set(new_path, _config)
                    except:
                        _config = Configuration.get(new_path, raw=True)
                        Configuration.set(new_path, _config, raw=True)
        if ExtensionMigrator.THIS_VERSION <= 13:  # There is no way of checking whether this new indentation logic has been applied, so we only perform this for version 13 and lower
            MigrationController._logger.info('Re-saving every configuration setting with new indentation rules')
            _resave_all_config_entries()

        ############################
        # Update some default values
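        # Bumps the proxy's manifest cache size to 500 MiB, both in the top-level setting and in every
        # 'alba'-backed block/fragment cache section; returns True only when the configuration was actually changed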
        def _update_manifest_cache_size(_proxy_config_key):
            updated = False
            manifest_cache_size = 500 * 1024 * 1024
            if Configuration.exists(key=_proxy_config_key):
                _proxy_config = Configuration.get(key=_proxy_config_key)
                for cache_type in [StorageDriverConfiguration.CACHE_BLOCK, StorageDriverConfiguration.CACHE_FRAGMENT]:
                    if cache_type in _proxy_config and _proxy_config[cache_type][0] == 'alba':
                        if _proxy_config[cache_type][1]['manifest_cache_size'] != manifest_cache_size:
                            updated = True
                            _proxy_config[cache_type][1]['manifest_cache_size'] = manifest_cache_size
                if _proxy_config['manifest_cache_size'] != manifest_cache_size:
                    updated = True
                    _proxy_config['manifest_cache_size'] = manifest_cache_size

                if updated is True:
                    Configuration.set(key=_proxy_config_key, value=_proxy_config)
            return updated

        for storagedriver in StorageDriverList.get_storagedrivers():
            try:
                vpool = storagedriver.vpool
                root_client = sr_client_map[storagedriver.storagerouter_guid]
                _update_manifest_cache_size('/ovs/vpools/{0}/proxies/scrub/generic_scrub'.format(vpool.guid))  # Generic scrub proxy is deployed every time scrubbing kicks in, so no need to restart these services
                for alba_proxy in storagedriver.alba_proxies:
                    if _update_manifest_cache_size('/ovs/vpools/{0}/proxies/{1}/config/main'.format(vpool.guid, alba_proxy.guid)) is True:
                        # Add '-reboot' to alba_proxy services (because of newly created services and removal of old service)
                        ExtensionsToolbox.edit_version_file(client=root_client,
                                                            package_name='alba',
                                                            old_run_file='{0}/{1}.version'.format(ServiceFactory.RUN_FILE_DIR, alba_proxy.service.name))

                # Update 'backend_connection_manager' section
                changes = False
                storagedriver_config = StorageDriverConfiguration(vpool.guid, storagedriver.storagedriver_id)
                if 'backend_connection_manager' not in storagedriver_config.configuration:
                    continue

                current_config = storagedriver_config.configuration['backend_connection_manager']
                for key, value in current_config.iteritems():
                    if key.isdigit() is True:
                        if value.get('alba_connection_asd_connection_pool_capacity') != 10:
                            changes = True
                            value['alba_connection_asd_connection_pool_capacity'] = 10
                        if value.get('alba_connection_timeout') != 30:
                            changes = True
                            value['alba_connection_timeout'] = 30
                        if value.get('alba_connection_rora_manifest_cache_capacity') != 25000:
                            changes = True
                            value['alba_connection_rora_manifest_cache_capacity'] = 25000

                if changes is True:
                    storagedriver_config.clear_backend_connection_manager()
                    storagedriver_config.configure_backend_connection_manager(**current_config)
                    storagedriver_config.save(root_client)

                    # Add '-reboot' to volumedriver services (because of updated 'backend_connection_manager' section)
                    ExtensionsToolbox.edit_version_file(client=root_client,
                                                        package_name='volumedriver',
                                                        old_run_file='{0}/{1}.version'.format(ServiceFactory.RUN_FILE_DIR, 'volumedriver_{0}'.format(vpool.name)))
            except Exception:
                MigrationController._logger.exception('Updating default configuration values failed for StorageDriver {0}'.format(storagedriver.storagedriver_id))

        ####################################################
        # Adding proxy fail fast as env variable for proxies
        changed_clients = set()
        for storagerouter in StorageRouterList.get_storagerouters():
            root_client = sr_client_map[storagerouter.guid]
            for service_name in service_manager.list_services(client=root_client):
                if not service_name.startswith('ovs-albaproxy_'):
                    continue

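                # Determine the service file location and the expected fail-fast environment line,
                # depending on the init system (systemd unit file vs. upstart conf)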
                if ServiceFactory.get_service_type() == 'systemd':
                    path = '/lib/systemd/system/{0}.service'.format(service_name)
                    check = 'Environment=ALBA_FAIL_FAST=true'
                else:
                    path = '/etc/init/{0}.conf'.format(service_name)
                    check = 'env ALBA_FAIL_FAST=true'

                if not root_client.file_exists(path):
                    continue
                if check in root_client.file_read(path):
                    continue

                try:
                    service_manager.regenerate_service(name=StorageDriverInstaller.SERVICE_TEMPLATE_PROXY, client=root_client, target_name=service_name)
                    changed_clients.add(root_client)
                    ExtensionsToolbox.edit_version_file(client=root_client,
                                                        package_name='alba',
                                                        old_run_file='{0}/{1}.version'.format(ServiceFactory.RUN_FILE_DIR, service_name))
                except:
                    MigrationController._logger.exception('Error rebuilding service {0}'.format(service_name))
        for root_client in changed_clients:
            root_client.run(['systemctl', 'daemon-reload'])

        ######################################
        # Integration of stats monkey (2.10.2)
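        # Runs at most once: the migration flag '/ovs/framework/migration|stats_monkey_integration' is set to True at the end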
        if Configuration.get(key='/ovs/framework/migration|stats_monkey_integration', default=False) is False:
            try:
                # Get content of old key into new key
                old_stats_monkey_key = '/statsmonkey/statsmonkey'
                if Configuration.exists(key=old_stats_monkey_key) is True:
                    Configuration.set(key='/ovs/framework/monitoring/stats_monkey', value=Configuration.get(key=old_stats_monkey_key))
                    Configuration.delete(key=old_stats_monkey_key)

                # Make sure to disable the stats monkey by default or take over the current schedule if it was configured manually before
                celery_key = '/ovs/framework/scheduling/celery'
                current_value = None
                scheduling_config = Configuration.get(key=celery_key, default={})
                if 'statsmonkey.run_all_stats' in scheduling_config:  # Old celery task name of the stats monkey
                    current_value = scheduling_config.pop('statsmonkey.run_all_stats')
                scheduling_config['ovs.stats_monkey.run_all'] = current_value
                scheduling_config['alba.stats_monkey.run_all'] = current_value
                Configuration.set(key=celery_key, value=scheduling_config)

                support_key = '/ovs/framework/support'
                support_config = Configuration.get(key=support_key)
                support_config['support_agent'] = support_config.pop('enabled', True)
                support_config['remote_access'] = support_config.pop('enablesupport', False)
                Configuration.set(key=support_key, value=support_config)

                # Make sure once this finished, it never runs again by setting this key to True
                Configuration.set(key='/ovs/framework/migration|stats_monkey_integration', value=True)
            except Exception:
                MigrationController._logger.exception('Integration of stats monkey failed')

        ######################################################
        # Write away cluster ID to a file for back-up purposes
        try:
            cluster_id = Configuration.get(key='/ovs/framework/cluster_id', default=None)
            with open(Configuration.CONFIG_STORE_LOCATION, 'r') as config_file:
                config = json.load(config_file)
            if cluster_id is not None and config.get('cluster_id', None) is None:
                config['cluster_id'] = cluster_id
                with open(Configuration.CONFIG_STORE_LOCATION, 'w') as config_file:
                    json.dump(config, config_file, indent=4)
        except Exception:
            MigrationController._logger.exception('Writing cluster id to a file failed.')

        #########################################################
        # Additional string formatting in Arakoon services (2.11)
        try:
            if Configuration.get(key='/ovs/framework/migration|arakoon_service_update', default=False) is False:
                arakoon_service_names = [ArakoonInstaller.get_service_name_for_cluster(cluster_name=cluster_name) for cluster_name in Configuration.list(key='ovs/arakoon')]
                for storagerouter in StorageRouterList.get_masters():
                    for service_name in arakoon_service_names:
                        config_key = ServiceFactory.SERVICE_CONFIG_KEY.format(storagerouter.machine_id, service_name)
                        if Configuration.exists(key=config_key):
                            config = Configuration.get(key=config_key)
                            config['RUN_FILE_DIR'] = ServiceFactory.RUN_FILE_DIR
                            config['ARAKOON_PKG_NAME'] = PackageFactory.PKG_ARAKOON
                            config['ARAKOON_VERSION_CMD'] = PackageFactory.VERSION_CMD_ARAKOON
                            Configuration.set(key=config_key, value=config)
                # Make sure once this finished, it never runs again by setting this key to True
                Configuration.set(key='/ovs/framework/migration|arakoon_service_update', value=True)
        except Exception:
            MigrationController._logger.exception('Updating the string formatting for the Arakoon services failed')

        ############################################################
        # Additional string formatting in ALBA proxy services (2.11)
        changed_clients = set()
        try:
            if Configuration.get(key='/ovs/framework/migration|alba_proxy_service_update', default=False) is False:
                alba_pkg_name, alba_version_cmd = PackageFactory.get_package_and_version_cmd_for(component=PackageFactory.COMP_ALBA)
                for service in ServiceTypeList.get_by_name('AlbaProxy').services:
                    root_client = sr_client_map[service.storagerouter_guid]
                    config_key = ServiceFactory.SERVICE_CONFIG_KEY.format(service.storagerouter.machine_id, service.name)
                    if Configuration.exists(key=config_key):
                        config = Configuration.get(key=config_key)
                        config['RUN_FILE_DIR'] = ServiceFactory.RUN_FILE_DIR
                        config['ALBA_PKG_NAME'] = alba_pkg_name
                        config['ALBA_VERSION_CMD'] = alba_version_cmd
                        Configuration.set(key=config_key, value=config)
                        service_manager.regenerate_service(name=StorageDriverInstaller.SERVICE_TEMPLATE_PROXY,
                                                           client=root_client,
                                                           target_name='ovs-{0}'.format(service.name))
                        changed_clients.add(root_client)

                # Make sure once this finished, it never runs again by setting this key to True
                Configuration.set(key='/ovs/framework/migration|alba_proxy_service_update', value=True)
        except Exception:
            MigrationController._logger.exception('Updating the string formatting for the ALBA proxy services failed')

        ############################################################
        # Additional string formatting in DTL/VOLDRV services (2.11)
        try:
            if Configuration.get(key='/ovs/framework/migration|voldrv_service_update', default=False) is False:
                sd_pkg_name, sd_version_cmd = PackageFactory.get_package_and_version_cmd_for(component=PackageFactory.COMP_SD)
                for vpool in VPoolList.get_vpools():
                    for storagedriver in vpool.storagedrivers:
                        root_client = sr_client_map[storagedriver.storagerouter_guid]
                        for entry in ['dtl', 'volumedriver']:
                            service_name = '{0}_{1}'.format(entry, vpool.name)
                            service_template = StorageDriverInstaller.SERVICE_TEMPLATE_DTL if entry == 'dtl' else StorageDriverInstaller.SERVICE_TEMPLATE_SD
                            config_key = ServiceFactory.SERVICE_CONFIG_KEY.format(storagedriver.storagerouter.machine_id, service_name)
                            if Configuration.exists(key=config_key):
                                config = Configuration.get(key=config_key)
                                config['RUN_FILE_DIR'] = ServiceFactory.RUN_FILE_DIR
                                config['VOLDRV_PKG_NAME'] = sd_pkg_name
                                config['VOLDRV_VERSION_CMD'] = sd_version_cmd
                                Configuration.set(key=config_key, value=config)
                                service_manager.regenerate_service(name=service_template,
                                                                   client=root_client,
                                                                   target_name='ovs-{0}'.format(service_name))
                                changed_clients.add(root_client)

                # Make sure once this finished, it never runs again by setting this key to True
                Configuration.set(key='/ovs/framework/migration|voldrv_service_update', value=True)
        except Exception:
            MigrationController._logger.exception('Updating the string formatting for the DTL/VOLDRV services failed')

        #######################################################
        # Storing actual package name in version files (2.11.0) (https://github.com/openvstorage/framework/issues/1876)
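        # Version files under RUN_FILE_DIR track the package a service was started with; replace the generic
        # 'volumedriver-server' entry with the actual (EE or non-EE) package name and regenerate the matching service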
        if Configuration.get(key='/ovs/framework/migration|actual_package_name_in_version_file', default=False) is False:
            try:
                voldrv_pkg_name, _ = PackageFactory.get_package_and_version_cmd_for(component=PackageFactory.COMP_SD)
                for storagerouter in StorageRouterList.get_storagerouters():
                    root_client = sr_client_map.get(storagerouter.guid)
                    if root_client is None:
                        continue

                    for file_name in root_client.file_list(directory=ServiceFactory.RUN_FILE_DIR):
                        if not file_name.endswith('.version'):
                            continue
                        file_path = '{0}/{1}'.format(ServiceFactory.RUN_FILE_DIR, file_name)
                        contents = root_client.file_read(filename=file_path)
                        regenerate = False
                        if voldrv_pkg_name == PackageFactory.PKG_VOLDRV_SERVER:
                            if 'volumedriver-server' in contents:
                                regenerate = True
                                contents = contents.replace('volumedriver-server', PackageFactory.PKG_VOLDRV_SERVER)
                                root_client.file_write(filename=file_path, contents=contents)
                        elif voldrv_pkg_name == PackageFactory.PKG_VOLDRV_SERVER_EE:
                            if 'volumedriver-server' in contents or PackageFactory.PKG_VOLDRV_SERVER in contents:
                                regenerate = True
                                contents = contents.replace('volumedriver-server', PackageFactory.PKG_VOLDRV_SERVER_EE)
                                contents = contents.replace(PackageFactory.PKG_VOLDRV_SERVER, PackageFactory.PKG_VOLDRV_SERVER_EE)
                                root_client.file_write(filename=file_path, contents=contents)

                        if regenerate is True:
                            service_manager.regenerate_service(name=StorageDriverInstaller.SERVICE_TEMPLATE_DTL if file_name.startswith('dtl') else StorageDriverInstaller.SERVICE_TEMPLATE_SD,
                                                               client=root_client,
                                                               target_name='ovs-{0}'.format(file_name.split('.')[0]))  # Leave out .version
                            changed_clients.add(root_client)
                Configuration.set(key='/ovs/framework/migration|actual_package_name_in_version_file', value=True)
            except Exception:
                MigrationController._logger.exception('Updating actual package name for version files failed')

        for root_client in changed_clients:
            try:
                root_client.run(['systemctl', 'daemon-reload'])
            except Exception:
                MigrationController._logger.exception('Executing command "systemctl daemon-reload" failed')

        #########################################################
        # Addition of 'Environment=OCAMLRUNPARAM='b,a=1,s=4096k,O=50' for AlbaProxy SystemD services
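        # OCAMLRUNPARAM presumably tunes the OCaml runtime of the proxy (minor heap size, allocation policy, ...);
        # any SystemD unit file that does not yet contain the environment line is regenerated from the proxy template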
        if ServiceFactory.get_service_type() == 'systemd':
            changed_clients = set()
            for storagedriver in StorageDriverList.get_storagedrivers():
                root_client = sr_client_map[storagedriver.storagerouter_guid]
                for alba_proxy in storagedriver.alba_proxies:
                    service = alba_proxy.service
                    service_name = 'ovs-{0}'.format(service.name)
                    if not service_manager.has_service(name=service_name, client=root_client):
                        continue
                    if "Environment=OCAMLRUNPARAM='b,a=1,s=4096k,O=50" in root_client.file_read(filename='/lib/systemd/system/{0}.service'.format(service_name)):
                        continue
                    try:
                        service_manager.regenerate_service(name='ovs-albaproxy', client=root_client, target_name=service_name)
                        changed_clients.add(root_client)
                    except:
                        MigrationController._logger.exception('Error rebuilding service {0}'.format(service_name))
            for root_client in changed_clients:
                root_client.run(['systemctl', 'daemon-reload'])
        #########################################################
        # Addition of 'Environment=OCAMLRUNPARAM='b,a=1,s=4096k,O=50' for Arakoon SystemD services
        if ServiceFactory.get_service_type() == 'systemd':
            changed_clients = set()
            for storagerouter in StorageRouterList.get_storagerouters():
                root_client = sr_client_map[storagerouter.guid]
                for service_name in service_manager.list_services(client=root_client):
                    if not service_name.startswith('ovs-arakoon-'):
                        continue
                    if not service_manager.has_service(name=service_name, client=root_client):
                        continue
                    if "Environment=OCAMLRUNPARAM='b,a=1,s=4096k,O=50" in root_client.file_read(filename='/lib/systemd/system/{0}.service'.format(service_name)):
                        continue
                    try:
                        service_manager.regenerate_service(name='ovs-arakoon', client=root_client, target_name=service_name)
                        changed_clients.add(root_client)
                    except:
                        MigrationController._logger.exception('Error rebuilding service {0}'.format(service_name))
            for root_client in changed_clients:
                root_client.run(['systemctl', 'daemon-reload'])

        MigrationController._logger.info('Finished out of band migrations')
Ejemplo n.º 28
0
    def remove_node(node_ip, silent=None):
        """
        Remove the node with specified IP from the cluster
        :param node_ip: IP of the node to remove
        :type node_ip: str
        :param silent: If silent == '--force-yes' no question will be asked to confirm the removal
        :type silent: str
        :return: None
        """
        from ovs.dal.lists.storagerouterlist import StorageRouterList
        from ovs.lib.storagedriver import StorageDriverController
        from ovs.lib.vpool import VPoolController

        Toolbox.log(logger=NodeRemovalController._logger,
                    messages='Remove node',
                    boxed=True)
        Toolbox.log(
            logger=NodeRemovalController._logger,
            messages=
            'WARNING: Some of these steps may take a very long time, please check the logs for more information\n\n'
        )
        service_manager = ServiceFactory.get_manager()

        ###############
        # VALIDATIONS #
        ###############
        try:
            if not isinstance(node_ip, str):
                raise ValueError('Node IP must be a string')
            node_ip = node_ip.strip()
            if not re.match(SSHClient.IP_REGEX, node_ip):
                raise ValueError('Invalid IP {0} specified'.format(node_ip))

            storage_router_all = sorted(StorageRouterList.get_storagerouters(),
                                        key=lambda k: k.name)
            storage_router_masters = StorageRouterList.get_masters()
            storage_router_all_ips = set(
                [storage_router.ip for storage_router in storage_router_all])
            storage_router_master_ips = set([
                storage_router.ip for storage_router in storage_router_masters
            ])
            storage_router_to_remove = StorageRouterList.get_by_ip(node_ip)
            offline_reasons = {}
            if node_ip not in storage_router_all_ips:
                raise ValueError(
                    'Unknown IP specified\nKnown in model:\n - {0}\nSpecified for removal:\n - {1}'
                    .format('\n - '.join(storage_router_all_ips), node_ip))

            if len(storage_router_all_ips) == 1:
                raise RuntimeError("Removing the only node is not possible")

            if node_ip in storage_router_master_ips and len(
                    storage_router_master_ips) == 1:
                raise RuntimeError(
                    "Removing the only master node is not possible")

            if System.get_my_storagerouter() == storage_router_to_remove:
                raise RuntimeError(
                    'The node to be removed cannot be identical to the node on which the removal is initiated'
                )

            Toolbox.log(
                logger=NodeRemovalController._logger,
                messages='Creating SSH connections to remaining master nodes')
            master_ip = None
            ip_client_map = {}
            storage_routers_offline = []
            storage_router_to_remove_online = True
            for storage_router in storage_router_all:
                try:
                    client = SSHClient(storage_router,
                                       username='******',
                                       timeout=10)
                except (UnableToConnectException, NotAuthenticatedException,
                        TimeOutException) as ex:
                    if isinstance(ex, UnableToConnectException):
                        msg = 'Unable to connect'
                    elif isinstance(ex, NotAuthenticatedException):
                        msg = 'Could not authenticate'
                    elif isinstance(ex, TimeOutException):
                        msg = 'Connection timed out'
                    Toolbox.log(
                        logger=NodeRemovalController._logger,
                        messages='  * Node with IP {0:<15}- {1}'.format(
                            storage_router.ip, msg))
                    offline_reasons[storage_router.ip] = msg
                    storage_routers_offline.append(storage_router)
                    if storage_router == storage_router_to_remove:
                        storage_router_to_remove_online = False
                    continue

                Toolbox.log(
                    logger=NodeRemovalController._logger,
                    messages='  * Node with IP {0:<15}- Successfully connected'
                    .format(storage_router.ip))
                ip_client_map[storage_router.ip] = client
                if storage_router != storage_router_to_remove and storage_router.node_type == 'MASTER':
                    master_ip = storage_router.ip

            if len(ip_client_map) == 0 or master_ip is None:
                raise RuntimeError(
                    'Could not connect to any master node in the cluster')

            storage_router_to_remove.invalidate_dynamics('vdisks_guids')
            if len(
                    storage_router_to_remove.vdisks_guids
            ) > 0:  # vDisks are supposed to be moved away manually before removing a node
                raise RuntimeError(
                    "Still vDisks attached to Storage Router {0}".format(
                        storage_router_to_remove.name))

            internal_memcached = Toolbox.is_service_internally_managed(
                service='memcached')
            internal_rabbit_mq = Toolbox.is_service_internally_managed(
                service='rabbitmq')
            memcached_endpoints = Configuration.get(
                key='/ovs/framework/memcache|endpoints')
            rabbit_mq_endpoints = Configuration.get(
                key='/ovs/framework/messagequeue|endpoints')
            copy_memcached_endpoints = list(memcached_endpoints)
            copy_rabbit_mq_endpoints = list(rabbit_mq_endpoints)
            for endpoint in memcached_endpoints:
                if endpoint.startswith(storage_router_to_remove.ip):
                    copy_memcached_endpoints.remove(endpoint)
            for endpoint in rabbit_mq_endpoints:
                if endpoint.startswith(storage_router_to_remove.ip):
                    copy_rabbit_mq_endpoints.remove(endpoint)
            if len(copy_memcached_endpoints
                   ) == 0 and internal_memcached is True:
                raise RuntimeError(
                    'Removal of provided nodes will result in a complete removal of the memcached service'
                )
            if len(copy_rabbit_mq_endpoints
                   ) == 0 and internal_rabbit_mq is True:
                raise RuntimeError(
                    'Removal of provided nodes will result in a complete removal of the messagequeue service'
                )

            Toolbox.run_hooks(component='noderemoval',
                              sub_component='validate_removal',
                              logger=NodeRemovalController._logger,
                              cluster_ip=storage_router_to_remove.ip)
        except KeyboardInterrupt:
            Toolbox.log(logger=NodeRemovalController._logger, messages='\n')
            Toolbox.log(
                logger=NodeRemovalController._logger,
                messages=
                'Removal has been aborted during the validation step. No changes have been applied.',
                boxed=True,
                loglevel='warning')
            sys.exit(1)
        except Exception as exception:
            Toolbox.log(logger=NodeRemovalController._logger,
                        messages=[str(exception)],
                        boxed=True,
                        loglevel='exception')
            sys.exit(1)

        #################
        # CONFIRMATIONS #
        #################
        try:
            interactive = silent != '--force-yes'
            remove_asd_manager = not interactive  # Remove ASD manager if non-interactive else ask
            if interactive is True:
                if len(storage_routers_offline) > 0:
                    Toolbox.log(
                        logger=NodeRemovalController._logger,
                        messages=
                        'Certain nodes appear to be offline. These will not be fully removed and will cause issues if they are not actually offline.'
                    )
                    Toolbox.log(
                        logger=NodeRemovalController._logger,
                        messages='Offline nodes: {0}'.format(''.join(
                            ('\n  * {0:<15}- {1}.'.format(ip, message)
                             for ip, message in offline_reasons.iteritems()))))
                    valid_node_info = Interactive.ask_yesno(
                        message=
                        'Continue the removal with these nodes presumed to be offline?',
                        default_value=False)
                    if valid_node_info is False:
                        Toolbox.log(
                            logger=NodeRemovalController._logger,
                            messages=
                            'Please validate the state of the nodes before removing.',
                            title=True)
                        sys.exit(1)
                proceed = Interactive.ask_yesno(
                    message='Are you sure you want to remove node {0}?'.format(
                        storage_router_to_remove.name),
                    default_value=False)
                if proceed is False:
                    Toolbox.log(logger=NodeRemovalController._logger,
                                messages='Abort removal',
                                title=True)
                    sys.exit(1)

                remove_asd_manager = True
                if storage_router_to_remove_online is True:
                    client = SSHClient(endpoint=storage_router_to_remove,
                                       username='******')
                    if service_manager.has_service(name='asd-manager',
                                                   client=client):
                        remove_asd_manager = Interactive.ask_yesno(
                            message=
                            'Do you also want to remove the ASD manager and related ASDs?',
                            default_value=False)

                if remove_asd_manager is True or storage_router_to_remove_online is False:
                    for fct in Toolbox.fetch_hooks('noderemoval',
                                                   'validate_asd_removal'):
                        validation_output = fct(storage_router_to_remove.ip)
                        if validation_output['confirm'] is True:
                            if Interactive.ask_yesno(
                                    message=validation_output['question'],
                                    default_value=False) is False:
                                remove_asd_manager = False
                                break
        except KeyboardInterrupt:
            Toolbox.log(logger=NodeRemovalController._logger, messages='\n')
            Toolbox.log(
                logger=NodeRemovalController._logger,
                messages=
                'Removal has been aborted during the confirmation step. No changes have been applied.',
                boxed=True,
                loglevel='warning')
            sys.exit(1)
        except Exception as exception:
            Toolbox.log(logger=NodeRemovalController._logger,
                        messages=[str(exception)],
                        boxed=True,
                        loglevel='exception')
            sys.exit(1)
        ###########
        # REMOVAL #
        ###########
        try:
            Toolbox.log(logger=NodeRemovalController._logger,
                        messages='Starting removal of node {0} - {1}'.format(
                            storage_router_to_remove.name,
                            storage_router_to_remove.ip))
            if storage_router_to_remove_online is False:
                Toolbox.log(
                    logger=NodeRemovalController._logger,
                    messages=
                    '  Marking all Storage Drivers served by Storage Router {0} as offline'
                    .format(storage_router_to_remove.ip))
                StorageDriverController.mark_offline(
                    storagerouter_guid=storage_router_to_remove.guid)

            # Remove vPools
            Toolbox.log(logger=NodeRemovalController._logger,
                        messages='  Removing vPools from node {0}'.format(
                            storage_router_to_remove.ip))
            storage_routers_offline_guids = [
                sr.guid for sr in storage_routers_offline
                if sr.guid != storage_router_to_remove.guid
            ]
            for storage_driver in storage_router_to_remove.storagedrivers:
                Toolbox.log(logger=NodeRemovalController._logger,
                            messages='    Removing vPool {0} from node'.format(
                                storage_driver.vpool.name))
                VPoolController.shrink_vpool(
                    storagedriver_guid=storage_driver.guid,
                    offline_storage_router_guids=storage_routers_offline_guids)

            # Demote if MASTER
            if storage_router_to_remove.node_type == 'MASTER':
                NodeTypeController.demote_node(
                    cluster_ip=storage_router_to_remove.ip,
                    master_ip=master_ip,
                    ip_client_map=ip_client_map,
                    unique_id=storage_router_to_remove.machine_id,
                    unconfigure_memcached=internal_memcached,
                    unconfigure_rabbitmq=internal_rabbit_mq,
                    offline_nodes=storage_routers_offline)

            # Stop / remove services
            Toolbox.log(logger=NodeRemovalController._logger,
                        messages='Stopping and removing services')
            if storage_router_to_remove_online is True:
                client = SSHClient(endpoint=storage_router_to_remove,
                                   username='******')
                NodeRemovalController.remove_services(
                    client=client,
                    node_type=storage_router_to_remove.node_type.lower(),
                    logger=NodeRemovalController._logger)
                service = 'watcher-config'
                if service_manager.has_service(service, client=client):
                    Toolbox.log(
                        logger=NodeRemovalController._logger,
                        messages='Removing service {0}'.format(service))
                    service_manager.stop_service(service, client=client)
                    service_manager.remove_service(service, client=client)

            Toolbox.run_hooks(component='noderemoval',
                              sub_component='remove',
                              logger=NodeRemovalController._logger,
                              cluster_ip=storage_router_to_remove.ip,
                              complete_removal=remove_asd_manager)

            # Clean up model
            Toolbox.log(logger=NodeRemovalController._logger,
                        messages='Removing node from model')
            for service in storage_router_to_remove.services:
                service.delete()
            for disk in storage_router_to_remove.disks:
                for partition in disk.partitions:
                    partition.delete()
                disk.delete()
            for j_domain in storage_router_to_remove.domains:
                j_domain.delete()
            Configuration.delete('/ovs/framework/hosts/{0}'.format(
                storage_router_to_remove.machine_id))

            NodeTypeController.restart_framework_and_memcache_services(
                clients=ip_client_map,
                offline_node_ips=[node.ip for node in storage_routers_offline],
                logger=NodeRemovalController._logger)

            if storage_router_to_remove_online is True:
                client = SSHClient(endpoint=storage_router_to_remove,
                                   username='******')
                client.file_delete(filenames=[CACC_LOCATION])
                client.file_delete(filenames=[CONFIG_STORE_LOCATION])
            storage_router_to_remove.delete()
            Toolbox.log(logger=NodeRemovalController._logger,
                        messages='Successfully removed node\n')
        except Exception as exception:
            Toolbox.log(logger=NodeRemovalController._logger, messages='\n')
            Toolbox.log(
                logger=NodeRemovalController._logger,
                messages=['An unexpected error occurred:',
                          str(exception)],
                boxed=True,
                loglevel='exception')
            sys.exit(1)
        except KeyboardInterrupt:
            Toolbox.log(logger=NodeRemovalController._logger, messages='\n')
            Toolbox.log(
                logger=NodeRemovalController._logger,
                messages=
                'This removal was aborted. Open vStorage may be in an inconsistent state, make sure to validate the installation.',
                boxed=True,
                loglevel='error')
            sys.exit(1)

        if remove_asd_manager is True and storage_router_to_remove_online is True:
            Toolbox.log(logger=NodeRemovalController._logger,
                        messages='\nRemoving ASD Manager')
            with remote(storage_router_to_remove.ip, [os]) as rem:
                rem.os.system('asd-manager remove --force-yes')
        Toolbox.log(logger=NodeRemovalController._logger,
                    messages='Remove nodes finished',
                    title=True)
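A minimal invocation sketch for the snippet above; the IP is a placeholder and the module path is an assumption (the class is assumed to live in ovs.lib.noderemoval, as in the wider Open vStorage framework):

from ovs.lib.noderemoval import NodeRemovalController

# Non-interactive removal of the node at the given (hypothetical) IP; '--force-yes' skips all confirmation prompts
NodeRemovalController.remove_node(node_ip='10.100.199.151', silent='--force-yes')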
Ejemplo n.º 29
0
    def run_test(cls, vm_info, cluster_info, logger=LOGGER):
        """
        Tests the DTL using a virtual machine which writes to its own filesystem
        Expects the last data to be pulled from the DTL and not from the backend
        :param cluster_info: information about the cluster, contains all dal objects
        :type cluster_info: dict
        :param vm_info: info about the vms
        :param logger: logging instance
        :return: None
        :rtype: NoneType
        """
        source_std = cluster_info['storagedrivers']['source']
        source_client = SSHClient(source_std.storagerouter, username='******')

        compute_str = cluster_info['storagerouters']['compute']
        compute_client = SSHClient(compute_str)

        # setup hypervisor details
        parent_hypervisor = HypervisorFactory().get()
        vm_to_stop = cls.HYPERVISOR_INFO['vms'][source_std.storage_ip]['name']

        vdisk_info = {}
        disk_amount = 0
        for vm_name, vm_object in vm_info.iteritems():
            for vdisk in vm_object['vdisks']:
                # Ignore the cd vdisk as no IO will come from it
                if vdisk.name == vm_object['cd_path'].replace(
                        '.raw', '').split('/')[-1]:
                    continue
                disk_amount += 1
                vdisk_info.update({vdisk.name: vdisk})

        # Cache to validate properties
        values_to_check = {
            'source_std': source_std.serialize(),
            'vdisks': vdisk_info.values()
        }

        with remote(compute_str.ip, [SSHClient]) as rem:
            threads = {'evented': {'io': {'pairs': [], 'r_semaphore': None}}}
            vm_downed = False
            output_files = []
            try:
                for vm_name, vm_data in vm_info.iteritems():
                    vm_client = rem.SSHClient(vm_data['ip'], cls.VM_USERNAME,
                                              cls.VM_PASSWORD)
                    vm_client.file_create('/mnt/data/{0}.raw'.format(
                        vm_data['create_msg']))
                    vm_data['client'] = vm_client
                    # Load dd, md5sum, screen & fio in memory
                    vm_data['client'].run([
                        'dd', 'if=/dev/urandom',
                        'of={0}'.format(cls.VM_RANDOM), 'bs=1M', 'count=2'
                    ])
                    vm_data['client'].run(['md5sum', cls.VM_RANDOM])

                logger.info("Stopping proxy services")
                service_manager = ServiceFactory.get_manager()

                for proxy in source_std.alba_proxies:
                    service_manager.restart_service(proxy.service.name,
                                                    client=source_client)

                logger.info(
                    'Starting to WRITE file while proxy is offline. All data should be stored in the DTL!'
                )
                for vm_name, vm_data in vm_info.iteritems():
                    vm_data['client'].run(
                        'dd if=/dev/urandom of={0} bs=1M count=2'.format(
                            cls.VM_FILENAME).split())
                    original_md5sum = ' '.join(vm_data['client'].run(
                        ['md5sum', cls.VM_FILENAME]).split())
                    vm_data['original_md5sum'] = original_md5sum
                    logger.info('Original MD5SUM for VM {0}: {1}.'.format(
                        vm_name, original_md5sum))
                logger.info('Finished WRITING file while proxy is offline!')
                logger.info(
                    'Starting fio to generate IO for failing over.')
                io_thread_pairs, monitoring_data, io_r_semaphore = ThreadingHandler.start_io_polling_threads(
                    volume_bundle=vdisk_info)
                threads['evented']['io']['pairs'] = io_thread_pairs
                threads['evented']['io']['r_semaphore'] = io_r_semaphore
                for vm_name, vm_data in vm_info.iteritems():  # Write data
                    screen_names, output_files = DataWriter.write_data_fio(
                        client=vm_data['client'],
                        fio_configuration={
                            'io_size': cls.AMOUNT_TO_WRITE,
                            'configuration': cls.IO_PATTERN
                        },
                        file_locations=[
                            '/mnt/data/{0}.raw'.format(vm_data['create_msg'])
                        ])
                    vm_data['screen_names'] = screen_names
                logger.info(
                    'Doing IO for {0}s before bringing down the node.'.format(
                        cls.IO_TIME))
                ThreadingHandler.keep_threads_running(
                    r_semaphore=io_r_semaphore,
                    threads=io_thread_pairs,
                    shared_resource=monitoring_data,
                    duration=cls.IO_TIME)
                ##############################################
                # Bringing original owner of the volume down #
                ##############################################
                VMHandler.stop_vm(hypervisor=parent_hypervisor,
                                  vmid=vm_to_stop)
                vm_downed = True
                downed_time = time.time()
                time.sleep(cls.IO_REFRESH_RATE * 2)
                # Start IO polling to verify nothing went down
                ThreadingHandler.poll_io(
                    r_semaphore=io_r_semaphore,
                    required_thread_amount=len(io_thread_pairs),
                    shared_resource=monitoring_data,
                    downed_time=downed_time,
                    timeout=cls.HA_TIMEOUT,
                    output_files=output_files,
                    client=compute_client,
                    disk_amount=disk_amount)
                logger.info('Starting to validate move...')
                cls._validate_move(values_to_check)
                logger.info('Finished validating move!')
                logger.info('Validate if DTL is working correctly!')
                unmatching_checksum_vms = []
                for vm_name, vm_data in vm_info.iteritems():
                    current_md5sum = ' '.join(vm_data['client'].run(
                        ['md5sum', cls.VM_FILENAME]).split())
                    if vm_data['original_md5sum'] != current_md5sum:
                        unmatching_checksum_vms.append(vm_name)
                assert len(
                    unmatching_checksum_vms
                ) == 0, 'Not all data was read from the DTL. Checksums do not line up for {}'.format(
                    ', '.join(unmatching_checksum_vms))
                logger.info('DTL is working correctly!')
            finally:
                for thread_category, thread_collection in threads[
                        'evented'].iteritems():
                    ThreadHelper.stop_evented_threads(
                        thread_collection['pairs'],
                        thread_collection['r_semaphore'])
                if vm_downed is True:
                    VMHandler.start_vm(parent_hypervisor, vm_to_stop)
                    logger.debug('Started {0}'.format(vm_to_stop))
                    SystemHelper.idle_till_ovs_is_up(source_std.storage_ip,
                                                     **cls.get_shell_user())
                    # @TODO: Remove when https://github.com/openvstorage/integrationtests/issues/540 is fixed
                    FwkHandler.restart_all()
                for vm_name, vm_data in vm_info.iteritems():
                    for screen_name in vm_data.get('screen_names', []):
                        logger.debug('Stopping screen {0} on {1}.'.format(
                            screen_name, vm_data['client'].ip))
                        vm_data['client'].run(
                            ['screen', '-S', screen_name, '-X', 'quit'])
                    vm_data['screen_names'] = []
Ejemplo n.º 30
0
class VPoolController(object):
    """
    Contains all BLL related to VPools
    """
    _logger = Logger('lib')
    _service_manager = ServiceFactory.get_manager()

    @classmethod
    @ovs_task(name='ovs.storagerouter.add_vpool')
    def add_vpool(cls, parameters):
        """
        Add a vPool to the machine this task is running on
        :param parameters: Parameters for vPool creation
        :type parameters: dict
        :return: None
        :rtype: NoneType
        """
        # TODO: Add logging
        cls._logger.debug('Adding vpool. Parameters: {}'.format(parameters))
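        # Keys read from 'parameters' below (inferred from the accesses in this method; all lookups use .get):
        #   storagerouter_ip, vpool_name, storage_ip, caching_info,
        #   backend_info, backend_info_bc, backend_info_fc,
        #   connection_info, connection_info_bc, connection_info_fc,
        #   config_params, mds_config_params, writecache_size, parallelism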
        # VALIDATIONS
        if not isinstance(parameters, dict):
            raise ValueError(
                'Parameters passed to create a vPool should be of type dict')

        # Check StorageRouter existence
        storagerouter = StorageRouterList.get_by_ip(
            ip=parameters.get('storagerouter_ip'))
        if storagerouter is None:
            raise RuntimeError('Could not find StorageRouter')

        # Validate requested vPool configurations
        vp_installer = VPoolInstaller(name=parameters.get('vpool_name'))
        vp_installer.validate(storagerouter=storagerouter)

        # Validate requested StorageDriver configurations
        cls._logger.info(
            'vPool {0}: Validating StorageDriver configurations'.format(
                vp_installer.name))
        sd_installer = StorageDriverInstaller(
            vp_installer=vp_installer,
            configurations={
                'storage_ip': parameters.get('storage_ip'),
                'caching_info': parameters.get('caching_info'),
                'backend_info': {
                    'main':
                    parameters.get('backend_info'),
                    StorageDriverConfiguration.CACHE_BLOCK:
                    parameters.get('backend_info_bc'),
                    StorageDriverConfiguration.CACHE_FRAGMENT:
                    parameters.get('backend_info_fc')
                },
                'connection_info': {
                    'main':
                    parameters.get('connection_info'),
                    StorageDriverConfiguration.CACHE_BLOCK:
                    parameters.get('connection_info_bc'),
                    StorageDriverConfiguration.CACHE_FRAGMENT:
                    parameters.get('connection_info_fc')
                },
                'sd_configuration': parameters.get('config_params')
            })

        partitions_mutex = volatile_mutex('add_vpool_partitions_{0}'.format(
            storagerouter.guid))
        try:
            # VPOOL CREATION
            # Create the vPool as soon as possible in the process so it can be displayed in the GUI (INSTALLING/EXTENDING state)
            if vp_installer.is_new is True:
                vp_installer.create(rdma_enabled=sd_installer.rdma_enabled)
                vp_installer.configure_mds(
                    config=parameters.get('mds_config_params', {}))
            else:
                vp_installer.update_status(status=VPool.STATUSES.EXTENDING)

            # ADDITIONAL VALIDATIONS
            # Check StorageRouter connectivity
            cls._logger.info(
                'vPool {0}: Validating StorageRouter connectivity'.format(
                    vp_installer.name))
            linked_storagerouters = [storagerouter]
            if vp_installer.is_new is False:
                linked_storagerouters += [
                    sd.storagerouter
                    for sd in vp_installer.vpool.storagedrivers
                ]

            sr_client_map = SSHClient.get_clients(
                endpoints=linked_storagerouters, user_names=['ovs', 'root'])
            offline_nodes = sr_client_map.pop('offline')
            if storagerouter in offline_nodes:
                raise RuntimeError(
                    'Node on which the vPool is being {0} is not reachable'.
                    format('created'
                           if vp_installer.is_new is True else 'extended'))

            sr_installer = StorageRouterInstaller(
                root_client=sr_client_map[storagerouter]['root'],
                sd_installer=sd_installer,
                vp_installer=vp_installer,
                storagerouter=storagerouter)

            # When 2 or more jobs simultaneously run on the same StorageRouter, we need to check and create the StorageDriver partitions in a locked context
            partitions_mutex.acquire(wait=60)
            sr_installer.partition_info = StorageRouterController.get_partition_info(
                storagerouter_guid=storagerouter.guid)
            sr_installer.validate_vpool_extendable()
            sr_installer.validate_global_write_buffer(
                requested_size=parameters.get('writecache_size', 0))
            sr_installer.validate_local_cache_size(
                requested_proxies=parameters.get('parallelism', {}).get(
                    'proxies', 2))

            # MODEL STORAGEDRIVER AND PARTITION JUNCTIONS
            sd_installer.create()
            sd_installer.create_partitions()
            partitions_mutex.release()

            vp_installer.refresh_metadata()
        except Exception:
            cls._logger.exception(
                'Something went wrong during the validation or modeling of vPool {0} on StorageRouter {1}'
                .format(vp_installer.name, storagerouter.name))
            partitions_mutex.release()
            vp_installer.revert_vpool(status=VPool.STATUSES.RUNNING)
            raise

        # Arakoon setup
        counter = 0
        while counter < 300:
            try:
                if StorageDriverController.manual_voldrv_arakoon_checkup(
                ) is True:
                    break
            except Exception:
                cls._logger.exception(
                    'Arakoon checkup for voldrv cluster failed')
                vp_installer.revert_vpool(status=VPool.STATUSES.RUNNING)
                raise
            counter += 1
            time.sleep(1)
            if counter == 300:
                vp_installer.revert_vpool(status=VPool.STATUSES.RUNNING)
                raise RuntimeError(
                    'Arakoon checkup for the StorageDriver cluster could not be started'
                )

        # Cluster registry
        try:
            vp_installer.configure_cluster_registry(allow_raise=True)
        except Exception:
            if vp_installer.is_new is True:
                vp_installer.revert_vpool(status=VPool.STATUSES.RUNNING)
            else:
                vp_installer.revert_vpool(status=VPool.STATUSES.FAILURE)
            raise

        try:
            sd_installer.setup_proxy_configs()
            sd_installer.configure_storagedriver_service()
            DiskController.sync_with_reality(storagerouter.guid)
            MDSServiceController.prepare_mds_service(
                storagerouter=storagerouter, vpool=vp_installer.vpool)

            # Update the MDS safety if changed via API (vpool.configuration will be available at this point also for the newly added StorageDriver)
            vp_installer.vpool.invalidate_dynamics('configuration')
            if vp_installer.mds_safety is not None and vp_installer.vpool.configuration[
                    'mds_config']['mds_safety'] != vp_installer.mds_safety:
                Configuration.set(
                    key='/ovs/vpools/{0}/mds_config|mds_safety'.format(
                        vp_installer.vpool.guid),
                    value=vp_installer.mds_safety)

            # Create and start watcher volumedriver, DTL, proxies and StorageDriver services
            sd_installer.start_services()

            # Post creation/extension checkups
            mds_config_set = MDSServiceController.get_mds_storagedriver_config_set(
                vpool=vp_installer.vpool, offline_nodes=offline_nodes)
            for sr, clients in sr_client_map.iteritems():
                for current_storagedriver in [
                        sd for sd in sr.storagedrivers
                        if sd.vpool_guid == vp_installer.vpool.guid
                ]:
                    storagedriver_config = StorageDriverConfiguration(
                        vpool_guid=vp_installer.vpool.guid,
                        storagedriver_id=current_storagedriver.storagedriver_id
                    )
                    if storagedriver_config.config_missing is False:
                        # Filesystem section in StorageDriver configuration are all parameters used for vDisks created directly on the filesystem
                        # So when a vDisk gets created on the filesystem, these MDSes will be assigned to them
                        storagedriver_config.configure_filesystem(
                            fs_metadata_backend_mds_nodes=mds_config_set[
                                sr.guid])
                        storagedriver_config.save(client=clients['ovs'])

            # Everything's reconfigured, refresh new cluster configuration
            for current_storagedriver in vp_installer.vpool.storagedrivers:
                if current_storagedriver.storagerouter not in sr_client_map:
                    continue
                vp_installer.vpool.storagedriver_client.update_cluster_node_configs(
                    str(current_storagedriver.storagedriver_id),
                    req_timeout_secs=10)
        except Exception:
            cls._logger.exception('vPool {0}: Creation failed'.format(
                vp_installer.name))
            vp_installer.update_status(status=VPool.STATUSES.FAILURE)
            raise

        # When a node is offline, we can run into errors, but also when 1 or more volumes are not running
        # Scheduled tasks below, so don't really care whether they succeed or not
        try:
            VDiskController.dtl_checkup(vpool_guid=vp_installer.vpool.guid,
                                        ensure_single_timeout=600)
        except Exception:
            pass
        for vdisk in vp_installer.vpool.vdisks:
            try:
                MDSServiceController.ensure_safety(vdisk_guid=vdisk.guid)
            except Exception:
                pass
        vp_installer.update_status(status=VPool.STATUSES.RUNNING)
        cls._logger.info('Add vPool {0} ended successfully'.format(
            vp_installer.name))

    # @classmethod
    # @ovs_task(name='ovs.vpool.extend_vpool')
    # def extend_vpool(cls, ...):
    # TODO: Make sure extend and create cannot be executed at the same time for the same vPool
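    # A minimal sketch of how the TODO above might be tackled, reusing the same
    # volatile_mutex helper that guards the heartbeat further down; the lock name is a
    # hypothetical example and this is not part of the original module:
    #
    #     with volatile_mutex('vpool_add_extend_{0}'.format(vp_installer.name)):
    #         ...  # run the create/extend flow while holding the per-vPool lock
    #
    # Serialising on the vPool name would keep a concurrent create and extend of the
    # same vPool from interleaving their modeling and partition steps.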

    @classmethod
    @ovs_task(name='ovs.vpool.shrink_vpool')
    def shrink_vpool(cls,
                     storagedriver_guid,
                     offline_storage_router_guids=list()):
        """
        Removes a StorageDriver (if it's the last StorageDriver for a vPool, the vPool is removed as well)
        :param storagedriver_guid: Guid of the StorageDriver to remove
        :type storagedriver_guid: str
        :param offline_storage_router_guids: Guids of StorageRouters which are offline and will be removed from the cluster.
                                             Whether the vPool itself gets deleted depends on this list
        :type offline_storage_router_guids: list
        :return: None
        :rtype: NoneType
        """
        # TODO: Add logging
        # TODO: Unit test individual pieces of code
        # Validations
        storagedriver = StorageDriver(storagedriver_guid)
        storagerouter = storagedriver.storagerouter
        cls._logger.info(
            'StorageDriver {0} - Deleting StorageDriver {1}'.format(
                storagedriver.guid, storagedriver.name))

        vp_installer = VPoolInstaller(name=storagedriver.vpool.name)
        vp_installer.validate(storagedriver=storagedriver)

        sd_installer = StorageDriverInstaller(vp_installer=vp_installer,
                                              storagedriver=storagedriver)

        cls._logger.info(
            'StorageDriver {0} - Checking availability of related StorageRouters'
            .format(storagedriver.guid, storagedriver.name))
        sr_client_map = SSHClient.get_clients(endpoints=[
            sd.storagerouter for sd in vp_installer.vpool.storagedrivers
        ],
                                              user_names=['root'])
        sr_installer = StorageRouterInstaller(root_client=sr_client_map.get(
            storagerouter, {}).get('root'),
                                              storagerouter=storagerouter,
                                              vp_installer=vp_installer,
                                              sd_installer=sd_installer)

        offline_srs = sr_client_map.pop('offline')
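        # The caller must explicitly list every unreachable StorageRouter; a mismatch
        # aborts the removal instead of silently skipping nodes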
        if sorted([sr.guid for sr in offline_srs
                   ]) != sorted(offline_storage_router_guids):
            raise RuntimeError('Not all StorageRouters are reachable')

        if storagerouter not in offline_srs:
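            # 'lsof -t +D <dir>' recursively scans the mount point and prints only the PIDs
            # of processes holding files under it; '|| true' keeps the command from failing
            # when nothing is found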
            mtpt_pids = sr_installer.root_client.run(
                "lsof -t +D '/mnt/{0}' || true".format(
                    vp_installer.name.replace(r"'", r"'\''")),
                allow_insecure=True).splitlines()
            if len(mtpt_pids) > 0:
                raise RuntimeError(
                    'vPool cannot be deleted. Following processes keep the vPool mount point occupied: {0}'
                    .format(', '.join(mtpt_pids)))

        # Retrieve reachable StorageDrivers
        reachable_storagedrivers = []
        for sd in vp_installer.vpool.storagedrivers:
            if sd.storagerouter not in sr_client_map:
                # StorageRouter is offline
                continue

            sd_key = '/ovs/vpools/{0}/hosts/{1}/config'.format(
                vp_installer.vpool.guid, sd.storagedriver_id)
            if Configuration.exists(sd_key) is True:
                path = Configuration.get_configuration_path(sd_key)
                with remote(sd.storagerouter.ip,
                            [LocalStorageRouterClient]) as rem:
                    try:
                        lsrc = rem.LocalStorageRouterClient(path)
                        # 'Cheap' call to verify whether volumedriver is responsive
                        lsrc.server_revision()
                        cls._logger.info(
                            'StorageDriver {0} - Responsive StorageDriver {1} on node with IP {2}'
                            .format(storagedriver.guid, sd.name,
                                    sd.storagerouter.ip))
                        reachable_storagedrivers.append(sd)
                    except Exception as exception:
                        if not is_connection_failure(exception):
                            raise

        if len(reachable_storagedrivers) == 0:
            raise RuntimeError(
                'Could not find any responsive node in the cluster')

        # Start removal
        if vp_installer.storagedriver_amount > 1:
            vp_installer.update_status(status=VPool.STATUSES.SHRINKING)
        else:
            vp_installer.update_status(status=VPool.STATUSES.DELETING)

        # Clean up stale vDisks
        cls._logger.info('StorageDriver {0} - Removing stale vDisks'.format(
            storagedriver.guid))
        VDiskController.remove_stale_vdisks(vpool=vp_installer.vpool)

        # Reconfigure the MDSes
        cls._logger.info('StorageDriver {0} - Reconfiguring MDSes'.format(
            storagedriver.guid))
        for vdisk_guid in storagerouter.vdisks_guids:
            try:
                MDSServiceController.ensure_safety(
                    vdisk_guid=vdisk_guid,
                    excluded_storagerouter_guids=[storagerouter.guid] +
                    offline_storage_router_guids)
            except Exception:
                cls._logger.exception(
                    'StorageDriver {0} - vDisk {1} - Ensuring MDS safety failed'
                    .format(storagedriver.guid, vdisk_guid))

        # Validate that all MDSes on current StorageRouter have been moved away
        # Ensure safety does not always throw an error, that's why we perform this check here instead of in the Exception clause of above code
        vdisks = []
        for mds in vp_installer.mds_services:
            for junction in mds.vdisks:
                vdisk = junction.vdisk
                if vdisk in vdisks:
                    continue
                vdisks.append(vdisk)
                cls._logger.critical(
                    'StorageDriver {0} - vDisk {1} {2} - MDS Services have not been migrated away'
                    .format(storagedriver.guid, vdisk.guid, vdisk.name))
        if len(vdisks) > 0:
            # Put back in RUNNING, so it can be used again. Errors keep on displaying in GUI now anyway
            vp_installer.update_status(status=VPool.STATUSES.RUNNING)
            raise RuntimeError(
                'Not all MDS Services have been successfully migrated away')

        # Start with actual removal
        errors_found = False
        if storagerouter not in offline_srs:
            errors_found &= sd_installer.stop_services()

        errors_found &= vp_installer.configure_cluster_registry(
            exclude=[storagedriver], apply_on=reachable_storagedrivers)
        errors_found &= vp_installer.update_node_distance_map()
        errors_found &= vp_installer.remove_mds_services()
        errors_found &= sd_installer.clean_config_management()
        errors_found &= sd_installer.clean_model()

        if storagerouter not in offline_srs:
            errors_found &= sd_installer.clean_directories(
                mountpoints=StorageRouterController.get_mountpoints(
                    client=sr_installer.root_client))

            try:
                DiskController.sync_with_reality(
                    storagerouter_guid=storagerouter.guid)
            except Exception:
                cls._logger.exception(
                    'StorageDriver {0} - Synchronizing disks with reality failed'
                    .format(storagedriver.guid))
                errors_found = True

        if vp_installer.storagedriver_amount > 1:
            # Update the vPool metadata and run DTL checkup
            vp_installer.vpool.metadata['caching_info'].pop(
                sr_installer.storagerouter.guid, None)
            vp_installer.vpool.save()

            try:
                VDiskController.dtl_checkup(vpool_guid=vp_installer.vpool.guid,
                                            ensure_single_timeout=600)
            except Exception:
                cls._logger.exception(
                    'StorageDriver {0} - DTL checkup failed for vPool {1} with guid {2}'
                    .format(storagedriver.guid, vp_installer.name,
                            vp_installer.vpool.guid))
        else:
            cls._logger.info(
                'StorageDriver {0} - Removing vPool from model'.format(
                    storagedriver.guid))
            # Clean up model
            try:
                vp_installer.vpool.delete()
            except Exception:
                errors_found = True
                cls._logger.exception(
                    'StorageDriver {0} - Cleaning up vPool from the model failed'
                    .format(storagedriver.guid))
            Configuration.delete('/ovs/vpools/{0}'.format(
                vp_installer.vpool.guid))

        cls._logger.info('StorageDriver {0} - Running MDS checkup'.format(
            storagedriver.guid))
        try:
            MDSServiceController.mds_checkup()
        except Exception:
            cls._logger.exception(
                'StorageDriver {0} - MDS checkup failed'.format(
                    storagedriver.guid))

        # Update vPool status
        if errors_found is True:
            if vp_installer.storagedriver_amount > 1:
                vp_installer.update_status(status=VPool.STATUSES.FAILURE)
            raise RuntimeError(
                '1 or more errors occurred while trying to remove the StorageDriver. Please check the logs for more information'
            )

        if vp_installer.storagedriver_amount > 1:
            vp_installer.update_status(status=VPool.STATUSES.RUNNING)
        cls._logger.info(
            'StorageDriver {0} - Deleted StorageDriver {1}'.format(
                storagedriver.guid, storagedriver.name))

        if len(VPoolList.get_vpools()) == 0:
            cluster_name = ArakoonInstaller.get_cluster_name('voldrv')
            if ArakoonInstaller.get_arakoon_metadata_by_cluster_name(
                    cluster_name=cluster_name)['internal'] is True:
                cls._logger.debug(
                    'StorageDriver {0} - Removing Arakoon cluster {1}'.format(
                        storagedriver.guid, cluster_name))
                try:
                    installer = ArakoonInstaller(cluster_name=cluster_name)
                    installer.load()
                    installer.delete_cluster()
                except Exception:
                    cls._logger.exception(
                        'StorageDriver {0} - Delete voldrv Arakoon cluster failed'
                        .format(storagedriver.guid))
                service_type = ServiceTypeList.get_by_name(
                    ServiceType.SERVICE_TYPES.ARAKOON)
                service_name = ArakoonInstaller.get_service_name_for_cluster(
                    cluster_name=cluster_name)
                for service in list(service_type.services):
                    if service.name == service_name:
                        service.delete()

        # Remove watcher volumedriver service if last StorageDriver on current StorageRouter
        # ensure client is initialized for StorageRouter
        if len(storagerouter.storagedrivers) == 0 and storagerouter not in offline_srs:
            try:
                if cls._service_manager.has_service(
                        ServiceFactory.SERVICE_WATCHER_VOLDRV,
                        client=sr_installer.root_client):
                    cls._service_manager.stop_service(
                        ServiceFactory.SERVICE_WATCHER_VOLDRV,
                        client=sr_installer.root_client)
                    cls._service_manager.remove_service(
                        ServiceFactory.SERVICE_WATCHER_VOLDRV,
                        client=sr_installer.root_client)
            except Exception:
                cls._logger.exception(
                    'StorageDriver {0} - {1} service deletion failed'.format(
                        storagedriver.guid,
                        ServiceFactory.SERVICE_WATCHER_VOLDRV))

    @staticmethod
    @ovs_task(name='ovs.vpool.up_and_running')
    @log('VOLUMEDRIVER_TASK')
    def up_and_running(storagedriver_id):
        """
        Volumedriver informs us that the service is completely started. Post-start events can be executed
        :param storagedriver_id: ID of the storagedriver
        """
        storagedriver = StorageDriverList.get_by_storagedriver_id(
            storagedriver_id)
        if storagedriver is None:
            raise RuntimeError(
                'A Storage Driver with id {0} could not be found.'.format(
                    storagedriver_id))
        storagedriver.startup_counter += 1
        storagedriver.save()

    # noinspection PyTypeChecker
    @staticmethod
    @ovs_task(name='ovs.storagerouter.create_hprm_config_files')
    def create_hprm_config_files(vpool_guid, local_storagerouter_guid,
                                 parameters):
        """
        Create the required configuration files to be able to make use of HPRM (aka PRACC)
        This configuration will be zipped and made available for download
        :param vpool_guid: The guid of the VPool for which a HPRM manager needs to be deployed
        :type vpool_guid: str
        :param local_storagerouter_guid: The guid of the StorageRouter the API was requested on
        :type local_storagerouter_guid: str
        :param parameters: Additional information required for the HPRM configuration files
        :type parameters: dict
        :return: Name of the zipfile containing the configuration files
        :rtype: str
        """
        # Validations
        required_params = {
            'port': (int, {
                'min': 1,
                'max': 65535
            }),
            'identifier': (str, ExtensionsToolbox.regex_vpool)
        }
        ExtensionsToolbox.verify_required_params(
            actual_params=parameters, required_params=required_params)
        vpool = VPool(vpool_guid)
        identifier = parameters['identifier']
        config_path = None
        local_storagerouter = StorageRouter(local_storagerouter_guid)
        for sd in vpool.storagedrivers:
            if len(sd.alba_proxies) == 0:
                raise ValueError(
                    'No ALBA proxies configured for vPool {0} on StorageRouter {1}'
                    .format(vpool.name, sd.storagerouter.name))
            config_path = '/ovs/vpools/{0}/proxies/{1}/config/{{0}}'.format(
                vpool.guid, sd.alba_proxies[0].guid)

        if config_path is None:
            raise ValueError(
                'vPool {0} has not been extended to any StorageRouter'.format(
                    vpool.name))
        proxy_cfg = Configuration.get(key=config_path.format('main'))

        cache_info = {}
        arakoons = {}
        cache_types = VPool.CACHES.values()
        if not any(ctype in parameters for ctype in cache_types):
            raise ValueError(
                'At least one cache type should be passed: {0}'.format(
                    ', '.join(cache_types)))
        for ctype in cache_types:
            if ctype not in parameters:
                continue
            required_dict = {'read': (bool, None), 'write': (bool, None)}
            required_params.update({ctype: (dict, required_dict)})
            ExtensionsToolbox.verify_required_params(
                actual_params=parameters, required_params=required_params)
            read = parameters[ctype]['read']
            write = parameters[ctype]['write']
            if read is False and write is False:
                cache_info[ctype] = ['none']
                continue
            path = parameters[ctype].get('path')
            if path is not None:
                path = path.strip()
                if not path or path.endswith(
                        '/.') or '..' in path or '/./' in path:
                    raise ValueError('Invalid path specified')
                required_dict.update({
                    'path': (str, None),
                    'size': (int, {
                        'min': 1,
                        'max': 10 * 1024
                    })
                })
                ExtensionsToolbox.verify_required_params(
                    actual_params=parameters, required_params=required_params)
                while '//' in path:
                    path = path.replace('//', '/')
                cache_info[ctype] = [
                    'local', {
                        'path': path,
                        'max_size': parameters[ctype]['size'] * 1024**3,
                        'cache_on_read': read,
                        'cache_on_write': write
                    }
                ]
            else:
                required_dict.update({
                    'backend_info': (dict, {
                        'preset': (str, ExtensionsToolbox.regex_preset),
                        'alba_backend_guid':
                        (str, ExtensionsToolbox.regex_guid),
                        'alba_backend_name':
                        (str, ExtensionsToolbox.regex_backend)
                    }),
                    'connection_info': (dict, {
                        'host': (str, ExtensionsToolbox.regex_ip, False),
                        'port': (int, {
                            'min': 1,
                            'max': 65535
                        }, False),
                        'client_id':
                        (str, ExtensionsToolbox.regex_guid, False),
                        'client_secret': (str, None, False)
                    })
                })
                ExtensionsToolbox.verify_required_params(
                    actual_params=parameters, required_params=required_params)
                connection_info = parameters[ctype]['connection_info']
                if connection_info[
                        'host']:  # Remote Backend for accelerated Backend
                    alba_backend_guid = parameters[ctype]['backend_info'][
                        'alba_backend_guid']
                    ovs_client = OVSClient.get_instance(
                        connection_info=connection_info)
                    arakoon_config = VPoolShared.retrieve_alba_arakoon_config(
                        alba_backend_guid=alba_backend_guid,
                        ovs_client=ovs_client)
                    arakoons[ctype] = ArakoonClusterConfig.convert_config_to(
                        arakoon_config, return_type='INI')
                else:  # Local Backend for accelerated Backend
                    alba_backend_name = parameters[ctype]['backend_info'][
                        'alba_backend_name']
                    if Configuration.exists(key='/ovs/arakoon/{0}-abm/config'.
                                            format(alba_backend_name),
                                            raw=True) is False:
                        raise ValueError(
                            'Arakoon cluster for ALBA Backend {0} could not be retrieved'
                            .format(alba_backend_name))
                    arakoons[ctype] = Configuration.get(
                        key='/ovs/arakoon/{0}-abm/config'.format(
                            alba_backend_name),
                        raw=True)
                cache_info[ctype] = [
                    'alba', {
                        'albamgr_cfg_url':
                        '/etc/hprm/{0}/{1}_cache_arakoon.ini'.format(
                            identifier, ctype),
                        'bucket_strategy': [
                            '1-to-1', {
                                'prefix':
                                vpool.guid,
                                'preset':
                                parameters[ctype]['backend_info']['preset']
                            }
                        ],
                        'manifest_cache_size':
                        proxy_cfg['manifest_cache_size'],
                        'cache_on_read':
                        read,
                        'cache_on_write':
                        write
                    }
                ]

        tgz_name = 'hprm_config_files_{0}_{1}.tgz'.format(
            identifier, vpool.name)
        config = {
            'ips': ['127.0.0.1'],
            'port': parameters['port'],
            'pracc': {
                'uds_path': '/var/run/hprm/{0}/uds_path'.format(identifier),
                'max_clients': 1000,
                'max_read_buf_size': 64 * 1024,  # Buffer size for incoming requests (in bytes)
                'thread_pool_size': 64  # Amount of threads
            },
            'transport': 'tcp',
            'log_level': 'info',
            'read_preference': proxy_cfg['read_preference'],
            'albamgr_cfg_url': '/etc/hprm/{0}/arakoon.ini'.format(identifier),
            'manifest_cache_size': proxy_cfg['manifest_cache_size']
        }
        file_contents_map = {}
        for ctype in cache_types:
            if ctype in cache_info:
                config['{0}_cache'.format(ctype)] = cache_info[ctype]
            if ctype in arakoons:
                file_contents_map[
                    '/opt/OpenvStorage/config/{0}/{1}_cache_arakoon.ini'.
                    format(identifier, ctype)] = arakoons[ctype]
        file_contents_map.update({
            '/opt/OpenvStorage/config/{0}/config.json'.format(identifier):
            json.dumps(config, indent=4),
            '/opt/OpenvStorage/config/{0}/arakoon.ini'.format(identifier):
            Configuration.get(key=config_path.format('abm'), raw=True)
        })

        local_client = SSHClient(endpoint=local_storagerouter)
        local_client.dir_create(
            directories='/opt/OpenvStorage/config/{0}'.format(identifier))
        local_client.dir_create(
            directories='/opt/OpenvStorage/webapps/frontend/downloads')
        for file_name, contents in file_contents_map.iteritems():
            local_client.file_write(contents=contents, filename=file_name)
        local_client.run(command=[
            'tar', '--transform', 's#^config/{0}#{0}#'.format(identifier),
            '-czf', '/opt/OpenvStorage/webapps/frontend/downloads/{0}'.format(
                tgz_name), 'config/{0}'.format(identifier)
        ])
        local_client.dir_delete(
            directories='/opt/OpenvStorage/config/{0}'.format(identifier))
        return tgz_name
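    # A minimal sketch of a 'parameters' dict accepted by create_hprm_config_files above,
    # using a locally stored block cache. All values are hypothetical examples chosen to
    # satisfy the validations in the method, and 'block_cache' is assumed to be one of the
    # VPool.CACHES values:
    #
    #     parameters = {'port': 8080,
    #                   'identifier': 'hprm01',
    #                   'block_cache': {'read': True,
    #                                   'write': True,
    #                                   'path': '/mnt/hprm_cache',
    #                                   'size': 50}}  # GiB, multiplied by 1024 ** 3 in the resulting config
    #
    # Instead of 'path'/'size', a cache section can carry 'backend_info' and
    # 'connection_info' to point the cache at an (accelerated) ALBA Backend.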

    @staticmethod
    def retrieve_alba_arakoon_config(alba_backend_guid, ovs_client):
        """
        Retrieve the ALBA Arakoon configuration
        WARNING: YOU DO NOT BELONG HERE, PLEASE MOVE TO YOUR OWN PLUGIN
        :param alba_backend_guid: Guid of the ALBA Backend
        :type alba_backend_guid: str
        :param ovs_client: OVS client object
        :type ovs_client: OVSClient
        :return: Arakoon configuration information
        :rtype: dict
        """
        task_id = ovs_client.get(
            '/alba/backends/{0}/get_config_metadata'.format(alba_backend_guid))
        successful, arakoon_config = ovs_client.wait_for_task(task_id,
                                                              timeout=300)
        if successful is False:
            raise RuntimeError(
                'Could not load metadata from environment {0}'.format(
                    ovs_client.ip))
        return arakoon_config
Ejemplo n.º 31
0
 def _get_service_manager(cls):
     return ServiceFactory.get_manager()
Ejemplo n.º 32
0
class StorageRouterController(object):
    """
    Contains all BLL related to StorageRouter
    """
    _logger = Logger('lib')
    _log_level = LOG_LEVEL_MAPPING[_logger.getEffectiveLevel()]
    _os_manager = OSFactory.get_manager()
    _service_manager = ServiceFactory.get_manager()

    # noinspection PyCallByClass,PyTypeChecker
    storagerouterclient.Logger.setupLogging(
        Logger.load_path('storagerouterclient'), _log_level)
    # noinspection PyArgumentList
    storagerouterclient.Logger.enableLogging()

    @staticmethod
    @ovs_task(name='ovs.storagerouter.ping')
    def ping(storagerouter_guid, timestamp):
        """
        Update a StorageRouter's celery heartbeat
        :param storagerouter_guid: Guid of the StorageRouter to update
        :type storagerouter_guid: str
        :param timestamp: Timestamp to compare to
        :type timestamp: float
        :return: None
        :rtype: NoneType
        """
        with volatile_mutex(
                'storagerouter_heartbeat_{0}'.format(storagerouter_guid)):
            storagerouter = StorageRouter(storagerouter_guid)
            if timestamp > storagerouter.heartbeats.get('celery', 0):
                storagerouter.heartbeats['celery'] = timestamp
                storagerouter.save()

    @staticmethod
    @ovs_task(name='ovs.storagerouter.get_metadata')
    def get_metadata(storagerouter_guid):
        """
        Gets physical information about the specified StorageRouter
        :param storagerouter_guid: StorageRouter guid to retrieve the metadata for
        :type storagerouter_guid: str
        :return: Metadata information about the StorageRouter
        :rtype: dict
        """
        return {
            'partitions':
            StorageRouterController.get_partition_info(storagerouter_guid),
            'ipaddresses':
            StorageRouterController.get_ip_addresses(storagerouter_guid),
            'scrub_available':
            StorageRouterController.check_scrub_partition_present()
        }

    @staticmethod
    def get_ip_addresses(storagerouter_guid):
        """
        Retrieves the IP addresses of a StorageRouter
        :param storagerouter_guid: Guid of the StorageRouter
        :return: List of IP addresses
        :rtype: list
        """
        client = SSHClient(endpoint=StorageRouter(storagerouter_guid))
        return StorageRouterController._os_manager.get_ip_addresses(
            client=client)

    @staticmethod
    def get_partition_info(storagerouter_guid):
        """
        Retrieves information about the partitions of a StorageRouter
        :param storagerouter_guid: Guid of the StorageRouter
        :type storagerouter_guid: str
        :return: dict with information about the partitions
        :rtype: dict
        """
        storagerouter = StorageRouter(storagerouter_guid)
        client = SSHClient(endpoint=storagerouter)
        services_mds = ServiceTypeList.get_by_name(
            ServiceType.SERVICE_TYPES.MD_SERVER).services
        services_arakoon = [
            service for service in ServiceTypeList.get_by_name(
                ServiceType.SERVICE_TYPES.ARAKOON).services
            if service.name != 'arakoon-ovsdb' and service.is_internal is True
        ]

        partitions = dict((role, []) for role in DiskPartition.ROLES)
        for disk in storagerouter.disks:
            for disk_partition in disk.partitions:
                claimed_space_by_fwk = 0
                used_space_by_system = 0
                available_space_by_system = 0
                for storagedriver_partition in disk_partition.storagedrivers:
                    claimed_space_by_fwk += storagedriver_partition.size if storagedriver_partition.size is not None else 0
                    if client.dir_exists(storagedriver_partition.path):
                        try:
                            used_space_by_system += int(
                                client.run([
                                    'du', '-B', '1', '-d', '0',
                                    storagedriver_partition.path
                                ],
                                           timeout=5).split('\t')[0])
                        except Exception as ex:
                            StorageRouterController._logger.warning(
                                'Failed to get directory usage for {0}. {1}'.
                                format(storagedriver_partition.path, ex))

                if disk_partition.mountpoint is not None:
                    for alias in disk_partition.aliases:
                        StorageRouterController._logger.info(
                            'Verifying disk partition usage by checking path {0}'
                            .format(alias))
                        disk_partition_device = client.file_read_link(
                            path=alias)
                        try:
                            available_space_by_system = int(
                                client.run([
                                    'df', '-B', '1', '--output=avail',
                                    disk_partition_device
                                ],
                                           timeout=5).splitlines()[-1])
                            break
                        except Exception as ex:
                            StorageRouterController._logger.warning(
                                'Failed to get partition usage for {0}. {1}'.
                                format(disk_partition.mountpoint, ex))

                for role in disk_partition.roles:
                    size = 0 if disk_partition.size is None else disk_partition.size
                    if available_space_by_system > 0:
                        # Start from the df-reported available space, add back what the framework directories already consume
                        # (that usage falls within the claimed sizes) and subtract everything the framework has claimed,
                        # so besides those claims only space not managed by us counts as used (see the worked example after this method)
                        available = available_space_by_system + used_space_by_system - claimed_space_by_fwk
                    else:
                        available = size - claimed_space_by_fwk  # Subtract size for roles which have already been claimed by other vpools (but not necessarily already been fully used)

                    in_use = any(junction
                                 for junction in disk_partition.storagedrivers
                                 if junction.role == role)
                    if role == DiskPartition.ROLES.DB:
                        for service in services_arakoon:
                            if service.storagerouter_guid == storagerouter_guid:
                                in_use = True
                                break
                        for service in services_mds:
                            if service.storagerouter_guid == storagerouter_guid:
                                in_use = True
                                break

                    partitions[role].append({
                        'ssd': disk.is_ssd,
                        'guid': disk_partition.guid,
                        'size': size,
                        'in_use': in_use,
                        'usable': True,  # Sizes smaller than 5% of the largest WRITE partition or smaller than 1 GiB will be marked un-usable below
                        'available': available if available > 0 else 0,
                        'mountpoint': disk_partition.folder,  # Equal to the mount point unless the mount point is root ('/'), in which case it is prefixed with '/mnt/storage'
                        'storagerouter_guid': storagerouter_guid
                    })

        # Mark WRITE caches which are smaller than 5% of the largest write cache size or smaller than 1 GiB as un-usable
        writecache_sizes = []
        for partition_info in partitions[DiskPartition.ROLES.WRITE]:
            writecache_sizes.append(partition_info['available'])
        largest_write_cache = max(
            writecache_sizes) if len(writecache_sizes) > 0 else 0
        for index, size in enumerate(writecache_sizes):
            if size < largest_write_cache * 5 / 100 or size < 1024**3:
                partitions[DiskPartition.ROLES.WRITE][index]['usable'] = False

        return partitions
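    # Worked example (hypothetical numbers) for the 'available' figure computed above: if df
    # reports 80 GiB free on a partition, the framework directories on it hold 5 GiB
    # (used_space_by_system) and 30 GiB has been claimed for roles (claimed_space_by_fwk), then
    #
    #     available = 80 GiB + 5 GiB - 30 GiB = 55 GiB
    #
    # The framework's own usage is added back before subtracting everything it has claimed, so
    # only space not managed by us reduces the result. When df cannot be consulted, the fallback
    # is simply the partition size minus the claimed space.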

    @staticmethod
    @ovs_task(name='ovs.storagerouter.get_version_info')
    def get_version_info(storagerouter_guid):
        """
        Returns version information regarding a given StorageRouter
        :param storagerouter_guid: StorageRouter guid to get version information for
        :type storagerouter_guid: str
        :return: Version information
        :rtype: dict
        """
        package_manager = PackageFactory.get_manager()
        client = SSHClient(StorageRouter(storagerouter_guid))
        return {
            'storagerouter_guid':
            storagerouter_guid,
            'versions':
            dict((pkg_name, str(version)) for pkg_name, version in
                 package_manager.get_installed_versions(client).iteritems())
        }

    @staticmethod
    @ovs_task(name='ovs.storagerouter.get_support_info')
    def get_support_info():
        """
        Returns support information for the entire cluster
        :return: Support information
        :rtype: dict
        """
        celery_scheduling = Configuration.get(
            key='/ovs/framework/scheduling/celery', default={})
        stats_monkey_enabled = any(
            celery_scheduling.get(key) is not None for key in
            ['ovs.stats_monkey.run_all', 'alba.stats_monkey.run_all'])
        return {
            'cluster_id':
            Configuration.get(key='/ovs/framework/cluster_id'),
            'stats_monkey':
            stats_monkey_enabled,
            'support_agent':
            Configuration.get(key='/ovs/framework/support|support_agent'),
            'remote_access':
            Configuration.get(key='/ovs/framework/support|remote_access'),
            'stats_monkey_config':
            Configuration.get(key='/ovs/framework/monitoring/stats_monkey',
                              default={})
        }

    @staticmethod
    @ovs_task(name='ovs.storagerouter.get_support_metadata')
    def get_support_metadata():
        """
        Returns support metadata for a given StorageRouter. This should be a routed task!
        :return: Metadata of the StorageRouter
        :rtype: dict
        """
        return SupportAgent().get_heartbeat_data()

    @staticmethod
    @ovs_task(name='ovs.storagerouter.get_logfiles')
    def get_logfiles(local_storagerouter_guid):
        """
        Collects logs, moves them to a web-accessible location and returns log tgz's filename
        :param local_storagerouter_guid: StorageRouter guid to retrieve log files on
        :type local_storagerouter_guid: str
        :return: Name of tgz containing the logs
        :rtype: str
        """
        this_storagerouter = System.get_my_storagerouter()
        this_client = SSHClient(this_storagerouter, username='******')
        logfile = this_client.run(['ovs', 'collect', 'logs']).strip()
        logfilename = logfile.split('/')[-1]

        storagerouter = StorageRouter(local_storagerouter_guid)
        webpath = '/opt/OpenvStorage/webapps/frontend/downloads'
        client = SSHClient(storagerouter, username='******')
        client.dir_create(webpath)
        client.file_upload('{0}/{1}'.format(webpath, logfilename), logfile)
        client.run(['chmod', '666', '{0}/{1}'.format(webpath, logfilename)])
        return logfilename

    @staticmethod
    @ovs_task(name='ovs.storagerouter.get_proxy_config')
    def get_proxy_config(vpool_guid, storagerouter_guid):
        """
        Gets the ALBA proxy for a given StorageRouter and vPool
        :param storagerouter_guid: Guid of the StorageRouter on which the ALBA proxy is configured
        :type storagerouter_guid: str
        :param vpool_guid: Guid of the vPool for which the proxy is configured
        :type vpool_guid: str
        :return: The ALBA proxy configuration
        :rtype: dict
        """
        vpool = VPool(vpool_guid)
        storagerouter = StorageRouter(storagerouter_guid)
        for sd in vpool.storagedrivers:
            if sd.storagerouter_guid == storagerouter.guid:
                if len(sd.alba_proxies) == 0:
                    raise ValueError(
                        'No ALBA proxies configured for vPool {0} on StorageRouter {1}'
                        .format(vpool.name, storagerouter.name))
                return Configuration.get(
                    '/ovs/vpools/{0}/proxies/{1}/config/main'.format(
                        vpool.guid, sd.alba_proxies[0].guid))
        raise ValueError(
            'vPool {0} has not been extended to StorageRouter {1}'.format(
                vpool.name, storagerouter.name))

    @staticmethod
    @ovs_task(name='ovs.storagerouter.configure_support')
    def configure_support(support_info):
        """
        Configures support on all StorageRouters
        :param support_info: Information about which components should be configured
            {'stats_monkey': True,  # Enable/disable the stats monkey scheduled task
             'support_agent': True,  # Responsible for enabling the ovs-support-agent service, which collects heart beat data
             'remote_access': False,  # Cannot be True when support agent is False. Is responsible for opening an OpenVPN tunnel to allow for remote access
             'stats_monkey_config': {}}  # Dict with information on how to configure the stats monkey (only required when enabling the stats monkey)
        :type support_info: dict
        :return: None
        :rtype: NoneType
        """
        ExtensionsToolbox.verify_required_params(actual_params=support_info,
                                                 required_params={
                                                     'stats_monkey':
                                                     (bool, None, False),
                                                     'remote_access':
                                                     (bool, None, False),
                                                     'support_agent':
                                                     (bool, None, False),
                                                     'stats_monkey_config':
                                                     (dict, None, False)
                                                 })
        # All settings are optional, so if nothing is specified, no need to change anything
        if len(support_info) == 0:
            StorageRouterController._logger.warning(
                'Configure support called without any specific settings. Doing nothing'
            )
            return

        # Collect information
        support_agent_key = '/ovs/framework/support|support_agent'
        support_agent_new = support_info.get('support_agent')
        support_agent_old = Configuration.get(key=support_agent_key)
        support_agent_change = support_agent_new is not None and support_agent_old != support_agent_new

        remote_access_key = '/ovs/framework/support|remote_access'
        remote_access_new = support_info.get('remote_access')
        remote_access_old = Configuration.get(key=remote_access_key)
        remote_access_change = remote_access_new is not None and remote_access_old != remote_access_new

        stats_monkey_celery_key = '/ovs/framework/scheduling/celery'
        stats_monkey_config_key = '/ovs/framework/monitoring/stats_monkey'
        stats_monkey_new_config = support_info.get('stats_monkey_config')
        stats_monkey_old_config = Configuration.get(
            key=stats_monkey_config_key, default={})
        stats_monkey_celery_config = Configuration.get(
            key=stats_monkey_celery_key, default={})
        stats_monkey_new = support_info.get('stats_monkey')
        stats_monkey_old = stats_monkey_celery_config.get(
            'ovs.stats_monkey.run_all'
        ) is not None or stats_monkey_celery_config.get(
            'alba.stats_monkey.run_all') is not None
        stats_monkey_change = stats_monkey_new is not None and (
            stats_monkey_old != stats_monkey_new
            or stats_monkey_new_config != stats_monkey_old_config)

        # Make sure support agent is enabled when trying to enable remote access
        if remote_access_new is True:
            if support_agent_new is False or (support_agent_new is None
                                              and support_agent_old is False):
                raise RuntimeError(
                    'Remote access cannot be enabled without the heart beat enabled'
                )

        # Collect root_client information
        root_clients = {}
        for storagerouter in StorageRouterList.get_storagerouters():
            try:
                root_clients[storagerouter] = SSHClient(endpoint=storagerouter,
                                                        username='******')
            except UnableToConnectException:
                raise RuntimeError('Not all StorageRouters are reachable')

        if stats_monkey_new is True:
            ExtensionsToolbox.verify_required_params(
                actual_params=stats_monkey_new_config,
                required_params={
                    'host': (str, ExtensionsToolbox.regex_ip),
                    'port': (int, {
                        'min': 1,
                        'max': 65535
                    }),
                    'database': (str, None),
                    'interval': (int, {
                        'min': 1,
                        'max': 86400
                    }),
                    'transport': (str, ['influxdb', 'redis', 'graphite']),
                    'environment': (str, None)
                })
            if stats_monkey_new_config['transport'] in ['influxdb', 'redis']:
                ExtensionsToolbox.verify_required_params(
                    actual_params=stats_monkey_new_config,
                    required_params={'password': (str, None)})

            if stats_monkey_new_config['transport'] == 'influxdb':
                ExtensionsToolbox.verify_required_params(
                    actual_params=stats_monkey_new_config,
                    required_params={'username': (str, None)})
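        # A minimal sketch of a stats_monkey_config that would pass the validations above;
        # every value is a hypothetical example, not taken from a real deployment:
        #
        #     {'host': '10.100.0.15',
        #      'port': 8086,
        #      'database': 'openvstorage',
        #      'interval': 300,
        #      'transport': 'influxdb',
        #      'environment': 'lab',
        #      'username': 'stats',
        #      'password': 'secret'}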

        # Configure remote access
        if remote_access_change is True:
            Configuration.set(key=remote_access_key, value=remote_access_new)
            cid = Configuration.get('/ovs/framework/cluster_id').replace(
                r"'", r"'\''")
            for storagerouter, root_client in root_clients.iteritems():
                if remote_access_new is False:
                    StorageRouterController._logger.info(
                        'Un-configuring remote access on StorageRouter {0}'.
                        format(root_client.ip))
                    nid = storagerouter.machine_id.replace(r"'", r"'\''")
                    service_name = 'openvpn@ovs_{0}-{1}'.format(cid, nid)
                    if StorageRouterController._service_manager.has_service(
                            name=service_name, client=root_client):
                        StorageRouterController._service_manager.stop_service(
                            name=service_name, client=root_client)
                    root_client.file_delete(filenames=['/etc/openvpn/ovs_*'])

        # Configure support agent
        if support_agent_change is True:
            service_name = 'support-agent'
            Configuration.set(key=support_agent_key, value=support_agent_new)
            for root_client in root_clients.itervalues():
                if support_agent_new is True:
                    StorageRouterController._logger.info(
                        'Configuring support agent on StorageRouter {0}'.
                        format(root_client.ip))
                    if StorageRouterController._service_manager.has_service(
                            name=service_name, client=root_client) is False:
                        StorageRouterController._service_manager.add_service(
                            name=service_name, client=root_client)
                    StorageRouterController._service_manager.restart_service(
                        name=service_name, client=root_client)
                else:
                    StorageRouterController._logger.info(
                        'Un-configuring support agent on StorageRouter {0}'.
                        format(root_client.ip))
                    if StorageRouterController._service_manager.has_service(
                            name=service_name, client=root_client):
                        StorageRouterController._service_manager.stop_service(
                            name=service_name, client=root_client)
                        StorageRouterController._service_manager.remove_service(
                            name=service_name, client=root_client)

        # Configure stats monkey
        if stats_monkey_change is True:
            # 2 keys matter here:
            #    - /ovs/framework/scheduling/celery --> used to check whether the stats monkey is disabled or not
            #    - /ovs/framework/monitoring/stats_monkey --> contains the actual configuration parameters when enabling the stats monkey, such as host, port, username, ...
            service_name = 'scheduled-tasks'
            if stats_monkey_new is True:  # Enable the scheduled task by removing the key
                StorageRouterController._logger.info(
                    'Configuring stats monkey')
                interval = stats_monkey_new_config['interval']
                # The scheduled task cannot be configured to run more than once a minute, so for intervals < 60, the stats monkey task handles this itself
                StorageRouterController._logger.debug(
                    'Requested interval to run at: {0}'.format(interval))
                Configuration.set(key=stats_monkey_config_key,
                                  value=stats_monkey_new_config)
                if interval > 0:
                    days, hours, minutes, _ = ExtensionsToolbox.convert_to_days_hours_minutes_seconds(
                        seconds=interval)
                    if days == 1:  # Max interval is 24 * 60 * 60, so once every day at 3 AM
                        schedule = {'hour': '3'}
                    elif hours > 0:
                        schedule = {'hour': '*/{0}'.format(hours)}
                    else:
                        schedule = {'minute': '*/{0}'.format(minutes)}
                    stats_monkey_celery_config[
                        'ovs.stats_monkey.run_all'] = schedule
                    stats_monkey_celery_config[
                        'alba.stats_monkey.run_all'] = schedule
                    StorageRouterController._logger.debug(
                        'Configured schedule is: {0}'.format(schedule))
                else:
                    stats_monkey_celery_config.pop('ovs.stats_monkey.run_all',
                                                   None)
                    stats_monkey_celery_config.pop('alba.stats_monkey.run_all',
                                                   None)
            else:  # Disable the scheduled task by setting the values for the celery tasks to None
                StorageRouterController._logger.info(
                    'Un-configuring stats monkey')
                stats_monkey_celery_config['ovs.stats_monkey.run_all'] = None
                stats_monkey_celery_config['alba.stats_monkey.run_all'] = None

            Configuration.set(key=stats_monkey_celery_key,
                              value=stats_monkey_celery_config)
            for storagerouter in StorageRouterList.get_masters():
                root_client = root_clients[storagerouter]
                StorageRouterController._logger.debug(
                    'Restarting ovs-scheduled-tasks service on node with IP {0}'
                    .format(root_client.ip))
                StorageRouterController._service_manager.restart_service(
                    name=service_name, client=root_client)
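    # Illustration of the interval-to-schedule mapping applied when enabling the stats monkey
    # above (interval values are hypothetical examples):
    #
    #     interval = 86400  ->  days == 1     ->  {'hour': '3'}       (once a day, at 3 AM)
    #     interval = 7200   ->  hours == 2    ->  {'hour': '*/2'}     (every 2 hours)
    #     interval = 900    ->  minutes == 15 ->  {'minute': '*/15'}  (every 15 minutes)
    #
    # Intervals below one minute cannot be expressed in the celery schedule; as noted above,
    # the stats monkey task handles those itself.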

    @staticmethod
    @ovs_task(name='ovs.storagerouter.mountpoint_exists')
    def mountpoint_exists(name, storagerouter_guid):
        """
        Checks whether a given mount point for a vPool exists
        :param name: Name of the mount point to check
        :type name: str
        :param storagerouter_guid: Guid of the StorageRouter on which to check for mount point existence
        :type storagerouter_guid: str
        :return: True if the mount point already exists else False
        :rtype: bool
        """
        client = SSHClient(StorageRouter(storagerouter_guid))
        return client.dir_exists(directory='/mnt/{0}'.format(name))

    @staticmethod
    @ovs_task(name='ovs.storagerouter.refresh_hardware')
    def refresh_hardware(storagerouter_guid):
        """
        Refreshes all hardware related information
        :param storagerouter_guid: Guid of the StorageRouter to refresh the hardware on
        :type storagerouter_guid: str
        :return: None
        :rtype: NoneType
        """
        StorageRouterController.set_rdma_capability(storagerouter_guid)
        DiskController.sync_with_reality(storagerouter_guid)

    @staticmethod
    def set_rdma_capability(storagerouter_guid):
        """
        Check if the StorageRouter has been reconfigured to be able to support RDMA
        :param storagerouter_guid: Guid of the StorageRouter to check and set
        :type storagerouter_guid: str
        :return: None
        :rtype: NoneType
        """
        storagerouter = StorageRouter(storagerouter_guid)
        client = SSHClient(storagerouter, username='******')
        rdma_capable = False
        with remote(client.ip, [os], username='******') as rem:
            for root, dirs, files in rem.os.walk('/sys/class/infiniband'):
                for directory in dirs:
                    ports_dir = '/'.join([root, directory, 'ports'])
                    if not rem.os.path.exists(ports_dir):
                        continue
                    for sub_root, sub_dirs, _ in rem.os.walk(ports_dir):
                        if sub_root != ports_dir:
                            continue
                        for sub_directory in sub_dirs:
                            state_file = '/'.join(
                                [sub_root, sub_directory, 'state'])
                            if rem.os.path.exists(state_file):
                                if 'ACTIVE' in client.run(['cat', state_file]):
                                    rdma_capable = True
        storagerouter.rdma_capable = rdma_capable
        storagerouter.save()
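    # On an InfiniBand-capable node the walk above ends up reading files such as
    # /sys/class/infiniband/mlx4_0/ports/1/state (device name and port number are just an
    # example), whose content reads '4: ACTIVE' when the port is up, which is what flips
    # rdma_capable to True.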

    @staticmethod
    @ovs_task(name='ovs.storagerouter.configure_disk',
              ensure_single_info={
                  'mode': 'CHAINED',
                  'global_timeout': 1800
              })
    def configure_disk(storagerouter_guid, disk_guid, partition_guid, offset,
                       size, roles):
        """
        Configures a partition
        :param storagerouter_guid: Guid of the StorageRouter to configure a disk on
        :type storagerouter_guid: str
        :param disk_guid: Guid of the disk to configure
        :type disk_guid: str
        :param partition_guid: Guid of the partition on the disk
        :type partition_guid: str
        :param offset: Offset for the partition
        :type offset: int
        :param size: Size of the partition
        :type size: int
        :param roles: Roles assigned to the partition
        :type roles: list
        :return: None
        :rtype: NoneType
        """
        # Validations
        storagerouter = StorageRouter(storagerouter_guid)
        for role in roles:
            if role not in DiskPartition.ROLES or role == DiskPartition.ROLES.BACKEND:
                raise RuntimeError('Invalid role specified: {0}'.format(role))
        disk = Disk(disk_guid)
        if disk.storagerouter_guid != storagerouter_guid:
            raise RuntimeError(
                'The given Disk is not on the given StorageRouter')
        for partition in disk.partitions:
            if DiskPartition.ROLES.BACKEND in partition.roles:
                raise RuntimeError('The given Disk is in use by a Backend')

        if len({DiskPartition.ROLES.DB, DiskPartition.ROLES.DTL}.intersection(
                set(roles))) > 0:
            roles_on_sr = StorageRouterController._get_roles_on_storagerouter(
                storagerouter.ip)
            for role in [DiskPartition.ROLES.DB, DiskPartition.ROLES.DTL]:
                if role in roles_on_sr and role in roles and roles_on_sr[role][
                        0] != disk.name:  # DB and DTL roles may only be assigned to a single disk per StorageRouter
                    raise RoleDuplicationException(
                        'Disk {0} cannot have the {1} role due to presence on disk {2}'
                        .format(disk.name, role, roles_on_sr[role][0]))

        # Create partition
        if partition_guid is None:
            StorageRouterController._logger.debug(
                'Creating new partition - Offset: {0} bytes - Size: {1} bytes - Roles: {2}'
                .format(offset, size, roles))
            with remote(storagerouter.ip, [DiskTools], username='******') as rem:
                if len(disk.aliases) == 0:
                    raise ValueError(
                        'Disk {0} does not have any aliases'.format(disk.name))
                rem.DiskTools.create_partition(disk_alias=disk.aliases[0],
                                               disk_size=disk.size,
                                               partition_start=offset,
                                               partition_size=size)
            DiskController.sync_with_reality(storagerouter_guid)
            disk = Disk(disk_guid)
            end_point = offset + size
            partition = None
            for part in disk.partitions:
                if offset < part.offset + part.size and end_point > part.offset:
                    partition = part
                    break

            if partition is None:
                raise RuntimeError(
                    'No new partition detected on disk {0} after creating one'
                    .format(disk.name))
            StorageRouterController._logger.debug('Partition created')
        else:
            StorageRouterController._logger.debug('Using existing partition')
            partition = DiskPartition(partition_guid)
            if partition.disk_guid != disk_guid:
                raise RuntimeError(
                    'The given DiskPartition is not on the given Disk')
            if partition.filesystem in [
                    'swap', 'linux_raid_member', 'LVM2_member'
            ]:
                raise RuntimeError(
                    "It is not allowed to assign roles on partitions of type: ['swap', 'linux_raid_member', 'LVM2_member']"
                )
            metadata = StorageRouterController.get_metadata(storagerouter_guid)
            partition_info = metadata['partitions']
            removed_roles = set(partition.roles) - set(roles)
            used_roles = []
            for role in removed_roles:
                for info in partition_info[role]:
                    if info['in_use'] and info['guid'] == partition.guid:
                        used_roles.append(role)
            if len(used_roles) > 0:
                raise RuntimeError(
                    'Roles in use cannot be removed. Used roles: {0}'.format(
                        ', '.join(used_roles)))

        # Add filesystem
        if partition.filesystem is None or partition_guid is None:
            StorageRouterController._logger.debug('Creating filesystem')
            if len(partition.aliases) == 0:
                raise ValueError(
                    'Partition with offset {0} does not have any aliases'.
                    format(partition.offset))
            with remote(storagerouter.ip, [DiskTools], username='******') as rem:
                rem.DiskTools.make_fs(partition_alias=partition.aliases[0])
            DiskController.sync_with_reality(storagerouter_guid)
            partition = DiskPartition(partition.guid)
            if partition.filesystem not in ['ext4', 'xfs']:
                raise RuntimeError('Unexpected filesystem')
            StorageRouterController._logger.debug('Filesystem created')

        # Mount the partition and add to FSTab
        if partition.mountpoint is None:
            StorageRouterController._logger.debug('Configuring mount point')
            with remote(storagerouter.ip, [DiskTools], username='******') as rem:
                counter = 1
                while True:
                    mountpoint = '/mnt/{0}{1}'.format(
                        'ssd' if disk.is_ssd else 'hdd', counter)
                    if not rem.DiskTools.mountpoint_exists(mountpoint):
                        break
                    counter += 1
                StorageRouterController._logger.debug(
                    'Found mount point: {0}'.format(mountpoint))
                rem.DiskTools.add_fstab(partition_aliases=partition.aliases,
                                        mountpoint=mountpoint,
                                        filesystem=partition.filesystem)
                rem.DiskTools.mount(mountpoint)
            DiskController.sync_with_reality(storagerouter_guid)
            partition = DiskPartition(partition.guid)
            if partition.mountpoint != mountpoint:
                raise RuntimeError('Unexpected mount point')
            StorageRouterController._logger.debug('Mount point configured')
        partition.roles = roles
        partition.save()
        StorageRouterController._logger.debug('Partition configured')
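
A minimal usage sketch for configure_disk, assuming hypothetical guids: partition_guid=None asks the task to create a new partition, size/offset are illustrative, and DiskPartition.ROLES.DB is just one possible role (the ovs_task decorator presumably also allows asynchronous dispatch through Celery's .delay()).

from ovs.dal.hybrids.diskpartition import DiskPartition

StorageRouterController.configure_disk(storagerouter_guid='<storagerouter_guid>',  # placeholder guid
                                       disk_guid='<disk_guid>',                    # placeholder guid
                                       partition_guid=None,                        # None -> create a new partition
                                       offset=0,
                                       size=100 * 1024 ** 3,                       # e.g. 100 GiB
                                       roles=[DiskPartition.ROLES.DB])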

    @staticmethod
    def check_scrub_partition_present():
        """
        Checks whether at least 1 scrub partition is present on any StorageRouter
        :return: True if at least 1 SCRUB role present in the cluster else False
        :rtype: bool
        """
        for storage_router in StorageRouterList.get_storagerouters():
            for disk in storage_router.disks:
                for partition in disk.partitions:
                    if DiskPartition.ROLES.SCRUB in partition.roles:
                        return True
        return False
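
A short sketch of how this check can be used as a precondition; the surrounding flow is hypothetical.

if not StorageRouterController.check_scrub_partition_present():
    raise RuntimeError('At least one partition with a SCRUB role is required in the cluster')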

    @staticmethod
    def get_mountpoints(client):
        """
        Retrieve the mount points
        :param client: SSHClient on which to retrieve the mount points
        :type client: ovs_extensions.generic.sshclient.SSHClient
        :return: List of mount points
        :rtype: list[str]
        """
        mountpoints = []
        for line in client.run(['mount', '-v']).strip().splitlines():
            parts = line.split(' ')
            mp = parts[2] if len(parts) > 2 else None
            if mp and mp != '/' and not mp.startswith(('/dev', '/proc', '/sys', '/run', '/mnt/alba-asd')):
                mountpoints.append(mp)
        return mountpoints
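
A minimal sketch, assuming an SSHClient to one of the StorageRouters; the IP and username are placeholders.

from ovs.extensions.generic.sshclient import SSHClient

client = SSHClient('<storagerouter_ip>', username='<username>')
mountpoints = StorageRouterController.get_mountpoints(client)  # e.g. ['/mnt/ssd1', '/mnt/hdd1']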

    @staticmethod
    def _retrieve_alba_arakoon_config(alba_backend_guid, ovs_client):
        """
        Retrieve the ALBA Arakoon configuration
        :param alba_backend_guid: Guid of the ALBA Backend
        :type alba_backend_guid: str
        :param ovs_client: OVS client object
        :type ovs_client: OVSClient
        :return: Arakoon configuration information
        :rtype: dict
        """
        task_id = ovs_client.get(
            '/alba/backends/{0}/get_config_metadata'.format(alba_backend_guid))
        successful, arakoon_config = ovs_client.wait_for_task(task_id,
                                                              timeout=300)
        if successful is False:
            raise RuntimeError(
                'Could not load metadata from environment {0}'.format(
                    ovs_client.ip))
        return arakoon_config
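
A hedged sketch of calling this helper, assuming 'ovs_client' is an already authenticated OVSClient for the remote environment and the guid is a placeholder.

arakoon_config = StorageRouterController._retrieve_alba_arakoon_config(
    alba_backend_guid='<alba_backend_guid>',
    ovs_client=ovs_client)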

    @staticmethod
    def _revert_vpool_status(vpool,
                             status=VPool.STATUSES.RUNNING,
                             storagedriver=None,
                             client=None,
                             dirs_created=None):
        """
        Remove the vPool being created or revert the vPool being extended
        :return: None
        :rtype: NoneType
        """
        vpool.status = status
        vpool.save()

        if status == VPool.STATUSES.RUNNING:
            if client is not None and dirs_created:
                try:
                    client.dir_delete(directories=dirs_created)
                except Exception:
                    StorageRouterController._logger.warning(
                        'Failed to clean up following directories: {0}'.format(
                            ', '.join(dirs_created)))

            if storagedriver is not None:
                for sdp in storagedriver.partitions:
                    sdp.delete()
                for proxy in storagedriver.alba_proxies:
                    proxy.delete()
                storagedriver.delete()
            if len(vpool.storagedrivers) == 0:
                vpool.delete()
                if Configuration.dir_exists(
                        key='/ovs/vpools/{0}'.format(vpool.guid)):
                    Configuration.delete(
                        key='/ovs/vpools/{0}'.format(vpool.guid))
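
A hedged sketch of the error-handling pattern this helper supports, assuming vpool, storagedriver, client and dirs_created were prepared earlier in a (not shown) create/extend flow.

try:
    # ... create or extend the vPool (not shown) ...
    pass
except Exception:
    StorageRouterController._revert_vpool_status(vpool=vpool,
                                                 status=VPool.STATUSES.RUNNING,
                                                 storagedriver=storagedriver,
                                                 client=client,
                                                 dirs_created=dirs_created)
    raise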

    @staticmethod
    def _get_roles_on_storagerouter(ip):
        """
        returns a set with the roles present on the storagerouter
        :param ip: string with ip of the storagerouter
        :return: Dict
        """
        sr = StorageRouterList.get_by_ip(ip)
        roles_on_sr = {}
        for sr_disk in sr.disks:
            for partition in sr_disk.partitions:
                for part_role in partition.roles:
                    if part_role not in roles_on_sr:
                        roles_on_sr[part_role] = [sr_disk.name]
                    else:
                        roles_on_sr[part_role].append(sr_disk.name)
        return roles_on_sr
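
A small sketch: inspect which disks already carry the DB role on a StorageRouter; the IP is a placeholder and DiskPartition.ROLES.DB is just one possible role.

roles_on_sr = StorageRouterController._get_roles_on_storagerouter('<storagerouter_ip>')
db_disks = roles_on_sr.get(DiskPartition.ROLES.DB, [])  # list of disk names, empty if the role is absent
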
Ejemplo n.º 33
0
    def migrate():
        """
        Executes async migrations. It does not matter much when they are executed, as long as they eventually
        run. This code will typically contain:
        * "dangerous" migration code (it requires certain services to be running)
        * Migration code depending on a cluster-wide state
        * ...
        """
        MigrationController._logger.info('Preparing out of band migrations...')

        from ovs.dal.lists.storagedriverlist import StorageDriverList
        from ovs.dal.lists.storagerouterlist import StorageRouterList
        from ovs.dal.lists.vpoollist import VPoolList
        from ovs.extensions.generic.configuration import Configuration
        from ovs.extensions.generic.sshclient import SSHClient
        from ovs_extensions.generic.toolbox import ExtensionsToolbox
        from ovs_extensions.services.interfaces.systemd import Systemd
        from ovs.extensions.services.servicefactory import ServiceFactory
        from ovs.extensions.storageserver.storagedriver import StorageDriverConfiguration
        from ovs.lib.generic import GenericController

        MigrationController._logger.info('Start out of band migrations...')
        service_manager = ServiceFactory.get_manager()

        sr_client_map = {}
        for storagerouter in StorageRouterList.get_storagerouters():
            sr_client_map[storagerouter.guid] = SSHClient(endpoint=storagerouter,
                                                          username='******')

        #########################################################
        # Addition of 'ExecReload' for AlbaProxy SystemD services
        if ServiceFactory.get_service_type() == 'systemd':
            changed_clients = set()
            for storagedriver in StorageDriverList.get_storagedrivers():
                root_client = sr_client_map[storagedriver.storagerouter_guid]
                for alba_proxy in storagedriver.alba_proxies:
                    service = alba_proxy.service
                    service_name = 'ovs-{0}'.format(service.name)
                    if not service_manager.has_service(name=service_name, client=root_client):
                        continue
                    if 'ExecReload=' in root_client.file_read(filename='/lib/systemd/system/{0}.service'.format(service_name)):
                        continue

                    try:
                        service_manager.regenerate_service(name='ovs-albaproxy', client=root_client, target_name=service_name)
                        changed_clients.add(root_client)
                    except Exception:
                        MigrationController._logger.exception('Error rebuilding service {0}'.format(service_name))
            for root_client in changed_clients:
                root_client.run(['systemctl', 'daemon-reload'])

        ##################################################################
        # Adjustment of open file descriptors for Arakoon services to 8192
        changed_clients = set()
        for storagerouter in StorageRouterList.get_storagerouters():
            root_client = sr_client_map[storagerouter.guid]
            for service_name in service_manager.list_services(client=root_client):
                if not service_name.startswith('ovs-arakoon-'):
                    continue

                if ServiceFactory.get_service_type() == 'systemd':
                    path = '/lib/systemd/system/{0}.service'.format(service_name)
                    check = 'LimitNOFILE=8192'
                else:
                    path = '/etc/init/{0}.conf'.format(service_name)
                    check = 'limit nofile 8192 8192'

                if not root_client.file_exists(path):
                    continue
                if check in root_client.file_read(path):
                    continue

                try:
                    service_manager.regenerate_service(name='ovs-arakoon', client=root_client, target_name=service_name)
                    changed_clients.add(root_client)
                    ExtensionsToolbox.edit_version_file(client=root_client, package_name='arakoon', old_service_name=service_name)
                except Exception:
                    MigrationController._logger.exception('Error rebuilding service {0}'.format(service_name))
        for root_client in changed_clients:
            root_client.run(['systemctl', 'daemon-reload'])

        #############################
        # Migrate to multiple proxies
        for storagedriver in StorageDriverList.get_storagedrivers():
            vpool = storagedriver.vpool
            root_client = sr_client_map[storagedriver.storagerouter_guid]
            for alba_proxy in storagedriver.alba_proxies:
                # Rename alba_proxy service in model
                service = alba_proxy.service
                old_service_name = 'albaproxy_{0}'.format(vpool.name)
                new_service_name = 'albaproxy_{0}_0'.format(vpool.name)
                if old_service_name != service.name:
                    continue
                service.name = new_service_name
                service.save()

                if not service_manager.has_service(name=old_service_name, client=root_client):
                    continue
                old_configuration_key = '/ovs/framework/hosts/{0}/services/{1}'.format(storagedriver.storagerouter.machine_id, old_service_name)
                if not Configuration.exists(key=old_configuration_key):
                    continue

                # Add '-reboot' to alba_proxy services (because of newly created services and removal of old service)
                ExtensionsToolbox.edit_version_file(client=root_client,
                                                    package_name='alba',
                                                    old_service_name=old_service_name,
                                                    new_service_name=new_service_name)

                # Register new service and remove old service
                service_manager.add_service(name='ovs-albaproxy',
                                            client=root_client,
                                            params=Configuration.get(old_configuration_key),
                                            target_name='ovs-{0}'.format(new_service_name))

                # Update scrub proxy config
                proxy_config_key = '/ovs/vpools/{0}/proxies/{1}/config/main'.format(vpool.guid, alba_proxy.guid)
                proxy_config = None if Configuration.exists(key=proxy_config_key) is False else Configuration.get(proxy_config_key)
                if proxy_config is not None:
                    fragment_cache = proxy_config.get('fragment_cache', ['none', {}])
                    if fragment_cache[0] == 'alba' and fragment_cache[1].get('cache_on_write') is True:  # Accelerated ALBA configured
                        fragment_cache_scrub_info = copy.deepcopy(fragment_cache)
                        fragment_cache_scrub_info[1]['cache_on_read'] = False
                        proxy_scrub_config_key = '/ovs/vpools/{0}/proxies/scrub/generic_scrub'.format(vpool.guid)
                        proxy_scrub_config = None if Configuration.exists(key=proxy_scrub_config_key) is False else Configuration.get(proxy_scrub_config_key)
                        if proxy_scrub_config is not None and proxy_scrub_config['fragment_cache'] == ['none']:
                            proxy_scrub_config['fragment_cache'] = fragment_cache_scrub_info
                            Configuration.set(proxy_scrub_config_key,
                                              json.dumps(proxy_scrub_config, indent=4),
                                              raw=True)

            # Update 'backend_connection_manager' section
            changes = False
            storagedriver_config = StorageDriverConfiguration('storagedriver', vpool.guid, storagedriver.storagedriver_id)
            storagedriver_config.load()
            if 'backend_connection_manager' not in storagedriver_config.configuration:
                continue

            current_config = storagedriver_config.configuration['backend_connection_manager']
            if current_config.get('backend_type') != 'MULTI':
                changes = True
                backend_connection_manager = {'backend_type': 'MULTI'}
                for index, proxy in enumerate(sorted(storagedriver.alba_proxies, key=lambda pr: pr.service.ports[0])):
                    backend_connection_manager[str(index)] = copy.deepcopy(current_config)
                    # noinspection PyUnresolvedReferences
                    backend_connection_manager[str(index)]['alba_connection_use_rora'] = True
                    # noinspection PyUnresolvedReferences
                    backend_connection_manager[str(index)]['alba_connection_rora_manifest_cache_capacity'] = 5000
                    # noinspection PyUnresolvedReferences
                    for key, value in backend_connection_manager[str(index)].items():
                        if key.startswith('backend_interface'):
                            backend_connection_manager[key] = value
                            # noinspection PyUnresolvedReferences
                            del backend_connection_manager[str(index)][key]
                for key, value in {'backend_interface_retries_on_error': 5,
                                   'backend_interface_retry_interval_secs': 1,
                                   'backend_interface_retry_backoff_multiplier': 2.0}.iteritems():
                    if key not in backend_connection_manager:
                        backend_connection_manager[key] = value
            else:
                backend_connection_manager = current_config
                for value in backend_connection_manager.values():
                    if isinstance(value, dict):
                        for key, val in value.items():
                            if key.startswith('backend_interface'):
                                backend_connection_manager[key] = val
                                changes = True
                                del value[key]
                for key, value in {'backend_interface_retries_on_error': 5,
                                   'backend_interface_retry_interval_secs': 1,
                                   'backend_interface_retry_backoff_multiplier': 2.0}.iteritems():
                    if key not in backend_connection_manager:
                        changes = True
                        backend_connection_manager[key] = value

            if changes is True:
                storagedriver_config.clear_backend_connection_manager()
                storagedriver_config.configure_backend_connection_manager(**backend_connection_manager)
                storagedriver_config.save(root_client)

                # Add '-reboot' to volumedriver services (because of updated 'backend_connection_manager' section)
                ExtensionsToolbox.edit_version_file(client=root_client,
                                                    package_name='volumedriver',
                                                    old_service_name='volumedriver_{0}'.format(vpool.name))
                if service_manager.ImplementationClass == Systemd:
                    root_client.run(['systemctl', 'daemon-reload'])

        ########################################
        # Update metadata_store_bits information
        for vpool in VPoolList.get_vpools():
            bits = None
            for storagedriver in vpool.storagedrivers:
                key = '/ovs/framework/hosts/{0}/services/volumedriver_{1}'.format(storagedriver.storagerouter.machine_id, vpool.name)
                if Configuration.exists(key=key) and 'METADATASTORE_BITS' not in Configuration.get(key=key):
                    if bits is None:
                        entries = service_manager.extract_from_service_file(name='ovs-volumedriver_{0}'.format(vpool.name),
                                                                            client=sr_client_map[storagedriver.storagerouter_guid],
                                                                            entries=['METADATASTORE_BITS='])
                        if len(entries) == 1:
                            bits = entries[0].split('=')[-1]
                            bits = int(bits) if bits.isdigit() else 5
                    if bits is not None:
                        try:
                            content = Configuration.get(key=key)
                            content['METADATASTORE_BITS'] = bits
                            Configuration.set(key=key, value=content)
                        except Exception:
                            MigrationController._logger.exception('Error updating volumedriver info for vPool {0} on StorageRouter {1}'.format(vpool.name, storagedriver.storagerouter.name))

            if bits is not None:
                vpool.metadata_store_bits = bits
                vpool.save()

        MigrationController._logger.info('Finished out of band migrations')
        GenericController.refresh_package_information()
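
A hedged verification sketch for the proxy migration above: after it has run, every StorageDriver configuration that carries a 'backend_connection_manager' section should expose backend_type 'MULTI'; the assertion below is illustrative only.

for storagedriver in StorageDriverList.get_storagedrivers():
    storagedriver_config = StorageDriverConfiguration('storagedriver', storagedriver.vpool.guid, storagedriver.storagedriver_id)
    storagedriver_config.load()
    current_config = storagedriver_config.configuration.get('backend_connection_manager')
    if current_config is not None:
        assert current_config.get('backend_type') == 'MULTI'
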
Ejemplo n.º 34
0
    def validate_cluster(cluster_name='ovsdb'):
        """
        Validate if the chosen cluster is
         * deployed on all required nodes
         * running on all required nodes
         * working correctly on all required nodes

        :param cluster_name: name of an existing arakoon cluster (DEFAULT=ovsdb)
        :type cluster_name: str
        :return: None
        """
        ArakoonValidation.LOGGER.info("Starting validating arakoon cluster")
        master_storagerouters = [
            storagerouter.ip
            for storagerouter in StorageRouterList.get_masters()
        ]
        assert len(master_storagerouters) >= 2, \
            'Environment has only `{0}` node(s)'.format(len(master_storagerouters))

        master_storagerouters.sort()
        arakoon_service_name = "ovs-arakoon-{0}".format(cluster_name)
        service_manager = ServiceFactory.get_manager()
        for storagerouter_ip in master_storagerouters:
            client = SSHClient(storagerouter_ip, username='******')
            # check if service file is available
            ArakoonValidation.LOGGER.info(
                "Validating if cluster service `{0}` is available on node `{1}`"
                .format(cluster_name, storagerouter_ip))
            assert service_manager.has_service(arakoon_service_name, client), "Service file of `{0}` does not exist on storagerouter `{1}`"\
                .format(cluster_name, storagerouter_ip)
            # check if service is running on system
            ArakoonValidation.LOGGER.info(
                "Validating if cluster service `{0}` is running on node `{1}`".
                format(cluster_name, storagerouter_ip))
            assert service_manager.get_service_status(arakoon_service_name, client) == 'active', \
                "Service of `{0}` is not running on storagerouter `{1}`".format(cluster_name, storagerouter_ip)

        # perform nop, get and set on cluster
        key = 'integration-tests-{0}'.format(str(uuid.uuid4()))
        value = str(time.time())
        ArakoonValidation.LOGGER.info(
            "Validating if cluster `{0}` works".format(cluster_name))
        # determine if there is a healthy cluster
        configuration = Configuration.get(
            '/ovs/arakoon/{0}/config'.format(cluster_name), raw=True)
        client = PyrakoonStore(cluster_name, configuration)
        client.nop()
        # perform set, get & compare
        client.set(key, value)
        get_value = client.get(key)
        assert get_value == value, "Value mismatch on cluster `{0}`, get value `{1}`, " \
                                   "expected value `{2}` on key `{3}`".format(cluster_name, get_value, value, key)

        # perform delete
        client.delete(key)
        try:
            assert not client.get(key), "Key `{0}` still exists on cluster `{1}` after deleting it"\
                .format(key, cluster_name)
        except KeyNotFoundException:
            # key not found so test has passed
            assert True

        ArakoonValidation.LOGGER.info("Finished validating arakoon cluster")