Esempio n. 1
0
    def add_services(client, node_type, logger):
        """
        Add the services required by the OVS cluster
        :param client: Client on which to add the services
        :type client: ovs.extensions.generic.sshclient.SSHClient
        :param node_type: Type of node ('master' or 'extra')
        :type node_type: str
        :param logger: Logger object used for logging
        :type logger: ovs.log.log_handler.LogHandler
        :return: None
        """
        Toolbox.log(logger=logger, messages='Adding services')
        services = {}
        worker_queue = System.get_my_machine_id(client=client)
        if node_type == 'master':
            worker_queue += ',ovs_masters'
            services.update({'memcached': {'MEMCACHE_NODE_IP': client.ip, 'WORKER_QUEUE': worker_queue},
                             'rabbitmq-server': {'MEMCACHE_NODE_IP': client.ip, 'WORKER_QUEUE': worker_queue},
                             'scheduled-tasks': {},
                             'webapp-api': {},
                             'volumerouter-consumer': {}})
        services.update({'workers': {'WORKER_QUEUE': worker_queue},
                         'watcher-framework': {}})

        for service_name, params in services.iteritems():
            if not ServiceManager.has_service(service_name, client):
                Toolbox.log(logger=logger, messages='Adding service {0}'.format(service_name))
                ServiceManager.add_service(name=service_name, params=params, client=client)
Esempio n. 2
0
    def _configure_amqp_to_volumedriver():
        Toolbox.log(logger=NodeTypeController._logger,
                    messages='Update existing vPools')
        login = Configuration.get('/ovs/framework/messagequeue|user')
        password = Configuration.get('/ovs/framework/messagequeue|password')
        protocol = Configuration.get('/ovs/framework/messagequeue|protocol')

        uris = []
        for endpoint in Configuration.get(
                '/ovs/framework/messagequeue|endpoints'):
            uris.append({
                'amqp_uri':
                '{0}://{1}:{2}@{3}'.format(protocol, login, password, endpoint)
            })

        if Configuration.dir_exists('/ovs/vpools'):
            for vpool_guid in Configuration.list('/ovs/vpools'):
                for storagedriver_id in Configuration.list(
                        '/ovs/vpools/{0}/hosts'.format(vpool_guid)):
                    storagedriver_config = StorageDriverConfiguration(
                        vpool_guid, storagedriver_id)
                    storagedriver_config.configure_event_publisher(
                        events_amqp_routing_key=Configuration.get(
                            '/ovs/framework/messagequeue|queues.storagedriver'
                        ),
                        events_amqp_uris=uris)
                    storagedriver_config.save()
Esempio n. 3
0
    def check_rabbitmq_and_enable_ha_mode(client, logger):
        """
        Verify RabbitMQ is running properly and enable HA mode
        :param client: Client on which to check RabbitMQ
        :type client: ovs_extensions.generic.sshclient.SSHClient
        :param logger: Logger object used for logging
        :type logger: ovs.log.log_handler.LogHandler
        :return: None
        """
        service_manager = ServiceFactory.get_manager()
        if not service_manager.has_service('rabbitmq-server', client):
            raise RuntimeError(
                'Service rabbitmq-server has not been added on node {0}'.
                format(client.ip))
        rabbitmq_running, same_process = service_manager.is_rabbitmq_running(
            client=client)
        if rabbitmq_running is False or same_process is False:
            Toolbox.change_service_state(client, 'rabbitmq-server', 'restart',
                                         logger)

        time.sleep(5)
        client.run([
            'rabbitmqctl', 'set_policy', 'ha-all', '^(volumerouter|ovs_.*)$',
            '{"ha-mode":"all"}'
        ])
Esempio n. 4
0
    def configure_avahi(client, node_name, node_type, logger):
        """
        Configure Avahi
        :param client: Client on which to configure avahi
        :type client: ovs_extensions.generic.sshclient.SSHClient
        :param node_name: Name of the node to set in Avahi
        :type node_name: str
        :param node_type: Type of the node ('master' or 'extra')
        :type node_type: str
        :param logger: Logger object used for logging
        :type logger: ovs.extensions.generic.logger.Logger
        :return: None
        """
        valid_avahi = NodeTypeController.validate_avahi_cluster_name(
            ip=client.ip,
            cluster_name=Configuration.get('/ovs/framework/cluster_name'),
            node_name=node_name)
        if valid_avahi[0] is False:
            raise RuntimeError(valid_avahi[1])
        Toolbox.log(logger=logger, messages='Announcing service')
        client.file_write(
            NodeTypeController.avahi_filename,
            """<?xml version="1.0" standalone='no'?>
<!--*-nxml-*-->
<!DOCTYPE service-group SYSTEM "avahi-service.dtd">
<!-- $Id$ -->
<service-group>
    <name replace-wildcards="yes">{0}</name>
    <service>
        <type>_ovs_{1}_node._tcp</type>
        <port>443</port>
    </service>
</service-group>""".format(valid_avahi[1], node_type))
        ServiceFactory.change_service_state(client, 'avahi-daemon', 'restart',
                                            NodeTypeController._logger)
Esempio n. 5
0
    def configure_avahi(client, node_name, node_type, logger):
        """
        Configure Avahi
        :param client: Client on which to configure avahi
        :type client: ovs.extensions.generic.sshclient.SSHClient
        :param node_name: Name of the node to set in Avahi
        :type node_name: str
        :param node_type: Type of the node ('master' or 'extra')
        :type node_type: str
        :param logger: Logger object used for logging
        :type logger: ovs.log.log_handler.LogHandler
        :return: None
        """
        cluster_name = Configuration.get('/ovs/framework/cluster_name')
        Toolbox.log(logger=logger, messages='Announcing service')
        client.file_write(NodeTypeController.avahi_filename, """<?xml version="1.0" standalone='no'?>
<!--*-nxml-*-->
<!DOCTYPE service-group SYSTEM "avahi-service.dtd">
<!-- $Id$ -->
<service-group>
    <name replace-wildcards="yes">ovs_cluster_{0}_{1}_{3}</name>
    <service>
        <type>_ovs_{2}_node._tcp</type>
        <port>443</port>
    </service>
</service-group>""".format(cluster_name, node_name, node_type, client.ip.replace('.', '_')))
        Toolbox.change_service_state(client, 'avahi-daemon', 'restart', NodeTypeController._logger)
Esempio n. 6
0
 def apply(license_string):
     """
     Applies a license. It will apply as much licenses as possible, however, it won't fail on invalid licenses as it
     will simply skip them.
     """
     try:
         clients = {}
         storagerouters = StorageRouterList.get_storagerouters()
         try:
             for storagerouter in storagerouters:
                 clients[storagerouter] = SSHClient(storagerouter.ip)
         except UnableToConnectException:
             raise RuntimeError('Not all StorageRouters are reachable')
         data = LicenseController._decode(license_string)
         for component in data:
             cdata = data[component]
             name = cdata['name']
             data = cdata['data']
             token = cdata['token']
             valid_until = float(
                 cdata['valid_until']) if 'valid_until' in cdata else None
             if valid_until is not None and valid_until <= time.time():
                 continue
             signature = cdata['signature'] if 'signature' in cdata else None
             validate_functions = Toolbox.fetch_hooks(
                 'license', '{0}.validate'.format(component))
             apply_functions = Toolbox.fetch_hooks(
                 'license', '{0}.apply'.format(component))
             if len(validate_functions) == 1 and len(apply_functions) == 1:
                 valid, metadata = validate_functions[0](
                     component=component, data=data, signature=signature)
                 if valid is True:
                     success = apply_functions[0](component=component,
                                                  data=data,
                                                  signature=signature)
                     if success is True:
                         license_object = LicenseList.get_by_component(
                             component)
                         if license_object is None:
                             license_object = License()
                         license_object.component = component
                         license_object.name = name
                         license_object.token = token
                         license_object.data = data
                         license_object.valid_until = valid_until
                         license_object.signature = signature
                         license_object.save()
         license_contents = []
         for lic in LicenseList.get_licenses():
             license_contents.append(lic.hash)
         for storagerouter in storagerouters:
             client = clients[storagerouter]
             client.file_write('/opt/OpenvStorage/config/licenses',
                               '{0}\n'.format('\n'.join(license_contents)))
     except Exception, ex:
         logger.exception('Error applying license: {0}'.format(ex))
         return None
    def get_metadata(storagerouter):
        """
        Retrieve metadata for a Storage Router
        Example return value:
            {'ipaddresses': ['10.100.174.254', '172.22.1.100', '192.168.122.1'],
             'partitions': {'BACKEND': [{'available': 1000202043392,
                                         'guid': '9ec473ad-5c3f-4fdb-a4ef-c99bb4449025',
                                         'in_use': False,
                                         'mountpoint': u'/mnt/alba-asd/hiu8WiD7sCfVF2IKRa5U1VZLOBS3H75W',
                                         'size': 1000202043392,
                                         'ssd': False,
                                         'storagerouter_guid': u'f5155bc2-b238-4a94-b6ce-b5600e65607a'}],
                            'DB': [{'available': 425200713728,
                                    'guid': 'c0064548-c0be-474d-a66b-da65639831f8',
                                    'in_use': False,
                                    'mountpoint': '/mnt/storage',
                                    'size': 425200713728,
                                    'ssd': False,
                                    'storagerouter_guid': u'f5155bc2-b238-4a94-b6ce-b5600e65607a'}],
                            'SCRUB': [{'available': 340160570983,
                                       'guid': 'c0064548-c0be-474d-a66b-da65639831f8',
                                       'in_use': False,
                                       'mountpoint': '/mnt/storage',
                                       'size': 425200713728,
                                       'ssd': False,
                                       'storagerouter_guid': u'f5155bc2-b238-4a94-b6ce-b5600e65607a'}],
                            'WRITE': [{'available': 60016295936,
                                       'guid': '0d167ced-5a5f-47aa-b890-45b923b686c4',
                                       'in_use': False,
                                       'mountpoint': u'/mnt/ssd2',
                                       'size': 60016295936,
                                       'ssd': True,
                                       'storagerouter_guid': u'f5155bc2-b238-4a94-b6ce-b5600e65607a'}]},
             'scrub_available': True,
             'writecache_size': 60016295936}

        :param storagerouter: Storage Router to retrieve metadata for
        :return: Metadata
        """
        result, metadata = GeneralStorageRouter.api.execute_post_action(component='storagerouters',
                                                                        guid=storagerouter.guid,
                                                                        action='get_metadata',
                                                                        data={},
                                                                        wait=True,
                                                                        timeout=300)
        assert result is True, 'Retrieving metadata failed for Storage Router {0}'.format(storagerouter.name)

        required_params = {'ipaddresses': (list, Toolbox.regex_ip),
                           'partitions': (dict, None),
                           'scrub_available': (bool, None),
                           'writecache_size': (int, {'min': 0})}
        Toolbox.verify_required_params(required_params=required_params,
                                       actual_params=metadata,
                                       exact_match=True)
        return metadata
Esempio n. 8
0
    def refresh_package_information():
        """
        Retrieve and store the package information of all StorageRouters
        :return: None
        """
        GenericController._logger.info('Updating package information')
        threads = []
        information = {}
        all_storagerouters = StorageRouterList.get_storagerouters()
        for storagerouter in all_storagerouters:
            information[storagerouter.ip] = {}
            for fct in Toolbox.fetch_hooks('update', 'get_package_info_multi'):
                try:
                    # We make use of these clients in Threads --> cached = False
                    client = SSHClient(endpoint=storagerouter,
                                       username='******',
                                       cached=False)
                except UnableToConnectException:
                    information[storagerouter.ip]['errors'] = [
                        'StorageRouter {0} is inaccessible'.format(
                            storagerouter.name)
                    ]
                    break
                thread = Thread(target=fct, args=(client, information))
                thread.start()
                threads.append(thread)

        for fct in Toolbox.fetch_hooks('update', 'get_package_info_single'):
            thread = Thread(target=fct, args=(information, ))
            thread.start()
            threads.append(thread)

        for thread in threads:
            thread.join()

        errors = []
        copy_information = copy.deepcopy(information)
        for ip, info in information.iteritems():
            if len(info.get('errors', [])) > 0:
                errors.extend(
                    ['{0}: {1}'.format(ip, error) for error in info['errors']])
                copy_information.pop(ip)

        for storagerouter in all_storagerouters:
            info = copy_information.get(storagerouter.ip, {})
            if 'errors' in info:
                info.pop('errors')
            storagerouter.package_information = info
            storagerouter.save()

        if len(errors) > 0:
            errors = [str(error) for error in set(errors)]
            raise Exception(' - {0}'.format('\n - '.join(errors)))
Esempio n. 9
0
 def retrieve_storagerouter_info_via_host(ip, password):
     """
     Retrieve the storagerouters from model
     """
     storagerouters = {}
     try:
         from ovs.dal.lists.storagerouterlist import StorageRouterList
         with remote(ip_info=ip, modules=[StorageRouterList], username='******', password=password, strict_host_key_checking=False) as rem:
             for sr in rem.StorageRouterList.get_storagerouters():
                 storagerouters[sr.name] = {'ip': sr.ip,
                                            'type': sr.node_type.lower()}
     except Exception as ex:
         Toolbox.log(logger=NodeTypeController._logger, messages='Error loading storagerouters: {0}'.format(ex), loglevel='exception', silent=True)
     return storagerouters
Esempio n. 10
0
 def configure_memcached(client, logger):
     """
     Configure Memcached
     :param client: Client on which to configure Memcached
     :type client: ovs.extensions.generic.sshclient.SSHClient
     :param logger: Logger object used for logging
     :type logger: ovs.log.log_handler.LogHandler
     :return: None
     """
     Toolbox.log(logger=logger, messages='Setting up Memcached')
     client.run(['sed', '-i', 's/^-l.*/-l 0.0.0.0/g', '/etc/memcached.conf'])
     client.run(['sed', '-i', 's/^-m.*/-m 1024/g', '/etc/memcached.conf'])
     client.run(['sed', '-i', '-E', 's/^-v(.*)/# -v\1/g', '/etc/memcached.conf'])  # Put all -v, -vv, ... back in comment
     client.run(['sed', '-i', 's/^# -v[^v]*$/-v/g', '/etc/memcached.conf'])     # Uncomment only -v
Esempio n. 11
0
    def restart_framework_and_memcache_services(clients, logger, offline_node_ips=None):
        """
        Restart framework and Memcached services
        :param clients: Clients on which to restart these services
        :type clients: dict
        :param logger: Logger object used for logging
        :type logger: ovs.log.log_handler.LogHandler
        :param offline_node_ips: IP addresses of offline nodes in the cluster
        :type offline_node_ips: list
        :return: None
        """
        from ovs.dal.lists.storagerouterlist import StorageRouterList

        master_ips = [sr.ip for sr in StorageRouterList.get_masters()]
        slave_ips = [sr.ip for sr in StorageRouterList.get_slaves()]
        if offline_node_ips is None:
            offline_node_ips = []
        memcached = 'memcached'
        watcher = 'watcher-framework'
        support_agent = 'support-agent'
        for ip in master_ips + slave_ips:
            if ip not in offline_node_ips:
                if ServiceManager.has_service(watcher, clients[ip]):
                    Toolbox.change_service_state(clients[ip], watcher, 'stop', logger)
        for ip in master_ips:
            if ip not in offline_node_ips:
                Toolbox.change_service_state(clients[ip], memcached, 'restart', logger)
        for ip in master_ips + slave_ips:
            if ip not in offline_node_ips:
                if ServiceManager.has_service(watcher, clients[ip]):
                    Toolbox.change_service_state(clients[ip], watcher, 'start', logger)
                if ServiceManager.has_service(support_agent, clients[ip]):
                    Toolbox.change_service_state(clients[ip], support_agent, 'restart', logger)
        VolatileFactory.store = None
    def validate_alba_backend_removal(alba_backend_info):
        """
        Validate whether the backend has been deleted properly
        alba_backend_info should be a dictionary containing:
            - guid
            - name
            - maintenance_service_names
        :param alba_backend_info: Information about the backend
        :return: None
        """
        Toolbox.verify_required_params(actual_params=alba_backend_info,
                                       required_params={'name': (str, None),
                                                        'guid': (str, Toolbox.regex_guid),
                                                        'maintenance_service_names': (list, None)},
                                       exact_match=True)

        alba_backend_guid = alba_backend_info['guid']
        alba_backend_name = alba_backend_info['name']
        backend = GeneralBackend.get_by_name(alba_backend_name)
        assert backend is None,\
            'Still found a backend in the model with name {0}'.format(alba_backend_name)

        # Validate services removed from model
        for service in GeneralService.get_services_by_name(ServiceType.SERVICE_TYPES.ALBA_MGR):
            assert service.name != '{0}-abm'.format(alba_backend_name),\
                'An AlbaManager service has been found with name {0}'.format(alba_backend_name)
        for service in GeneralService.get_services_by_name(ServiceType.SERVICE_TYPES.NS_MGR):
            assert service.name.startswith('{0}-nsm_'.format(alba_backend_name)) is False,\
                'An NamespaceManager service has been found with name {0}'.format(alba_backend_name)

        # Validate ALBA backend configuration structure
        alba_backend_key = '/ovs/alba/backends'
        actual_configuration_keys = [key for key in Configuration.list(alba_backend_key)]
        assert alba_backend_guid not in actual_configuration_keys,\
            'Configuration still contains an entry in {0} with guid {1}'.format(alba_backend_key, alba_backend_guid)

        # Validate Arakoon configuration structure
        arakoon_keys = [key for key in Configuration.list('/ovs/arakoon') if key.startswith(alba_backend_name)]
        assert len(arakoon_keys) == 0,\
            'Configuration still contains configurations for clusters: {0}'.format(', '.join(arakoon_keys))

        # Validate services
        for storagerouter in GeneralStorageRouter.get_storage_routers():
            root_client = SSHClient(endpoint=storagerouter, username='******')
            maintenance_services = alba_backend_info['maintenance_service_names']
            abm_arakoon_service_name = 'ovs-arakoon-{0}-abm'.format(alba_backend_name)
            nsm_arakoon_service_name = 'ovs-arakoon-{0}-nsm_0'.format(alba_backend_name)
            for service_name in [abm_arakoon_service_name, nsm_arakoon_service_name] + maintenance_services:
                assert GeneralService.has_service(name=service_name, client=root_client) is False,\
                    'Service {0} still deployed on Storage Router {1}'.format(service_name, storagerouter.name)
Esempio n. 13
0
 def validate(license_string):
     """
     Validates a license with the various components
     """
     try:
         result = {}
         data = LicenseController._decode(license_string)
         for component in data:
             cdata = data[component]
             name = cdata['name']
             data = cdata['data']
             _ = cdata['token']
             valid_until = float(
                 cdata['valid_until']) if 'valid_until' in cdata else None
             if valid_until is not None and valid_until <= time.time():
                 result[component] = False
                 continue
             signature = cdata['signature'] if 'signature' in cdata else None
             validate_functions = Toolbox.fetch_hooks(
                 'license', '{0}.validate'.format(component))
             apply_functions = Toolbox.fetch_hooks(
                 'license', '{0}.apply'.format(component))
             if len(validate_functions) == 1 and len(apply_functions) == 1:
                 try:
                     valid, metadata = validate_functions[0](
                         component=component,
                         data=data,
                         signature=signature)
                 except Exception, ex:
                     logger.debug(
                         'Error validating license for {0}: {1}'.format(
                             component, ex))
                     valid = False
                     metadata = None
                 if valid is False:
                     logger.debug('Invalid license for {0}: {1}'.format(
                         component, license_string))
                     result[component] = False
                 else:
                     result[component] = {
                         'valid_until': valid_until,
                         'metadata': metadata,
                         'name': name
                     }
             else:
                 logger.debug(
                     'No validate nor apply functions found for {0}'.format(
                         component))
                 result[component] = False
         return result
Esempio n. 14
0
 def apply(license_string):
     """
     Applies a license. It will apply as much licenses as possible, however, it won't fail on invalid licenses as it
     will simply skip them.
     """
     try:
         clients = {}
         storagerouters = StorageRouterList.get_storagerouters()
         try:
             for storagerouter in storagerouters:
                 clients[storagerouter] = SSHClient(storagerouter.ip)
         except UnableToConnectException:
             raise RuntimeError('Not all StorageRouters are reachable')
         data = LicenseController._decode(license_string)
         for component in data:
             cdata = data[component]
             name = cdata['name']
             data = cdata['data']
             token = cdata['token']
             valid_until = float(cdata['valid_until']) if 'valid_until' in cdata else None
             if valid_until is not None and valid_until <= time.time():
                 continue
             signature = cdata['signature'] if 'signature' in cdata else None
             validate_functions = Toolbox.fetch_hooks('license', '{0}.validate'.format(component))
             apply_functions = Toolbox.fetch_hooks('license', '{0}.apply'.format(component))
             if len(validate_functions) == 1 and len(apply_functions) == 1:
                 valid, metadata = validate_functions[0](component=component, data=data, signature=signature)
                 if valid is True:
                     success = apply_functions[0](component=component, data=data, signature=signature)
                     if success is True:
                         license_object = LicenseList.get_by_component(component)
                         if license_object is None:
                             license_object = License()
                         license_object.component = component
                         license_object.name = name
                         license_object.token = token
                         license_object.data = data
                         license_object.valid_until = valid_until
                         license_object.signature = signature
                         license_object.save()
         license_contents = []
         for lic in LicenseList.get_licenses():
             license_contents.append(lic.hash)
         for storagerouter in storagerouters:
             client = clients[storagerouter]
             client.file_write('/opt/OpenvStorage/config/licenses', '{0}\n'.format('\n'.join(license_contents)))
     except Exception, ex:
         LicenseController._logger.exception('Error applying license: {0}'.format(ex))
         return None
Esempio n. 15
0
    def refresh_package_information():
        """
        Retrieve and store the package information of all StorageRouters
        :return: None
        """
        GenericController._logger.info('Updating package information')
        threads = []
        information = {}
        all_storagerouters = StorageRouterList.get_storagerouters()
        for storagerouter in all_storagerouters:
            information[storagerouter.ip] = {}
            for function in Toolbox.fetch_hooks('update', 'get_package_info_multi'):
                try:
                    # We make use of these clients in Threads --> cached = False
                    client = SSHClient(endpoint=storagerouter, username='******', cached=False)
                except UnableToConnectException:
                    information[storagerouter.ip]['errors'] = ['StorageRouter {0} is inaccessible'.format(storagerouter.name)]
                    break
                thread = Thread(target=function,
                                args=(client, information))
                thread.start()
                threads.append(thread)

        for function in Toolbox.fetch_hooks('update', 'get_package_info_single'):
            thread = Thread(target=function,
                            args=(information,))
            thread.start()
            threads.append(thread)

        for thread in threads:
            thread.join()

        errors = []
        copy_information = copy.deepcopy(information)
        for ip, info in information.iteritems():
            if len(info.get('errors', [])) > 0:
                errors.extend(['{0}: {1}'.format(ip, error) for error in info['errors']])
                copy_information.pop(ip)

        for storagerouter in all_storagerouters:
            info = copy_information.get(storagerouter.ip, {})
            if 'errors' in info:
                info.pop('errors')
            storagerouter.package_information = info
            storagerouter.save()

        if len(errors) > 0:
            errors = [str(error) for error in set(errors)]
            raise Exception(' - {0}'.format('\n - '.join(errors)))
Esempio n. 16
0
 def remove(license_guid):
     """
     Removes a license
     """
     clients = {}
     storagerouters = StorageRouterList.get_storagerouters()
     try:
         for storagerouter in storagerouters:
             clients[storagerouter] = SSHClient(storagerouter.ip)
     except UnableToConnectException:
         raise RuntimeError('Not all StorageRouters are reachable')
     lic = License(license_guid)
     if lic.can_remove is True:
         remove_functions = Toolbox.fetch_hooks(
             'license', '{0}.remove'.format(lic.component))
         result = remove_functions[0](component=lic.component,
                                      data=lic.data,
                                      valid_until=lic.valid_until,
                                      signature=lic.signature)
         if result is True:
             lic.delete()
             license_contents = []
             for lic in LicenseList.get_licenses():
                 license_contents.append(lic.hash)
             for storagerouter in storagerouters:
                 client = clients[storagerouter]
                 client.file_write(
                     '/opt/OpenvStorage/config/licenses',
                     '{0}\n'.format('\n'.join(license_contents)))
         return result
     return None
Esempio n. 17
0
    def _get_package_information(self):
        versions_dict = {self._client.ip: {}}
        # ALba is always installed with OpenvStorage. The current split however offloads retrieving Alba information to the AlbaNode which is not
        # present for non openvstorage-hc installs. Therefore explicititely request the alba information like this (otherwise update will get jeopardized).
        final_dict = {}
        threads = []

        for fct in Toolbox.fetch_hooks(
                component='update', sub_component='get_package_info_cluster'):
            thread = Thread(target=fct, args=(self._client, versions_dict))
            thread.start()
            threads.append(thread)

        for thread in threads:
            thread.join()

        for versions in versions_dict[self._client.ip].itervalues():
            for package, version in versions.iteritems():
                if package in final_dict:
                    if version != final_dict[package]:
                        final_dict[package] = min(version, final_dict[package])
                else:
                    final_dict[package] = version
        return OrderedDict(
            (key, self._stringify_looseversion(value))
            for key, value in sorted(final_dict.items(), key=lambda v: v[0]))
Esempio n. 18
0
 def _can_remove(self):
     """
     Can be removed
     """
     return len(
         Toolbox.fetch_hooks('license',
                             '{0}.remove'.format(self.component))) == 1
Esempio n. 19
0
 def remove(license_guid):
     """
     Removes a license
     """
     clients = {}
     storagerouters = StorageRouterList.get_storagerouters()
     try:
         for storagerouter in storagerouters:
             clients[storagerouter] = SSHClient(storagerouter.ip)
     except UnableToConnectException:
         raise RuntimeError('Not all StorageRouters are reachable')
     lic = License(license_guid)
     if lic.can_remove is True:
         remove_functions = Toolbox.fetch_hooks('license', '{0}.remove'.format(lic.component))
         result = remove_functions[0](component=lic.component, data=lic.data, valid_until=lic.valid_until, signature=lic.signature)
         if result is True:
             lic.delete()
             license_contents = []
             for lic in LicenseList.get_licenses():
                 license_contents.append(lic.hash)
             for storagerouter in storagerouters:
                 client = clients[storagerouter]
                 client.file_write('/opt/OpenvStorage/config/licenses', '{0}\n'.format('\n'.join(license_contents)))
         return result
     return None
Esempio n. 20
0
    def check_rabbitmq_and_enable_ha_mode(client, logger):
        """
        Verify RabbitMQ is running properly and enable HA mode
        :param client: Client on which to check RabbitMQ
        :type client: ovs.extensions.generic.sshclient.SSHClient
        :param logger: Logger object used for logging
        :type logger: ovs.log.log_handler.LogHandler
        :return: None
        """
        if not ServiceManager.has_service('rabbitmq-server', client):
            raise RuntimeError('Service rabbitmq-server has not been added on node {0}'.format(client.ip))
        rabbitmq_running, same_process = ServiceManager.is_rabbitmq_running(client=client)
        if rabbitmq_running is False or same_process is False:
            Toolbox.change_service_state(client, 'rabbitmq-server', 'restart', logger)

        time.sleep(5)
        client.run(['rabbitmqctl', 'set_policy', 'ha-all', '^(volumerouter|ovs_.*)$', '{"ha-mode":"all"}'])
Esempio n. 21
0
 def avahi_installed(client, logger):
     """
     Verify whether Avahi is installed
     :param client: Client on which to check for Avahi
     :type client: ovs_extensions.generic.sshclient.SSHClient
     :param logger: Logger object used for logging
     :type logger: ovs.extensions.generic.logger.Logger
     :return: True if Avahi is installed, False otherwise
     :rtype: bool
     """
     installed = client.run(['which', 'avahi-daemon'], allow_nonzero=True)
     if installed == '':
         Toolbox.log(logger=logger, messages='Avahi not installed')
         return False
     else:
         Toolbox.log(logger=logger, messages='Avahi installed')
         return True
Esempio n. 22
0
 def avahi_installed(client, logger):
     """
     Verify whether Avahi is installed
     :param client: Client on which to check for Avahi
     :type client: ovs.extensions.generic.sshclient.SSHClient
     :param logger: Logger object used for logging
     :type logger: ovs.log.log_handler.LogHandler
     :return: True if Avahi is installed, False otherwise
     :rtype: bool
     """
     installed = client.run(['which', 'avahi-daemon'], allow_nonzero=True)
     if installed == '':
         Toolbox.log(logger=logger, messages='Avahi not installed')
         return False
     else:
         Toolbox.log(logger=logger, messages='Avahi installed')
         return True
Esempio n. 23
0
 def configure_memcached(client, logger):
     """
     Configure Memcached
     :param client: Client on which to configure Memcached
     :type client: ovs_extensions.generic.sshclient.SSHClient
     :param logger: Logger object used for logging
     :type logger: ovs.extensions.generic.logger.Logger
     :return: None
     """
     Toolbox.log(logger=logger, messages='Setting up Memcached')
     client.run(
         ['sed', '-i', 's/^-l.*/-l 0.0.0.0/g', '/etc/memcached.conf'])
     client.run(['sed', '-i', 's/^-m.*/-m 1024/g', '/etc/memcached.conf'])
     client.run(
         ['sed', '-i', '-E', 's/^-v(.*)/# -v\1/g',
          '/etc/memcached.conf'])  # Put all -v, -vv, ... back in comment
     client.run(['sed', '-i', 's/^# -v[^v]*$/-v/g',
                 '/etc/memcached.conf'])  # Uncomment only -v
Esempio n. 24
0
 def run_backend_domain_hooks(backend_guid):
     """
     Run hooks when the Backend Domains have been updated
     :param backend_guid: Guid of the Backend to update
     :type backend_guid: str
     :return: None
     """
     for function in Toolbox.fetch_hooks('backend', 'domains-update'):
         function(backend_guid=backend_guid)
Esempio n. 25
0
 def run_backend_domain_hooks(backend_guid):
     """
     Run hooks when the Backend Domains have been updated
     :param backend_guid: Guid of the Backend to update
     :type backend_guid: str
     :return: None
     """
     for fct in Toolbox.fetch_hooks('backend', 'domains-update'):
         fct(backend_guid=backend_guid)
Esempio n. 26
0
    def _configure_amqp_to_volumedriver():
        Toolbox.log(logger=NodeTypeController._logger, messages='Update existing vPools')
        login = Configuration.get('/ovs/framework/messagequeue|user')
        password = Configuration.get('/ovs/framework/messagequeue|password')
        protocol = Configuration.get('/ovs/framework/messagequeue|protocol')

        uris = []
        for endpoint in Configuration.get('/ovs/framework/messagequeue|endpoints'):
            uris.append({'amqp_uri': '{0}://{1}:{2}@{3}'.format(protocol, login, password, endpoint)})

        if Configuration.dir_exists('/ovs/vpools'):
            for vpool_guid in Configuration.list('/ovs/vpools'):
                for storagedriver_id in Configuration.list('/ovs/vpools/{0}/hosts'.format(vpool_guid)):
                    storagedriver_config = StorageDriverConfiguration('storagedriver', vpool_guid, storagedriver_id)
                    storagedriver_config.load()
                    storagedriver_config.configure_event_publisher(events_amqp_routing_key=Configuration.get('/ovs/framework/messagequeue|queues.storagedriver'),
                                                                   events_amqp_uris=uris)
                    storagedriver_config.save()
Esempio n. 27
0
    def add_services(client, node_type, logger):
        """
        Add the services required by the OVS cluster
        :param client: Client on which to add the services
        :type client: ovs_extensions.generic.sshclient.SSHClient
        :param node_type: Type of node ('master' or 'extra')
        :type node_type: str
        :param logger: Logger object used for logging
        :type logger: ovs.extensions.generic.logger.Logger
        :return: None
        """
        Toolbox.log(logger=logger, messages='Adding services')
        service_manager = ServiceFactory.get_manager()
        services = {}
        worker_queue = System.get_my_machine_id(client=client)
        if node_type == 'master':
            worker_queue += ',ovs_masters'
            services.update({
                'memcached': {
                    'MEMCACHE_NODE_IP': client.ip,
                    'WORKER_QUEUE': worker_queue
                },
                'rabbitmq-server': {
                    'MEMCACHE_NODE_IP': client.ip,
                    'WORKER_QUEUE': worker_queue
                },
                'scheduled-tasks': {},
                'webapp-api': {},
                'volumerouter-consumer': {}
            })
        services.update({
            'workers': {
                'WORKER_QUEUE': worker_queue
            },
            'watcher-framework': {}
        })

        for service_name, params in services.iteritems():
            if not service_manager.has_service(service_name, client):
                Toolbox.log(logger=logger,
                            messages='Adding service {0}'.format(service_name))
                service_manager.add_service(name=service_name,
                                            params=params,
                                            client=client)
Esempio n. 28
0
    def install_plugins():
        """
        (Re)load plugins
        """
        if ServiceManager.has_service('ovs-watcher-framework',
                                      SSHClient('127.0.0.1', username='******')):
            # If the watcher is running, 'ovs setup' was executed and we need to restart everything to load
            # the plugin. In the other case, the plugin will be loaded once 'ovs setup' is executed
            from ovs.dal.lists.storagerouterlist import StorageRouterList
            clients = []
            try:
                for storagerouter in StorageRouterList.get_storagerouters():
                    clients.append(SSHClient(storagerouter, username='******'))
            except UnableToConnectException:
                raise RuntimeError('Not all StorageRouters are reachable')

            for client in clients:
                for service_name in ['watcher-framework', 'memcached']:
                    ServiceManager.stop_service(service_name, client=client)
                    wait = 30
                    while wait > 0:
                        if ServiceManager.get_service_status(
                                service_name, client=client) is False:
                            break
                        time.sleep(1)
                        wait -= 1
                    if wait == 0:
                        raise RuntimeError(
                            'Could not stop service: {0}'.format(service_name))

            for client in clients:
                for service_name in ['memcached', 'watcher-framework']:
                    ServiceManager.start_service(service_name, client=client)
                    wait = 30
                    while wait > 0:
                        if ServiceManager.get_service_status(
                                service_name, client=client) is True:
                            break
                        time.sleep(1)
                        wait -= 1
                    if wait == 0:
                        raise RuntimeError(
                            'Could not start service: {0}'.format(
                                service_name))

            from ovs.dal.helpers import Migration
            Migration.migrate()

            from ovs.lib.helpers.toolbox import Toolbox
            ip = System.get_my_storagerouter().ip
            functions = Toolbox.fetch_hooks('plugin', 'postinstall')
            for function in functions:
                function(ip=ip)
Esempio n. 29
0
    def remove_services(client, node_type, logger):
        """
        Remove all services managed by OVS
        :param client: Client on which to remove the services
        :type client: ovs.extensions.generic.sshclient.SSHClient
        :param node_type: Type of node, can be 'master' or 'extra'
        :type node_type: str
        :param logger: Logger object used for logging
        :type logger: ovs.log.log_handler.LogHandler
        :return: None
        """
        Toolbox.log(logger=logger, messages="Removing services")
        stop_only = ["rabbitmq-server", "memcached"]
        services = ["workers", "support-agent", "watcher-framework"]
        if node_type == "master":
            services += ["scheduled-tasks", "webapp-api", "volumerouter-consumer"]
            if Toolbox.is_service_internally_managed(service="rabbitmq") is True:
                services.append("rabbitmq-server")
            if Toolbox.is_service_internally_managed(service="memcached") is True:
                services.append("memcached")

        for service in services:
            if ServiceManager.has_service(service, client=client):
                Toolbox.log(
                    logger=logger,
                    messages="{0} service {1}".format("Removing" if service not in stop_only else "Stopping", service),
                )
                ServiceManager.stop_service(service, client=client)
                if service not in stop_only:
                    ServiceManager.remove_service(service, client=client)
Esempio n. 30
0
 def change_services_state(services, ssh_clients, action):
     """
     Stop/start services on SSH clients
     If action is start, we ignore errors and try to start other services on other nodes
     """
     services = list(services)
     if action == 'start':
         services.reverse()  # Start services again in reverse order of stopping
     for service_name in services:
         for ssh_client in ssh_clients:
             description = 'stopping' if action == 'stop' else 'starting' if action == 'start' else 'restarting'
             try:
                 if ServiceManager.has_service(service_name, client=ssh_client):
                     Toolbox.change_service_state(client=ssh_client,
                                                  name=service_name,
                                                  state=action,
                                                  logger=UpdateController._logger)
             except Exception as exc:
                 UpdateController._logger.warning('{0}: Something went wrong {1} service {2}: {3}'.format(ssh_client.ip, description, service_name, exc))
                 if action == 'stop':
                     return False
     return True
Esempio n. 31
0
 def validate(license_string):
     """
     Validates a license with the various components
     """
     try:
         result = {}
         data = LicenseController._decode(license_string)
         for component in data:
             cdata = data[component]
             name = cdata['name']
             data = cdata['data']
             _ = cdata['token']
             valid_until = float(cdata['valid_until']) if 'valid_until' in cdata else None
             if valid_until is not None and valid_until <= time.time():
                 result[component] = False
                 continue
             signature = cdata['signature'] if 'signature' in cdata else None
             validate_functions = Toolbox.fetch_hooks('license', '{0}.validate'.format(component))
             apply_functions = Toolbox.fetch_hooks('license', '{0}.apply'.format(component))
             if len(validate_functions) == 1 and len(apply_functions) == 1:
                 try:
                     valid, metadata = validate_functions[0](component=component, data=data, signature=signature)
                 except Exception, ex:
                     LicenseController._logger.debug('Error validating license for {0}: {1}'.format(component, ex))
                     valid = False
                     metadata = None
                 if valid is False:
                     LicenseController._logger.debug('Invalid license for {0}: {1}'.format(component, license_string))
                     result[component] = False
                 else:
                     result[component] = {'valid_until': valid_until,
                                          'metadata': metadata,
                                          'name': name}
             else:
                 LicenseController._logger.debug('No validate nor apply functions found for {0}'.format(component))
                 result[component] = False
         return result
Esempio n. 32
0
 def _change_services_state(services, ssh_clients, action):
     """
     Stop/start services on SSH clients
     If action is start, we ignore errors and try to start other services on other nodes
     """
     if action == 'start':
         services.reverse(
         )  # Start services again in reverse order of stopping
     for service_name in services:
         for ssh_client in ssh_clients:
             description = 'stopping' if action == 'stop' else 'starting' if action == 'start' else 'restarting'
             try:
                 if ServiceManager.has_service(service_name,
                                               client=ssh_client):
                     UpdateController._log_message(
                         '{0} service {1}'.format(description.capitalize(),
                                                  service_name),
                         ssh_client.ip)
                     Toolbox.change_service_state(
                         client=ssh_client,
                         name=service_name,
                         state=action,
                         logger=UpdateController._logger)
                     UpdateController._log_message(
                         '{0} service {1}'.format(
                             'Stopped' if action == 'stop' else 'Started'
                             if action == 'start' else 'Restarted',
                             service_name), ssh_client.ip)
             except Exception as exc:
                 UpdateController._log_message(
                     'Something went wrong {0} service {1}: {2}'.format(
                         description, service_name, exc),
                     ssh_client.ip,
                     severity='warning')
                 if action == 'stop':
                     return False
     return True
Esempio n. 33
0
 def _change_services_state(services, ssh_clients, action):
     """
     Stop/start services on SSH clients
     If action is start, we ignore errors and try to start other services on other nodes
     """
     if action == 'start':
         services.reverse()  # Start services again in reverse order of stopping
     for service_name in services:
         for ssh_client in ssh_clients:
             description = 'stopping' if action == 'stop' else 'starting' if action == 'start' else 'restarting'
             try:
                 if ServiceManager.has_service(service_name, client=ssh_client):
                     UpdateController._log_message('{0} service {1}'.format(description.capitalize(), service_name),
                                                   ssh_client.ip)
                     Toolbox.change_service_state(client=ssh_client,
                                                  name=service_name,
                                                  state=action,
                                                  logger=UpdateController._logger)
                     UpdateController._log_message('{0} service {1}'.format('Stopped' if action == 'stop' else 'Started' if action == 'start' else 'Restarted', service_name), ssh_client.ip)
             except Exception as exc:
                 UpdateController._log_message('Something went wrong {0} service {1}: {2}'.format(description, service_name, exc), ssh_client.ip, severity='warning')
                 if action == 'stop':
                     return False
     return True
Esempio n. 34
0
 def merge_package_information():
     """
     Retrieve the package information from the model for both StorageRouters and ALBA Nodes and merge it
     :return: Package information for all StorageRouters and ALBA nodes
     :rtype: dict
     """
     package_info = dict((storagerouter.ip, storagerouter.package_information) for storagerouter in StorageRouterList.get_storagerouters())
     for function in Toolbox.fetch_hooks('update', 'merge_package_info'):
         output = function()
         for ip in output:
             if ip in package_info:
                 package_info[ip].update(output[ip])
             else:
                 package_info[ip] = output[ip]
     return package_info
Esempio n. 35
0
 def retrieve_storagerouter_info_via_host(ip, password):
     """
     Retrieve the storagerouters from model
     """
     storagerouters = {}
     try:
         from ovs.dal.lists.storagerouterlist import StorageRouterList
         with remote(ip_info=ip,
                     modules=[StorageRouterList],
                     username='******',
                     password=password,
                     strict_host_key_checking=False) as rem:
             for sr in rem.StorageRouterList.get_storagerouters():
                 storagerouters[sr.name] = {
                     'ip': sr.ip,
                     'type': sr.node_type.lower()
                 }
     except Exception as ex:
         Toolbox.log(
             logger=NodeTypeController._logger,
             messages='Error loading storagerouters: {0}'.format(ex),
             loglevel='exception',
             silent=True)
     return storagerouters
Esempio n. 36
0
    def install_plugins():
        """
        (Re)load plugins
        """
        manager = ServiceFactory.get_manager()
        if manager.has_service('ovs-watcher-framework',
                               SSHClient('127.0.0.1', username='******')):
            # If the watcher is running, 'ovs setup' was executed and we need to restart everything to load
            # the plugin. In the other case, the plugin will be loaded once 'ovs setup' is executed
            print 'Installing plugin into Open vStorage'
            from ovs.dal.lists.storagerouterlist import StorageRouterList
            clients = {}
            masters = StorageRouterList.get_masters()
            slaves = StorageRouterList.get_slaves()
            try:
                for sr in masters + slaves:
                    clients[sr] = SSHClient(sr, username='******')
            except UnableToConnectException:
                raise RuntimeError('Not all StorageRouters are reachable')
            memcached = 'memcached'
            watcher = 'watcher-framework'
            for sr in masters + slaves:
                if manager.has_service(watcher, clients[sr]):
                    print '- Stopping watcher on {0} ({1})'.format(
                        sr.name, sr.ip)
                    manager.stop_service(watcher, clients[sr])
            for sr in masters:
                print '- Restarting memcached on {0} ({1})'.format(
                    sr.name, sr.ip)
                manager.restart_service(memcached, clients[sr])
            for sr in masters + slaves:
                if manager.has_service(watcher, clients[sr]):
                    print '- Starting watcher on {0} ({1})'.format(
                        sr.name, sr.ip)
                    manager.start_service(watcher, clients[sr])

            print '- Execute model migrations'
            from ovs.dal.helpers import Migration
            Migration.migrate()

            from ovs.lib.helpers.toolbox import Toolbox
            ip = System.get_my_storagerouter().ip
            functions = Toolbox.fetch_hooks('plugin', 'postinstall')
            if len(functions) > 0:
                print '- Execute post installation scripts'
            for fct in functions:
                fct(ip=ip)
            print 'Installing plugin into Open vStorage: Completed'
Esempio n. 37
0
    def install_plugins():
        """
        (Re)load plugins
        """
        if ServiceManager.has_service('ovs-watcher-framework', SSHClient('127.0.0.1', username='******')):
            # If the watcher is running, 'ovs setup' was executed and we need to restart everything to load
            # the plugin. In the other case, the plugin will be loaded once 'ovs setup' is executed
            from ovs.dal.lists.storagerouterlist import StorageRouterList
            clients = []
            try:
                for storagerouter in StorageRouterList.get_storagerouters():
                    clients.append(SSHClient(storagerouter, username='******'))
            except UnableToConnectException:
                raise RuntimeError('Not all StorageRouters are reachable')

            for client in clients:
                for service_name in ['watcher-framework', 'memcached']:
                    ServiceManager.stop_service(service_name, client=client)
                    wait = 30
                    while wait > 0:
                        if ServiceManager.get_service_status(service_name, client=client) is False:
                            break
                        time.sleep(1)
                        wait -= 1
                    if wait == 0:
                        raise RuntimeError('Could not stop service: {0}'.format(service_name))

            for client in clients:
                for service_name in ['memcached', 'watcher-framework']:
                    ServiceManager.start_service(service_name, client=client)
                    wait = 30
                    while wait > 0:
                        if ServiceManager.get_service_status(service_name, client=client) is True:
                            break
                        time.sleep(1)
                        wait -= 1
                    if wait == 0:
                        raise RuntimeError('Could not start service: {0}'.format(service_name))

            from ovs.dal.helpers import Migration
            Migration.migrate()

            from ovs.lib.helpers.toolbox import Toolbox
            ip = System.get_my_storagerouter().ip
            functions = Toolbox.fetch_hooks('plugin', 'postinstall')
            for function in functions:
                function(ip=ip)
Esempio n. 38
0
    def restart_framework_and_memcache_services(clients,
                                                logger,
                                                offline_node_ips=None):
        """
        Restart framework and Memcached services
        :param clients: Clients on which to restart these services
        :type clients: dict
        :param logger: Logger object used for logging
        :type logger: ovs.log.log_handler.LogHandler
        :param offline_node_ips: IP addresses of offline nodes in the cluster
        :type offline_node_ips: list
        :return: None
        """
        from ovs.dal.lists.storagerouterlist import StorageRouterList

        service_manager = ServiceFactory.get_manager()
        master_ips = [sr.ip for sr in StorageRouterList.get_masters()]
        slave_ips = [sr.ip for sr in StorageRouterList.get_slaves()]
        if offline_node_ips is None:
            offline_node_ips = []
        memcached = 'memcached'
        watcher = 'watcher-framework'
        support_agent = 'support-agent'
        for ip in master_ips + slave_ips:
            if ip not in offline_node_ips:
                if service_manager.has_service(watcher, clients[ip]):
                    Toolbox.change_service_state(clients[ip], watcher, 'stop',
                                                 logger)
        for ip in master_ips:
            if ip not in offline_node_ips:
                Toolbox.change_service_state(clients[ip], memcached, 'restart',
                                             logger)
        for ip in master_ips + slave_ips:
            if ip not in offline_node_ips:
                if service_manager.has_service(watcher, clients[ip]):
                    Toolbox.change_service_state(clients[ip], watcher, 'start',
                                                 logger)
                if service_manager.has_service(support_agent, clients[ip]):
                    Toolbox.change_service_state(clients[ip], support_agent,
                                                 'restart', logger)
        VolatileFactory.store = None
Esempio n. 39
0
    def get_update_information_all():
        """
        Retrieve the update information for all StorageRouters
        This contains information about
            - downtime of model, GUI, vPools, proxies, ...
            - services that will be restarted
            - packages that will be updated
            - prerequisites that have not been met
        :return: Information about the update
        :rtype: dict
        """
        information = {}
        for function in Toolbox.fetch_hooks('update', 'information'):
            function(information=information)

        for component, info in copy.deepcopy(information).iteritems():
            if len(info['packages']) == 0:
                information.pop(component)
        return information
Esempio n. 40
0
    def install_plugins():
        """
        (Re)load plugins
        """
        if ServiceManager.has_service('ovs-watcher-framework', SSHClient('127.0.0.1', username='******')):
            # If the watcher is running, 'ovs setup' was executed and we need to restart everything to load
            # the plugin. In the other case, the plugin will be loaded once 'ovs setup' is executed
            print 'Installing plugin into Open vStorage'
            from ovs.dal.lists.storagerouterlist import StorageRouterList
            clients = {}
            masters = StorageRouterList.get_masters()
            slaves = StorageRouterList.get_slaves()
            try:
                for sr in masters + slaves:
                    clients[sr] = SSHClient(sr, username='******')
            except UnableToConnectException:
                raise RuntimeError('Not all StorageRouters are reachable')
            memcached = 'memcached'
            watcher = 'watcher-framework'
            for sr in masters + slaves:
                if ServiceManager.has_service(watcher, clients[sr]):
                    print '- Stopping watcher on {0} ({1})'.format(sr.name, sr.ip)
                    ServiceManager.stop_service(watcher, clients[sr])
            for sr in masters:
                print '- Restarting memcached on {0} ({1})'.format(sr.name, sr.ip)
                ServiceManager.restart_service(memcached, clients[sr])
            for sr in masters + slaves:
                if ServiceManager.has_service(watcher, clients[sr]):
                    print '- Starting watcher on {0} ({1})'.format(sr.name, sr.ip)
                    ServiceManager.start_service(watcher, clients[sr])

            print '- Execute model migrations'
            from ovs.dal.helpers import Migration
            Migration.migrate()

            from ovs.lib.helpers.toolbox import Toolbox
            ip = System.get_my_storagerouter().ip
            functions = Toolbox.fetch_hooks('plugin', 'postinstall')
            if len(functions) > 0:
                print '- Execute post installation scripts'
            for function in functions:
                function(ip=ip)
            print 'Installing plugin into Open vStorage: Completed'
Esempio n. 41
0
    def remove_services(client, node_type, logger):
        """
        Remove all services managed by OVS
        :param client: Client on which to remove the services
        :type client: ovs_extensions.generic.sshclient.SSHClient
        :param node_type: Type of node, can be 'master' or 'extra'
        :type node_type: str
        :param logger: Logger object used for logging
        :type logger: ovs.extensions.generic.logger.Logger
        :return: None
        """
        Toolbox.log(logger=logger, messages='Removing services')
        service_manager = ServiceFactory.get_manager()
        stop_only = ['rabbitmq-server', 'memcached']
        services = ['workers', 'support-agent', 'watcher-framework']
        if node_type == 'master':
            services += [
                'scheduled-tasks', 'webapp-api', 'volumerouter-consumer'
            ]
            if Toolbox.is_service_internally_managed(
                    service='rabbitmq') is True:
                services.append('rabbitmq-server')
            if Toolbox.is_service_internally_managed(
                    service='memcached') is True:
                services.append('memcached')

        for service in services:
            if service_manager.has_service(service, client=client):
                Toolbox.log(
                    logger=logger,
                    messages='{0} service {1}'.format(
                        'Removing' if service not in stop_only else 'Stopping',
                        service))
                service_manager.stop_service(service, client=client)
                if service not in stop_only:
                    service_manager.remove_service(service, client=client)
Esempio n. 42
0
    def dtl_checkup(vpool_guid=None, vdisk_guid=None, storagerouters_to_exclude=None):
        """
        Check DTL for all volumes
        :param vpool_guid:                vPool to check the DTL configuration of all its disks
        :type vpool_guid:                 String

        :param vdisk_guid:                Virtual Disk to check its DTL configuration
        :type vdisk_guid:                 String

        :param storagerouters_to_exclude: Storage Routers to exclude from possible targets
        :type storagerouters_to_exclude:  List

        :return:                          None
        """
        if vpool_guid is not None and vdisk_guid is not None:
            raise ValueError('vpool and vdisk are mutually exclusive')
        if storagerouters_to_exclude is None:
            storagerouters_to_exclude = []

        from ovs.lib.vpool import VPoolController

        logger.info('DTL checkup started')
        required_params = {'dtl_mode': (str, StorageDriverClient.VPOOL_DTL_MODE_MAP.keys()),
                           'dtl_enabled': (bool, None)}
        vdisk = VDisk(vdisk_guid) if vdisk_guid else None
        vpool = VPool(vpool_guid) if vpool_guid else None
        errors_found = False
        root_client_map = {}
        vpool_dtl_config_cache = {}
        vdisks = VDiskList.get_vdisks() if vdisk is None and vpool is None else vpool.vdisks if vpool is not None else [vdisk]
        for vdisk in vdisks:
            logger.info('    Verifying vDisk {0} with guid {1}'.format(vdisk.name, vdisk.guid))
            vdisk.invalidate_dynamics(['storagedriver_client', 'storagerouter_guid'])
            if vdisk.storagedriver_client is None:
                continue

            vpool = vdisk.vpool
            if vpool.guid not in vpool_dtl_config_cache:
                vpool_config = VPoolController.get_configuration(vpool.guid)  # Config on vPool is permanent for DTL settings
                vpool_dtl_config_cache[vpool.guid] = vpool_config
                Toolbox.verify_required_params(required_params, vpool_config)

            volume_id = str(vdisk.volume_id)
            vpool_config = vpool_dtl_config_cache[vpool.guid]
            dtl_vpool_enabled = vpool_config['dtl_enabled']
            try:
                current_dtl_config = vdisk.storagedriver_client.get_dtl_config(volume_id)
                current_dtl_config_mode = vdisk.storagedriver_client.get_dtl_config_mode(volume_id)
            except RuntimeError as rte:
                # Can occur when a volume has not been stolen yet from a dead node
                logger.error('Retrieving DTL configuration from storage driver failed with error: {0}'.format(rte))
                errors_found = True
                continue

            if dtl_vpool_enabled is False and (current_dtl_config is None or current_dtl_config.host == 'null'):
                logger.info('    DTL is globally disabled for vPool {0} with guid {1}'.format(vpool.name, vpool.guid))
                vdisk.storagedriver_client.set_manual_dtl_config(volume_id, None)
                continue
            elif current_dtl_config_mode == DTLConfigMode.MANUAL and (current_dtl_config is None or current_dtl_config.host == 'null'):
                logger.info('    DTL is disabled for virtual disk {0} with guid {1}'.format(vdisk.name, vdisk.guid))
                continue

            storage_router = StorageRouter(vdisk.storagerouter_guid)
            available_storagerouters = []
            # 1. Check available storage routers in the backup failure domain
            if storage_router.secondary_failure_domain is not None:
                for storagerouter in storage_router.secondary_failure_domain.primary_storagerouters:
                    if vpool.guid not in storagerouter.vpools_guids:
                        continue
                    if storagerouter not in root_client_map:
                        try:
                            root_client = SSHClient(storagerouter, username='******')
                        except UnableToConnectException:
                            logger.warning('    Storage Router with IP {0} of vDisk {1} is not reachable'.format(storagerouter.ip, vdisk.name))
                            continue
                        root_client_map[storagerouter] = root_client
                    else:
                        root_client = root_client_map[storagerouter]
                    if ServiceManager.get_service_status('dtl_{0}'.format(vpool.name), client=root_client) is True:
                        available_storagerouters.append(storagerouter)
            # 2. Check available storage routers in the same failure domain as current storage router
            if len(available_storagerouters) == 0:
                for storagerouter in storage_router.primary_failure_domain.primary_storagerouters:
                    if vpool.guid not in storagerouter.vpools_guids or storagerouter == storage_router:
                        continue
                    if storagerouter not in root_client_map:
                        try:
                            root_client = SSHClient(storagerouter, username='******')
                        except UnableToConnectException:
                            logger.warning('    Storage Router with IP {0} of vDisk {1} is not reachable'.format(storagerouter.ip, vdisk.name))
                            continue
                        root_client_map[storagerouter] = root_client
                    else:
                        root_client = root_client_map[storagerouter]
                    if ServiceManager.get_service_status('dtl_{0}'.format(vpool.name), client=root_client) is True:
                        available_storagerouters.append(storagerouter)

            # Remove storage routers to exclude
            for sr_guid in storagerouters_to_exclude:
                sr_to_exclude = StorageRouter(sr_guid)
                if sr_to_exclude in available_storagerouters:
                    available_storagerouters.remove(sr_to_exclude)

            if len(available_storagerouters) == 0:
                logger.info('    No Storage Routers could be found as valid DTL target')
                vdisk.storagedriver_client.set_manual_dtl_config(volume_id, None)
                continue

            # Check whether reconfiguration is required
            reconfigure_required = False
            if current_dtl_config is None:
                logger.info('        No DTL configuration found, but there are Storage Routers available')
                reconfigure_required = True
            elif current_dtl_config_mode == DTLConfigMode.AUTOMATIC:
                logger.info('        DTL configuration set to AUTOMATIC, switching to manual')
                reconfigure_required = True
            else:
                dtl_host = current_dtl_config.host
                dtl_port = current_dtl_config.port
                storage_drivers = [sd for sd in vpool.storagedrivers if sd.storagerouter.ip == dtl_host]

                logger.info('        DTL host: {0}'.format(dtl_host or '-'))
                logger.info('        DTL port: {0}'.format(dtl_port or '-'))
                if dtl_host not in [sr.ip for sr in available_storagerouters]:
                    logger.info('        Host not in available Storage Routers')
                    reconfigure_required = True
                elif dtl_port != storage_drivers[0].ports[2]:
                    logger.info('        Configured port does not match expected port ({0} vs {1})'.format(dtl_port, storage_drivers[0].ports[2]))
                    reconfigure_required = True

            # Perform the reconfiguration
            if reconfigure_required is True:
                logger.info('        Reconfigure required')
                index = random.randint(0, len(available_storagerouters) - 1)
                dtl_target = available_storagerouters[index]
                storage_drivers = [sd for sd in vpool.storagedrivers if sd.storagerouter == dtl_target]
                if len(storage_drivers) == 0:
                    raise ValueError('Could not retrieve related storagedriver')

                port = storage_drivers[0].ports[2]
                vpool_dtl_mode = vpool_config.get('dtl_mode', StorageDriverClient.FRAMEWORK_DTL_ASYNC)
                logger.info('        DTL config that will be set -->  Host: {0}, Port: {1}, Mode: {2}'.format(dtl_target.ip, port, vpool_dtl_mode))
                dtl_config = DTLConfig(str(dtl_target.ip), port, StorageDriverClient.VDISK_DTL_MODE_MAP[vpool_dtl_mode])
                vdisk.storagedriver_client.set_manual_dtl_config(volume_id, dtl_config)
        if errors_found is True:
            logger.error('DTL checkup ended with errors')
            raise Exception('DTL checkup failed with errors. Please check /var/log/ovs/lib.log for more information')
        logger.info('DTL checkup ended')
Esempio n. 43
0
    def set_config_params(vdisk_guid, new_config_params):
        """
        Sets configuration parameters for a given vdisk.
        :param vdisk_guid: Guid of the virtual disk to set the configuration parameters for
        :param new_config_params: New configuration parameters
        """
        required_params = {'dtl_mode': (str, StorageDriverClient.VDISK_DTL_MODE_MAP.keys()),
                           'sco_size': (int, StorageDriverClient.TLOG_MULTIPLIER_MAP.keys()),
                           'dedupe_mode': (str, StorageDriverClient.VDISK_DEDUPE_MAP.keys()),
                           'write_buffer': (int, {'min': 128, 'max': 10 * 1024}),
                           'cache_strategy': (str, StorageDriverClient.VDISK_CACHE_MAP.keys()),
                           'readcache_limit': (int, {'min': 1, 'max': 10 * 1024}, False)}

        if new_config_params.get('dtl_target') is not None:
            required_params.update({'dtl_target': (str, Toolbox.regex_ip)})

        Toolbox.verify_required_params(required_params, new_config_params)

        if new_config_params['dtl_mode'] != 'no_sync' and new_config_params.get('dtl_target') is None:
            raise Exception('If DTL mode is Asynchronous or Synchronous, a target IP should always be specified')

        errors = False
        vdisk = VDisk(vdisk_guid)
        volume_id = str(vdisk.volume_id)
        old_config_params = VDiskController.get_config_params(vdisk.guid)

        # 1st update SCO size, because this impacts TLOG multiplier which on its turn impacts write buffer
        new_sco_size = new_config_params['sco_size']
        old_sco_size = old_config_params['sco_size']
        if new_sco_size != old_sco_size:
            write_buffer = float(new_config_params['write_buffer'])
            tlog_multiplier = StorageDriverClient.TLOG_MULTIPLIER_MAP[new_sco_size]
            sco_factor = write_buffer / tlog_multiplier / new_sco_size
            try:
                logger.info('Updating property sco_size on vDisk {0} to {1}'.format(vdisk_guid, new_sco_size))
                vdisk.storagedriver_client.set_sco_multiplier(volume_id, new_sco_size / 4 * 1024)
                vdisk.storagedriver_client.set_tlog_multiplier(volume_id, tlog_multiplier)
                vdisk.storagedriver_client.set_sco_cache_max_non_disposable_factor(volume_id, sco_factor)
                logger.info('Updated property sco_size')
            except Exception as ex:
                logger.error('Error updating "sco_size": {0}'.format(ex))
                errors = True

        # 2nd Check for DTL changes
        new_dtl_mode = new_config_params['dtl_mode']
        old_dtl_mode = old_config_params['dtl_mode']
        new_dtl_target = new_config_params.get('dtl_target')
        old_dtl_target = old_config_params['dtl_target']
        if old_dtl_mode != new_dtl_mode or new_dtl_target != old_dtl_target:
            if old_dtl_mode != new_dtl_mode and new_dtl_mode == 'no_sync':
                logger.info('Disabling DTL for vDisk {0}'.format(vdisk_guid))
                vdisk.storagedriver_client.set_manual_dtl_config(volume_id, None)
            elif (new_dtl_target is not None and new_dtl_target != old_dtl_target or old_dtl_mode != new_dtl_mode) and new_dtl_mode != 'no_sync':
                logger.info('Changing DTL to use global values for vDisk {0}'.format(vdisk_guid))
                sr_target = StorageRouterList.get_by_ip(new_dtl_target)
                if sr_target is None:
                    logger.error('Failed to retrieve Storage Router with IP {0}'.format(new_dtl_target))
                    errors = True
                for sd in sr_target.storagedrivers:
                    if sd.vpool == vdisk.vpool:
                        dtl_config = DTLConfig(str(new_dtl_target), sd.ports[2], StorageDriverClient.VDISK_DTL_MODE_MAP[new_dtl_mode])
                        vdisk.storagedriver_client.set_manual_dtl_config(volume_id, dtl_config)
                        break
                else:
                    logger.error('Failed to retrieve Storage Driver with IP {0}'.format(new_dtl_target))
                    errors = True

        # 2nd update rest
        for key in required_params:
            try:
                if key in ['sco_size', 'dtl_mode', 'dtl_target']:
                    continue

                new_value = new_config_params[key]
                old_value = old_config_params[key]
                if new_value != old_value:
                    logger.info('Updating property {0} on vDisk {1} from to {2}'.format(key, vdisk_guid, new_value))
                    if key == 'dedupe_mode':
                        vdisk.storagedriver_client.set_readcache_mode(volume_id, StorageDriverClient.VDISK_DEDUPE_MAP[new_value])
                    elif key == 'write_buffer':
                        tlog_multiplier = vdisk.storagedriver_client.get_tlog_multiplier(volume_id) or StorageDriverClient.TLOG_MULTIPLIER_MAP[new_sco_size]
                        sco_factor = float(new_value) / tlog_multiplier / new_sco_size
                        vdisk.storagedriver_client.set_sco_cache_max_non_disposable_factor(volume_id, sco_factor)
                    elif key == 'cache_strategy':
                        vdisk.storagedriver_client.set_readcache_behaviour(volume_id, StorageDriverClient.VDISK_CACHE_MAP[new_value])
                    elif key == 'readcache_limit':
                        vol_info = vdisk.storagedriver_client.info_volume(volume_id)
                        block_size = vol_info.lba_size * vol_info.cluster_multiplier or 4096
                        limit = new_value * 1024 * 1024 * 1024 / block_size if new_value else None
                        vdisk.storagedriver_client.set_readcache_limit(volume_id, limit)
                    else:
                        raise KeyError('Unsupported property provided: "{0}"'.format(key))
                    logger.info('Updated property {0}'.format(key))
            except Exception as ex:
                logger.error('Error updating "{0}": {1}'.format(key, ex))
                errors = True
        if errors is True:
            raise Exception('Failed to update the values for vDisk {0}'.format(vdisk.name))
Esempio n. 44
0
    def set_config_params(vdisk_guid, new_config_params, old_config_params):
        """
        Sets configuration parameters for a given vdisk.
        """
        required_params = {
                           # 'dtl_mode': (str, StorageDriverClient.VDISK_DTL_MODE_MAP.keys()),
                           'sco_size': (int, StorageDriverClient.TLOG_MULTIPLIER_MAP.keys()),
                           'dedupe_mode': (str, StorageDriverClient.VDISK_DEDUPE_MAP.keys()),
                           'dtl_enabled': (bool, None),
                           # 'dtl_location': (str, None),
                           'write_buffer': (int, {'min': 128, 'max': 10 * 1024}),
                           'cache_strategy': (str, StorageDriverClient.VDISK_CACHE_MAP.keys()),
                           'readcache_limit': (int, {'min': 1, 'max': 10 * 1024}, False)}

        Toolbox.verify_required_params(required_params, new_config_params)
        Toolbox.verify_required_params(required_params, old_config_params)

        errors = False
        vdisk = VDisk(vdisk_guid)
        volume_id = str(vdisk.volume_id)
        old_sco_size = old_config_params['sco_size']
        new_sco_size = new_config_params['sco_size']

        # 1st update SCO size, because this impacts TLOG multiplier which on its turn impacts write buffer
        if new_sco_size != old_sco_size:
            write_buffer = float(new_config_params['write_buffer'])
            tlog_multiplier = StorageDriverClient.TLOG_MULTIPLIER_MAP[new_sco_size]
            sco_factor = write_buffer / tlog_multiplier / new_sco_size
            try:
                logger.info('Updating property sco_size on vDisk {0} from {1} to {2}'.format(vdisk_guid, old_sco_size, new_sco_size))
                vdisk.storagedriver_client.set_sco_multiplier(volume_id, new_sco_size / 4 * 1024)
                vdisk.storagedriver_client.set_tlog_multiplier(volume_id, tlog_multiplier)
                vdisk.storagedriver_client.set_sco_cache_max_non_disposable_factor(volume_id, sco_factor)
                logger.info('Updated property sco_size')
            except Exception as ex:
                logger.error('Error updating "sco_size": {0}'.format(ex))
                errors = True

        # 2nd update rest
        for key, old_value in old_config_params.iteritems():
            if key.startswith('dtl') or key == 'sco_size':
                continue
            new_value = new_config_params[key]
            if new_value != old_value:
                try:
                    logger.info('Updating property {0} on vDisk {1} from {2} to {3}'.format(key, vdisk_guid, old_value, new_value))
                    if key == 'cache_strategy':
                        vdisk.storagedriver_client.set_readcache_behaviour(volume_id, StorageDriverClient.VDISK_CACHE_MAP[new_value])
                    elif key == 'dedupe_mode':
                        vdisk.storagedriver_client.set_readcache_mode(volume_id, StorageDriverClient.VDISK_DEDUPE_MAP[new_value])
                    elif key == 'write_buffer':
                        tlog_multiplier = vdisk.storagedriver_client.get_tlog_multiplier(volume_id) or StorageDriverClient.TLOG_MULTIPLIER_MAP[new_sco_size]
                        sco_factor = float(new_value) / tlog_multiplier / new_sco_size
                        vdisk.storagedriver_client.set_sco_cache_max_non_disposable_factor(volume_id, sco_factor)
                    elif key == 'readcache_limit':
                        volume_info = vdisk.storagedriver_client.info_volume(volume_id)
                        block_size = volume_info.lba_size * volume_info.cluster_multiplier or 4096
                        limit = new_value * 1024 * 1024 * 1024 / block_size if new_value else None
                        vdisk.storagedriver_client.set_readcache_limit(volume_id, limit)
                    else:
                        raise KeyError('Unsupported property provided: "{0}"'.format(key))
                    logger.info('Updated property {0}'.format(key))
                except Exception as ex:
                    logger.error('Error updating "{0}": {1}'.format(key, ex))
                    errors = True
        if errors is True:
            raise Exception('Failed to update the values for vDisk {0}'.format(vdisk.name))
Esempio n. 45
0
    def promote_or_demote_node(node_action,
                               cluster_ip=None,
                               execute_rollback=False):
        """
        Promotes or demotes the local node
        :param node_action: Demote or promote
        :type node_action: str
        :param cluster_ip: IP of node to promote or demote
        :type cluster_ip: str
        :param execute_rollback: In case of failure revert the changes made
        :type execute_rollback: bool
        :return: None
        """

        if node_action not in ('promote', 'demote'):
            raise ValueError('Nodes can only be promoted or demoted')

        Toolbox.log(logger=NodeTypeController._logger,
                    messages='Open vStorage Setup - {0}'.format(
                        node_action.capitalize()),
                    boxed=True)
        try:
            Toolbox.log(logger=NodeTypeController._logger,
                        messages='Collecting information',
                        title=True)

            machine_id = System.get_my_machine_id()
            if Configuration.get('/ovs/framework/hosts/{0}/setupcompleted'.
                                 format(machine_id)) is False:
                raise RuntimeError('No local OVS setup found.')

            if cluster_ip and not re.match(Toolbox.regex_ip, cluster_ip):
                raise RuntimeError(
                    'Incorrect IP provided ({0})'.format(cluster_ip))

            if cluster_ip:
                client = SSHClient(endpoint=cluster_ip)
                machine_id = System.get_my_machine_id(client)

            node_type = Configuration.get(
                '/ovs/framework/hosts/{0}/type'.format(machine_id))
            if node_action == 'promote' and node_type == 'MASTER':
                raise RuntimeError('This node is already master.')
            elif node_action == 'demote' and node_type == 'EXTRA':
                raise RuntimeError('This node should be a master.')
            elif node_type not in ['MASTER', 'EXTRA']:
                raise RuntimeError('This node is not correctly configured.')

            master_ip = None
            offline_nodes = []

            online = True
            target_client = None
            if node_action == 'demote' and cluster_ip:  # Demote an offline node
                from ovs.dal.lists.storagerouterlist import StorageRouterList
                from ovs.lib.storagedriver import StorageDriverController

                ip = cluster_ip
                unique_id = None
                ip_client_map = {}
                for storage_router in StorageRouterList.get_storagerouters():
                    try:
                        client = SSHClient(storage_router.ip, username='******')
                        if storage_router.node_type == 'MASTER':
                            master_ip = storage_router.ip
                        ip_client_map[storage_router.ip] = client
                    except UnableToConnectException:
                        if storage_router.ip == cluster_ip:
                            online = False
                            unique_id = storage_router.machine_id
                            StorageDriverController.mark_offline(
                                storagerouter_guid=storage_router.guid)
                        offline_nodes.append(storage_router)
                if online is True:
                    raise RuntimeError(
                        "If the node is online, please use 'ovs setup demote' executed on the node you wish to demote"
                    )
                if master_ip is None:
                    raise RuntimeError(
                        'Failed to retrieve another responsive MASTER node')

            else:
                target_password = Toolbox.ask_validate_password(
                    ip='127.0.0.1', logger=NodeTypeController._logger)
                target_client = SSHClient('127.0.0.1',
                                          username='******',
                                          password=target_password)

                unique_id = System.get_my_machine_id(target_client)
                ip = Configuration.get(
                    '/ovs/framework/hosts/{0}/ip'.format(unique_id))

                storagerouter_info = NodeTypeController.retrieve_storagerouter_info_via_host(
                    ip=target_client.ip, password=target_password)
                node_ips = [
                    sr_info['ip']
                    for sr_info in storagerouter_info.itervalues()
                ]
                master_node_ips = [
                    sr_info['ip']
                    for sr_info in storagerouter_info.itervalues()
                    if sr_info['type'] == 'master' and sr_info['ip'] != ip
                ]
                if len(master_node_ips) == 0:
                    if node_action == 'promote':
                        raise RuntimeError('No master node could be found')
                    else:
                        raise RuntimeError(
                            'It is not possible to remove the only master')

                master_ip = master_node_ips[0]
                ip_client_map = dict(
                    (node_ip, SSHClient(node_ip, username='******'))
                    for node_ip in node_ips)

            if node_action == 'demote':
                for cluster_name in Configuration.list('/ovs/arakoon'):
                    config = ArakoonClusterConfig(cluster_id=cluster_name)
                    arakoon_client = ArakoonInstaller.build_client(config)
                    metadata = json.loads(
                        arakoon_client.get(ArakoonInstaller.METADATA_KEY))
                    if len(config.nodes) == 1 and config.nodes[
                            0].ip == ip and metadata.get('internal') is True:
                        raise RuntimeError(
                            'Demote is not supported when single node Arakoon cluster(s) are present on the node to be demoted.'
                        )

            configure_rabbitmq = Toolbox.is_service_internally_managed(
                service='rabbitmq')
            configure_memcached = Toolbox.is_service_internally_managed(
                service='memcached')
            if node_action == 'promote':
                try:
                    NodeTypeController.promote_node(
                        cluster_ip=ip,
                        master_ip=master_ip,
                        ip_client_map=ip_client_map,
                        unique_id=unique_id,
                        configure_memcached=configure_memcached,
                        configure_rabbitmq=configure_rabbitmq)
                except Exception:
                    if execute_rollback is True:
                        NodeTypeController.demote_node(
                            cluster_ip=ip,
                            master_ip=master_ip,
                            ip_client_map=ip_client_map,
                            unique_id=unique_id,
                            unconfigure_memcached=configure_memcached,
                            unconfigure_rabbitmq=configure_rabbitmq,
                            offline_nodes=offline_nodes)
                    elif target_client is not None:
                        target_client.file_write('/tmp/ovs_rollback', 'demote')
                    raise
            else:
                try:
                    NodeTypeController.demote_node(
                        cluster_ip=ip,
                        master_ip=master_ip,
                        ip_client_map=ip_client_map,
                        unique_id=unique_id,
                        unconfigure_memcached=configure_memcached,
                        unconfigure_rabbitmq=configure_rabbitmq,
                        offline_nodes=offline_nodes)
                except Exception:
                    if execute_rollback is True:
                        NodeTypeController.promote_node(
                            cluster_ip=ip,
                            master_ip=master_ip,
                            ip_client_map=ip_client_map,
                            unique_id=unique_id,
                            configure_memcached=configure_memcached,
                            configure_rabbitmq=configure_rabbitmq)
                    elif target_client is not None:
                        target_client.file_write('/tmp/ovs_rollback',
                                                 'promote')
                    raise

            Toolbox.log(logger=NodeTypeController._logger, messages='\n')
            Toolbox.log(logger=NodeTypeController._logger,
                        messages='{0} complete.'.format(
                            node_action.capitalize()),
                        boxed=True)
        except Exception as exception:
            Toolbox.log(logger=NodeTypeController._logger, messages='\n')
            Toolbox.log(
                logger=NodeTypeController._logger,
                messages=['An unexpected error occurred:',
                          str(exception)],
                boxed=True,
                loglevel='exception')
            sys.exit(1)
        except KeyboardInterrupt:
            Toolbox.log(logger=NodeTypeController._logger, messages='\n')
            Toolbox.log(
                logger=NodeTypeController._logger,
                messages=
                'This setup was aborted. Open vStorage may be in an inconsistent state, make sure to validate the installation.',
                boxed=True,
                loglevel='error')
            sys.exit(1)
    def validate_vpool_sanity(expected_settings):
        """
        Check if all requirements are met for a healthy vPool
        :param expected_settings: Parameters used to create a vPool, which will be verified
        :type expected_settings: dict

        :return: None
        """
        if not isinstance(expected_settings, dict) or len(expected_settings) == 0:
            raise ValueError('Cannot validate vpool when no settings are passed')

        generic_settings = expected_settings.values()[0]
        vpool_name = generic_settings['vpool_name']
        mountpoint = '/mnt/{0}'.format(vpool_name)
        backend_type = generic_settings['type']
        rdma_enabled = generic_settings['config_params']['dtl_transport'] == StorageDriverClient.FRAMEWORK_DTL_TRANSPORT_RSOCKET

        vpool = GeneralVPool.get_vpool_by_name(vpool_name=vpool_name)
        assert vpool is not None, 'Could not find vPool with name {0}'.format(vpool_name)
        vpool_config = GeneralVPool.get_configuration(vpool)

        # Verify some basic vPool attributes
        assert vpool.name == vpool_name, 'Expected name {0} for vPool'.format(vpool_name)
        assert vpool.backend_type.code == backend_type, 'Expected backend type {0}'.format(backend_type)
        assert vpool.status == VPool.STATUSES.RUNNING, 'vPool does not have RUNNING status'
        assert vpool.rdma_enabled == rdma_enabled, 'RDMA enabled setting is incorrect'
        assert set(expected_settings.keys()) == set([sd.storagerouter for sd in vpool.storagedrivers]), "vPool storagerouters don't match the expected Storage Routers"

        # Verify vPool Storage Driver configuration
        expected_vpool_config = copy.deepcopy(generic_settings['config_params'])
        for key, value in vpool_config.iteritems():
            if key == 'dtl_enabled' or key == 'tlog_multiplier':
                continue
            if key not in expected_vpool_config:
                raise ValueError('Expected settings does not contain key {0}'.format(key))

            if value != expected_vpool_config[key]:
                raise ValueError('vPool does not have expected configuration {0} for key {1}'.format(expected_vpool_config[key], key))
            expected_vpool_config.pop(key)

        if len(expected_vpool_config) > 0:
            raise ValueError('Actual vPool configuration does not contain keys: {0}'.format(', '.join(expected_vpool_config.keys())))

        # Prepare some fields to check
        config = generic_settings['config_params']
        dtl_mode = config['dtl_mode']
        sco_size = config['sco_size']
        dedupe_mode = config['dedupe_mode']
        cluster_size = config['cluster_size']
        write_buffer = config['write_buffer']
        dtl_transport = config['dtl_transport']
        cache_strategy = config['cache_strategy']
        # @TODO: Add more validations for other expected settings (instead of None)
        expected_config = {'backend_connection_manager': {'backend_interface_retries_on_error': 5,
                                                          'backend_interface_retry_interval_secs': 1,
                                                          'backend_interface_retry_backoff_multiplier': 2.0},
                           'content_addressed_cache': {'clustercache_mount_points': None,
                                                       'read_cache_serialization_path': u'/var/rsp/{0}'.format(vpool.name)},
                           'distributed_lock_store': {'dls_arakoon_cluster_id': None,
                                                      'dls_arakoon_cluster_nodes': None,
                                                      'dls_type': u'Arakoon'},
                           'distributed_transaction_log': {'dtl_path': None,
                                                           'dtl_transport': dtl_transport.upper()},
                           'event_publisher': {'events_amqp_routing_key': u'volumerouter',
                                               'events_amqp_uris': None},
                           'file_driver': {'fd_cache_path': None,
                                           'fd_extent_cache_capacity': u'1024',
                                           'fd_namespace': None},
                           'filesystem': {'fs_dtl_config_mode': u'Automatic',
                                          'fs_dtl_mode': u'{0}'.format(StorageDriverClient.VPOOL_DTL_MODE_MAP[dtl_mode]),
                                          'fs_enable_shm_interface': 1,
                                          'fs_file_event_rules': None,
                                          'fs_metadata_backend_arakoon_cluster_nodes': None,
                                          'fs_metadata_backend_mds_nodes': None,
                                          'fs_metadata_backend_type': u'MDS',
                                          'fs_raw_disk_suffix': None,
                                          'fs_virtual_disk_format': None},
                           'metadata_server': {'mds_nodes': None},
                           'scocache': {'backoff_gap': u'2GB',
                                        'scocache_mount_points': None,
                                        'trigger_gap': u'1GB'},
                           'threadpool_component': {'num_threads': 16},
                           'volume_manager': {'clean_interval': 1,
                                              'default_cluster_size': 1024 * cluster_size,
                                              'dtl_throttle_usecs': 4000,
                                              'metadata_path': None,
                                              'non_disposable_scos_factor': float(write_buffer) / StorageDriverClient.TLOG_MULTIPLIER_MAP[sco_size] / sco_size,
                                              'number_of_scos_in_tlog': StorageDriverClient.TLOG_MULTIPLIER_MAP[sco_size],
                                              'read_cache_default_behaviour': StorageDriverClient.VPOOL_CACHE_MAP[cache_strategy],
                                              'read_cache_default_mode': StorageDriverClient.VPOOL_DEDUPE_MAP[dedupe_mode],
                                              'tlog_path': None},
                           'volume_registry': {'vregistry_arakoon_cluster_id': u'voldrv',
                                               'vregistry_arakoon_cluster_nodes': None},
                           'volume_router': {'vrouter_backend_sync_timeout_ms': 5000,
                                             'vrouter_file_read_threshold': 1024,
                                             'vrouter_file_write_threshold': 1024,
                                             'vrouter_id': None,
                                             'vrouter_max_workers': 16,
                                             'vrouter_migrate_timeout_ms': 5000,
                                             'vrouter_min_workers': 4,
                                             'vrouter_redirect_timeout_ms': u'5000',
                                             'vrouter_routing_retries': 10,
                                             'vrouter_sco_multiplier': 1024,
                                             'vrouter_volume_read_threshold': 1024,
                                             'vrouter_volume_write_threshold': 1024},
                           'volume_router_cluster': {'vrouter_cluster_id': None}}
        vpool_services = {'all': ['ovs-watcher-volumedriver',
                                  'ovs-dtl_{0}'.format(vpool.name),
                                  'ovs-volumedriver_{0}'.format(vpool.name),
                                  'ovs-volumerouter-consumer'],
                          'extra': [],
                          'master': ['ovs-arakoon-voldrv']}
        sd_partitions = {'DB': ['MD', 'MDS', 'TLOG'],
                         'READ': ['None'],
                         'WRITE': ['FD', 'DTL', 'SCO'],
                         'SCRUB': ['None']}

        if backend_type == 'alba':
            backend_metadata = {'name': (str, None),
                                'preset': (str, Toolbox.regex_preset),
                                'backend_guid': (str, Toolbox.regex_guid),
                                'arakoon_config': (dict, None),
                                'connection': (dict, {'host': (str, Toolbox.regex_ip, False),
                                                      'port': (int, {'min': 1, 'max': 65535}),
                                                      'client_id': (str, Toolbox.regex_guid),
                                                      'client_secret': (str, None),
                                                      'local': (bool, None)}),
                                'backend_info': (dict, {'policies': (list, None),
                                                        'sco_size': (float, None),
                                                        'frag_size': (float, None),
                                                        'total_size': (float, None),
                                                        'nsm_partition_guids': (list, Toolbox.regex_guid)})}
            required = {'backend': (dict, backend_metadata),
                        'backend_aa': (dict, backend_metadata, False)}
            Toolbox.verify_required_params(required_params=required,
                                           actual_params=vpool.metadata)
            vpool_services['all'].append("ovs-albaproxy_{0}".format(vpool.name))
            sd_partitions['WRITE'].append('FCACHE')
            expected_config['backend_connection_manager'].update({'alba_connection_host': None,
                                                                  'alba_connection_port': None,
                                                                  'alba_connection_preset': None,
                                                                  'alba_connection_timeout': 15,
                                                                  'backend_type': u'{0}'.format(vpool.backend_type.code.upper())})
        elif backend_type == 'distributed':
            expected_config['backend_connection_manager'].update({'backend_type': u'LOCAL',
                                                                  'local_connection_path': u'{0}'.format(generic_settings['distributed_mountpoint'])})

        assert EtcdConfiguration.exists('/ovs/arakoon/voldrv/config', raw=True), 'Volumedriver arakoon does not exist'

        # Do some verifications for all SDs
        storage_ip = None
        voldrv_config = GeneralArakoon.get_config('voldrv')
        all_files = GeneralVPool.get_related_files(vpool=vpool)
        all_directories = GeneralVPool.get_related_directories(vpool=vpool)

        for storagedriver in vpool.storagedrivers:
            storagerouter = storagedriver.storagerouter
            root_client = SSHClient(storagerouter, username='******')

            assert EtcdConfiguration.exists('/ovs/vpools/{0}/hosts/{1}/config'.format(vpool.guid, storagedriver.storagedriver_id), raw=True), 'vPool config not found in etcd'
            current_config_sections = set([item for item in EtcdConfiguration.list('/ovs/vpools/{0}/hosts/{1}/config'.format(vpool.guid, storagedriver.storagedriver_id))])
            assert not current_config_sections.difference(set(expected_config.keys())), 'New section appeared in the storage driver config in etcd'
            assert not set(expected_config.keys()).difference(current_config_sections), 'Config section expected for storage driver, but not found in etcd'

            for key, values in expected_config.iteritems():
                current_config = EtcdConfiguration.get('/ovs/vpools/{0}/hosts/{1}/config/{2}'.format(vpool.guid, storagedriver.storagedriver_id, key))
                assert set(current_config.keys()).union(set(values.keys())) == set(values.keys()), 'Not all expected keys match for key "{0}" on Storage Driver {1}'.format(key, storagedriver.name)

                for sub_key, value in current_config.iteritems():
                    expected_value = values[sub_key]
                    if expected_value is None:
                        continue
                    assert value == expected_value, 'Key: {0} - Sub key: {1} - Value: {2} - Expected value: {3}'.format(key, sub_key, value, expected_value)

            # Check services
            if storagerouter.node_type == 'MASTER':
                for service_name in vpool_services['all'] + vpool_services['master']:
                    if service_name == 'ovs-arakoon-voldrv' and GeneralStorageDriver.has_role(storagedriver, 'DB') is False:
                        continue
                    if ServiceManager.get_service_status(name=service_name,
                                                         client=root_client) is not True:
                        raise ValueError('Service {0} is not running on node {1}'.format(service_name, storagerouter.ip))
            else:
                for service_name in vpool_services['all'] + vpool_services['extra']:
                    if ServiceManager.get_service_status(name=service_name,
                                                         client=root_client) is not True:
                        raise ValueError('Service {0} is not running on node {1}'.format(service_name, storagerouter.ip))

            # Check arakoon config
            if not voldrv_config.has_section(storagerouter.machine_id):
                raise ValueError('Voldrv arakoon cluster does not have section {0}'.format(storagerouter.machine_id))

            # Basic SD checks
            assert storagedriver.cluster_ip == storagerouter.ip, 'Incorrect cluster IP. Expected: {0}  -  Actual: {1}'.format(storagerouter.ip, storagedriver.cluster_ip)
            assert storagedriver.mountpoint == '/mnt/{0}'.format(vpool.name), 'Incorrect mountpoint. Expected: {0}  -  Actual: {1}'.format(mountpoint, storagedriver.mountpoint)
            if storage_ip is not None:
                assert storagedriver.storage_ip == storage_ip, 'Incorrect storage IP. Expected: {0}  -  Actual: {1}'.format(storage_ip, storagedriver.storage_ip)
            storage_ip = storagedriver.storage_ip

            # Check required directories and files
            if storagerouter.guid not in all_directories:
                raise ValueError('Could not find directory information for Storage Router {0}'.format(storagerouter.ip))
            if storagerouter.guid not in all_files:
                raise ValueError('Could not find file information for Storage Router {0}'.format(storagerouter.ip))

            for directory in all_directories[storagerouter.guid]:
                if root_client.dir_exists(directory) is False:
                    raise ValueError('Directory {0} does not exist on Storage Router {1}'.format(directory, storagerouter.ip))
            for file_name in all_files[storagerouter.guid]:
                if root_client.file_exists(file_name) is False:
                    raise ValueError('File {0} does not exist on Storage Router {1}'.format(file_name, storagerouter.ip))

            for partition in storagedriver.partitions:
                if partition.role in sd_partitions and partition.sub_role in sd_partitions[partition.role]:
                    sd_partitions[partition.role].remove(partition.sub_role)
                elif partition.role in sd_partitions and partition.sub_role is None:
                    sd_partitions[partition.role].remove('None')

            # Verify vPool writeable
            if storagerouter.pmachine.hvtype == 'VMWARE':
                GeneralVPool.mount_vpool(vpool=vpool,
                                         root_client=root_client)

            vdisk = GeneralVDisk.create_volume(size=10,
                                               vpool=vpool,
                                               root_client=root_client)
            GeneralVDisk.write_to_volume(vdisk=vdisk,
                                         vpool=vpool,
                                         root_client=root_client,
                                         count=10,
                                         bs='1M',
                                         input_type='random')
            GeneralVDisk.delete_volume(vdisk=vdisk,
                                       vpool=vpool,
                                       root_client=root_client)

        for role, sub_roles in sd_partitions.iteritems():
            for sub_role in sub_roles:
                raise ValueError('Not a single Storage Driver found with partition role {0} and sub-role {1}'.format(role, sub_role))
Esempio n. 47
0
    def refresh_package_information():
        """
        Retrieve and store the package information of all StorageRouters
        :return: None
        """
        GenericController._logger.info('Updating package information')

        client_map = {}
        prerequisites = []
        package_info_cluster = {}
        all_storagerouters = StorageRouterList.get_storagerouters()
        all_storagerouters.sort(key=lambda sr: ExtensionsToolbox.advanced_sort(
            element=sr.ip, separator='.'))
        for storagerouter in all_storagerouters:
            package_info_cluster[storagerouter.ip] = {}
            try:
                # We make use of these clients in Threads --> cached = False
                client_map[storagerouter] = SSHClient(endpoint=storagerouter,
                                                      username='******',
                                                      cached=False)
            except (NotAuthenticatedException, UnableToConnectException):
                GenericController._logger.warning(
                    'StorageRouter {0} is inaccessible'.format(
                        storagerouter.ip))
                prerequisites.append(['node_down', storagerouter.name])
                package_info_cluster[storagerouter.ip]['errors'] = [
                    'StorageRouter {0} is inaccessible'.format(
                        storagerouter.name)
                ]

        # Retrieve for each StorageRouter in the cluster the installed and candidate versions of related packages
        # This also validates whether all required packages have been installed
        GenericController._logger.debug(
            'Retrieving package information for the cluster')
        threads = []
        for storagerouter, client in client_map.iteritems():
            for fct in Toolbox.fetch_hooks(
                    component='update',
                    sub_component='get_package_update_info_cluster'):
                thread = Thread(target=fct,
                                args=(client, package_info_cluster))
                thread.start()
                threads.append(thread)

        for thread in threads:
            thread.join()

        # Retrieve the related downtime / service restart information
        GenericController._logger.debug(
            'Retrieving update information for the cluster')
        update_info_cluster = {}
        for storagerouter, client in client_map.iteritems():
            update_info_cluster[storagerouter.ip] = {
                'errors':
                package_info_cluster[storagerouter.ip].get('errors', [])
            }
            for fct in Toolbox.fetch_hooks(
                    component='update',
                    sub_component='get_update_info_cluster'):
                fct(client, update_info_cluster,
                    package_info_cluster[storagerouter.ip])

        # Retrieve the update information for plugins (eg: ALBA, iSCSI)
        GenericController._logger.debug(
            'Retrieving package and update information for the plugins')
        threads = []
        update_info_plugin = {}
        for fct in Toolbox.fetch_hooks('update', 'get_update_info_plugin'):
            thread = Thread(target=fct, args=(update_info_plugin, ))
            thread.start()
            threads.append(thread)

        for thread in threads:
            thread.join()

        # Add the prerequisites
        if len(prerequisites) > 0:
            for ip, component_info in update_info_cluster.iteritems():
                if PackageFactory.COMP_FWK in component_info:
                    component_info[PackageFactory.COMP_FWK][
                        'prerequisites'].extend(prerequisites)

        # Store information in model and collect errors for OVS cluster
        errors = set()
        for storagerouter in all_storagerouters:
            GenericController._logger.debug(
                'Storing update information for StorageRouter {0}'.format(
                    storagerouter.ip))
            update_info = update_info_cluster.get(storagerouter.ip, {})

            # Remove the errors from the update information
            sr_errors = update_info.pop('errors', [])
            if len(sr_errors) > 0:
                errors.update([
                    '{0}: {1}'.format(storagerouter.ip, error)
                    for error in sr_errors
                ])
                update_info = {
                }  # If any error occurred, we store no update information for this StorageRouter

            # Remove the components without updates from the update information
            update_info_copy = copy.deepcopy(update_info)
            for component, info in update_info_copy.iteritems():
                if len(info['packages']) == 0:
                    update_info.pop(component)

            # Store the update information
            storagerouter.package_information = update_info
            storagerouter.save()

        # Collect errors for plugins
        for ip, plugin_errors in update_info_plugin.iteritems():
            if len(plugin_errors) > 0:
                errors.update(
                    ['{0}: {1}'.format(ip, error) for error in plugin_errors])

        if len(errors) > 0:
            raise Exception('\n - {0}'.format('\n - '.join(errors)))
        GenericController._logger.info('Finished updating package information')
Esempio n. 48
0
    def remove_node(node_ip, silent=None):
        """
        Remove the node with specified IP from the cluster
        :param node_ip: IP of the node to remove
        :type node_ip: str
        :param silent: If silent == '--force-yes' no question will be asked to confirm the removal
        :type silent: str
        :return: None
        """
        from ovs.lib.storagedriver import StorageDriverController
        from ovs.lib.storagerouter import StorageRouterController
        from ovs.dal.lists.storagerouterlist import StorageRouterList

        Toolbox.log(logger=NodeRemovalController._logger, messages="Remove node", boxed=True)
        Toolbox.log(
            logger=NodeRemovalController._logger,
            messages="WARNING: Some of these steps may take a very long time, please check the logs for more information\n\n",
        )

        ###############
        # VALIDATIONS #
        ###############
        try:
            node_ip = node_ip.strip()
            if not isinstance(node_ip, str):
                raise ValueError("Node IP must be a string")
            if not re.match(SSHClient.IP_REGEX, node_ip):
                raise ValueError("Invalid IP {0} specified".format(node_ip))

            storage_router_all = StorageRouterList.get_storagerouters()
            storage_router_masters = StorageRouterList.get_masters()
            storage_router_all_ips = set([storage_router.ip for storage_router in storage_router_all])
            storage_router_master_ips = set([storage_router.ip for storage_router in storage_router_masters])
            storage_router_to_remove = StorageRouterList.get_by_ip(node_ip)

            if node_ip not in storage_router_all_ips:
                raise ValueError(
                    "Unknown IP specified\nKnown in model:\n - {0}\nSpecified for removal:\n - {1}".format(
                        "\n - ".join(storage_router_all_ips), node_ip
                    )
                )

            if len(storage_router_all_ips) == 1:
                raise RuntimeError("Removing the only node is not possible")

            if node_ip in storage_router_master_ips and len(storage_router_master_ips) == 1:
                raise RuntimeError("Removing the only master node is not possible")

            if System.get_my_storagerouter() == storage_router_to_remove:
                raise RuntimeError(
                    "The node to be removed cannot be identical to the node on which the removal is initiated"
                )

            Toolbox.log(
                logger=NodeRemovalController._logger, messages="Creating SSH connections to remaining master nodes"
            )
            master_ip = None
            ip_client_map = {}
            storage_routers_offline = []
            storage_router_to_remove_online = True
            for storage_router in storage_router_all:
                try:
                    client = SSHClient(storage_router, username="******")
                    if client.run(["pwd"]):
                        Toolbox.log(
                            logger=NodeRemovalController._logger,
                            messages="  Node with IP {0:<15} successfully connected to".format(storage_router.ip),
                        )
                        ip_client_map[storage_router.ip] = client
                        if storage_router != storage_router_to_remove and storage_router.node_type == "MASTER":
                            master_ip = storage_router.ip
                except UnableToConnectException:
                    Toolbox.log(
                        logger=NodeRemovalController._logger,
                        messages="  Node with IP {0:<15} is unreachable".format(storage_router.ip),
                    )
                    storage_routers_offline.append(storage_router)
                    if storage_router == storage_router_to_remove:
                        storage_router_to_remove_online = False

            if len(ip_client_map) == 0 or master_ip is None:
                raise RuntimeError("Could not connect to any master node in the cluster")

            storage_router_to_remove.invalidate_dynamics("vdisks_guids")
            if (
                len(storage_router_to_remove.vdisks_guids) > 0
            ):  # vDisks are supposed to be moved away manually before removing a node
                raise RuntimeError("Still vDisks attached to Storage Router {0}".format(storage_router_to_remove.name))

            internal_memcached = Toolbox.is_service_internally_managed(service="memcached")
            internal_rabbit_mq = Toolbox.is_service_internally_managed(service="rabbitmq")
            memcached_endpoints = Configuration.get(key="/ovs/framework/memcache|endpoints")
            rabbit_mq_endpoints = Configuration.get(key="/ovs/framework/messagequeue|endpoints")
            copy_memcached_endpoints = list(memcached_endpoints)
            copy_rabbit_mq_endpoints = list(rabbit_mq_endpoints)
            for endpoint in memcached_endpoints:
                if endpoint.startswith(storage_router_to_remove.ip):
                    copy_memcached_endpoints.remove(endpoint)
            for endpoint in rabbit_mq_endpoints:
                if endpoint.startswith(storage_router_to_remove.ip):
                    copy_rabbit_mq_endpoints.remove(endpoint)
            if len(copy_memcached_endpoints) == 0 and internal_memcached is True:
                raise RuntimeError(
                    "Removal of provided nodes will result in a complete removal of the memcached service"
                )
            if len(copy_rabbit_mq_endpoints) == 0 and internal_rabbit_mq is True:
                raise RuntimeError(
                    "Removal of provided nodes will result in a complete removal of the messagequeue service"
                )
        except Exception as exception:
            Toolbox.log(
                logger=NodeRemovalController._logger, messages=[str(exception)], boxed=True, loglevel="exception"
            )
            sys.exit(1)

        #################
        # CONFIRMATIONS #
        #################
        interactive = silent != "--force-yes"
        remove_asd_manager = not interactive  # Remove ASD manager if non-interactive else ask
        if interactive is True:
            proceed = Interactive.ask_yesno(
                message="Are you sure you want to remove node {0}?".format(storage_router_to_remove.name),
                default_value=False,
            )
            if proceed is False:
                Toolbox.log(logger=NodeRemovalController._logger, messages="Abort removal", title=True)
                sys.exit(1)

            if storage_router_to_remove_online is True:
                client = SSHClient(endpoint=storage_router_to_remove, username="******")
                if ServiceManager.has_service(name="asd-manager", client=client):
                    remove_asd_manager = Interactive.ask_yesno(
                        message="Do you also want to remove the ASD manager and related ASDs?", default_value=False
                    )

            if remove_asd_manager is True or storage_router_to_remove_online is False:
                for function in Toolbox.fetch_hooks("setup", "validate_asd_removal"):
                    validation_output = function(storage_router_to_remove.ip)
                    if validation_output["confirm"] is True:
                        if Interactive.ask_yesno(message=validation_output["question"], default_value=False) is False:
                            remove_asd_manager = False
                            break

        ###########
        # REMOVAL #
        ###########
        try:
            Toolbox.log(
                logger=NodeRemovalController._logger,
                messages="Starting removal of node {0} - {1}".format(
                    storage_router_to_remove.name, storage_router_to_remove.ip
                ),
            )
            if storage_router_to_remove_online is False:
                Toolbox.log(
                    logger=NodeRemovalController._logger,
                    messages="  Marking all Storage Drivers served by Storage Router {0} as offline".format(
                        storage_router_to_remove.ip
                    ),
                )
                StorageDriverController.mark_offline(storagerouter_guid=storage_router_to_remove.guid)

            # Remove vPools
            Toolbox.log(
                logger=NodeRemovalController._logger,
                messages="  Removing vPools from node".format(storage_router_to_remove.ip),
            )
            storage_routers_offline_guids = [
                sr.guid for sr in storage_routers_offline if sr.guid != storage_router_to_remove.guid
            ]
            for storage_driver in storage_router_to_remove.storagedrivers:
                Toolbox.log(
                    logger=NodeRemovalController._logger,
                    messages="    Removing vPool {0} from node".format(storage_driver.vpool.name),
                )
                StorageRouterController.remove_storagedriver(
                    storagedriver_guid=storage_driver.guid, offline_storage_router_guids=storage_routers_offline_guids
                )

            # Demote if MASTER
            if storage_router_to_remove.node_type == "MASTER":
                NodeTypeController.demote_node(
                    cluster_ip=storage_router_to_remove.ip,
                    master_ip=master_ip,
                    ip_client_map=ip_client_map,
                    unique_id=storage_router_to_remove.machine_id,
                    unconfigure_memcached=internal_memcached,
                    unconfigure_rabbitmq=internal_rabbit_mq,
                    offline_nodes=storage_routers_offline,
                )

            # Stop / remove services
            Toolbox.log(logger=NodeRemovalController._logger, messages="Stopping and removing services")
            config_store = Configuration.get_store()
            if storage_router_to_remove_online is True:
                client = SSHClient(endpoint=storage_router_to_remove, username="******")
                NodeRemovalController.remove_services(
                    client=client,
                    node_type=storage_router_to_remove.node_type.lower(),
                    logger=NodeRemovalController._logger,
                )
                service = "watcher-config"
                if ServiceManager.has_service(service, client=client):
                    Toolbox.log(logger=NodeRemovalController._logger, messages="Removing service {0}".format(service))
                    ServiceManager.stop_service(service, client=client)
                    ServiceManager.remove_service(service, client=client)

                if config_store == "etcd":
                    from ovs.extensions.db.etcd.installer import EtcdInstaller

                    if Configuration.get(key="/ovs/framework/external_config") is None:
                        Toolbox.log(logger=NodeRemovalController._logger, messages="      Removing Etcd cluster")
                        try:
                            EtcdInstaller.stop("config", client)
                            EtcdInstaller.remove("config", client)
                        except Exception as ex:
                            Toolbox.log(
                                logger=NodeRemovalController._logger,
                                messages=["\nFailed to unconfigure Etcd", ex],
                                loglevel="exception",
                            )

                    Toolbox.log(logger=NodeRemovalController._logger, messages="Removing Etcd proxy")
                    EtcdInstaller.remove_proxy("config", client.ip)

            Toolbox.run_hooks(
                component="noderemoval",
                sub_component="remove",
                logger=NodeRemovalController._logger,
                cluster_ip=storage_router_to_remove.ip,
                complete_removal=remove_asd_manager,
            )

            # Clean up model
            Toolbox.log(logger=NodeRemovalController._logger, messages="Removing node from model")
            for service in storage_router_to_remove.services:
                service.delete()
            for disk in storage_router_to_remove.disks:
                for partition in disk.partitions:
                    partition.delete()
                disk.delete()
            for j_domain in storage_router_to_remove.domains:
                j_domain.delete()
            Configuration.delete("/ovs/framework/hosts/{0}".format(storage_router_to_remove.machine_id))

            NodeTypeController.restart_framework_and_memcache_services(
                clients=ip_client_map,
                offline_node_ips=[node.ip for node in storage_routers_offline],
                logger=NodeRemovalController._logger,
            )

            if storage_router_to_remove_online is True:
                client = SSHClient(endpoint=storage_router_to_remove, username="******")
                if config_store == "arakoon":
                    client.file_delete(filenames=[ArakoonConfiguration.CACC_LOCATION])
                client.file_delete(filenames=[Configuration.BOOTSTRAP_CONFIG_LOCATION])
            storage_router_to_remove.delete()
            Toolbox.log(logger=NodeRemovalController._logger, messages="Successfully removed node\n")
        except Exception as exception:
            Toolbox.log(logger=NodeRemovalController._logger, messages="\n")
            Toolbox.log(
                logger=NodeRemovalController._logger,
                messages=["An unexpected error occurred:", str(exception)],
                boxed=True,
                loglevel="exception",
            )
            sys.exit(1)
        except KeyboardInterrupt:
            Toolbox.log(logger=NodeRemovalController._logger, messages="\n")
            Toolbox.log(
                logger=NodeRemovalController._logger,
                messages="This setup was aborted. Open vStorage may be in an inconsistent state, make sure to validate the installation.",
                boxed=True,
                loglevel="error",
            )
            sys.exit(1)

        if remove_asd_manager is True:
            Toolbox.log(logger=NodeRemovalController._logger, messages="\nRemoving ASD Manager")
            with remote(storage_router_to_remove.ip, [os]) as rem:
                rem.os.system("asd-manager remove --force-yes")
        Toolbox.log(logger=NodeRemovalController._logger, messages="Remove nodes finished", title=True)
Esempio n. 49
0
    def update_volumedriver():
        """
        Update the volumedriver
        :return: None
        """
        file_mutex = FileMutex('system_update', wait=2)
        upgrade_file = '/etc/ready_for_upgrade'
        upgrade_ongoing_check_file = '/etc/upgrade_ongoing'
        ssh_clients = []
        try:
            file_mutex.acquire()
            UpdateController._log_message(
                '+++ Starting volumedriver update +++')

            from ovs.dal.lists.storagerouterlist import StorageRouterList

            UpdateController._log_message(
                'Generating SSH client connections for each storage router')
            storage_routers = StorageRouterList.get_storagerouters()
            ssh_clients = [
                SSHClient(storage_router.ip, 'root')
                for storage_router in storage_routers
            ]
            this_client = [
                client for client in ssh_clients if client.is_local is True
            ][0]

            # Commence update !!!!!!!
            # 0. Create locks
            UpdateController._log_message('Creating lock files',
                                          client_ip=this_client.ip)
            for client in ssh_clients:
                client.run(
                    'touch {0}'.format(upgrade_file)
                )  # Prevents manual install or upgrade individual packages
                client.run('touch {0}'.format(upgrade_ongoing_check_file)
                           )  # Prevents clicking x times on 'Update' btn

            # 1. Check requirements
            packages_to_update = set()
            all_services_to_restart = []
            for client in ssh_clients:
                for function in Toolbox.fetch_hooks('update', 'metadata'):
                    UpdateController._log_message(
                        'Executing function {0}'.format(function.__name__),
                        client_ip=client.ip)
                    output = function(client)
                    for key, value in output.iteritems():
                        if key != 'volumedriver':
                            continue
                        for package_info in value:
                            packages_to_update.update(package_info['packages'])
                            all_services_to_restart += package_info['services']

            services_to_restart = []
            for service in all_services_to_restart:
                if service not in services_to_restart:
                    services_to_restart.append(
                        service
                    )  # Filter out duplicates keeping the order of services (eg: watcher-framework before memcached)

            UpdateController._log_message(
                'Services which will be restarted --> {0}'.format(
                    ', '.join(services_to_restart)))
            UpdateController._log_message(
                'Packages which will be installed --> {0}'.format(
                    ', '.join(packages_to_update)))

            # 1. Stop services
            if UpdateController._change_services_state(
                    services=services_to_restart,
                    ssh_clients=ssh_clients,
                    action='stop') is False:
                UpdateController._log_message(
                    'Stopping all services on every node failed, cannot continue',
                    client_ip=this_client.ip,
                    severity='warning')
                UpdateController._remove_lock_files(
                    [upgrade_file, upgrade_ongoing_check_file], ssh_clients)

                UpdateController._log_message(
                    'Attempting to start the services again',
                    client_ip=this_client.ip)
                UpdateController._change_services_state(
                    services=services_to_restart,
                    ssh_clients=ssh_clients,
                    action='start')
                UpdateController._log_message(
                    'Failed to stop all required services, update aborted',
                    client_ip=this_client.ip,
                    severity='error')
                return

            # 2. Update packages
            failed_clients = []
            for client in ssh_clients:
                PackageManager.update(client=client)
                try:
                    for package_name in packages_to_update:
                        UpdateController._log_message(
                            'Installing {0}'.format(package_name), client.ip)
                        PackageManager.install(package_name=package_name,
                                               client=client,
                                               force=True)
                        UpdateController._log_message(
                            'Installed {0}'.format(package_name), client.ip)
                    client.file_delete(upgrade_file)
                except subprocess.CalledProcessError as cpe:
                    UpdateController._log_message(
                        'Upgrade failed with error: {0}'.format(cpe.output),
                        client.ip, 'error')
                    failed_clients.append(client)
                    break

            if failed_clients:
                UpdateController._remove_lock_files(
                    [upgrade_file, upgrade_ongoing_check_file], ssh_clients)
                UpdateController._log_message(
                    'Error occurred. Attempting to start all services again',
                    client_ip=this_client.ip,
                    severity='error')
                UpdateController._change_services_state(
                    services=services_to_restart,
                    ssh_clients=ssh_clients,
                    action='start')
                UpdateController._log_message(
                    'Failed to upgrade following nodes:\n - {0}\nPlease check /var/log/ovs/lib.log on {1} for more information'
                    .format('\n - '.join([
                        client.ip for client in failed_clients
                    ])), this_client.ip, 'error')
                return

            # 3. Post upgrade actions
            UpdateController._log_message('Executing post upgrade actions',
                                          client_ip=this_client.ip)
            for client in ssh_clients:
                for function in Toolbox.fetch_hooks('update', 'postupgrade'):
                    UpdateController._log_message(
                        'Executing action: {0}'.format(function.__name__),
                        client_ip=client.ip)
                    try:
                        function(client)
                    except Exception as ex:
                        UpdateController._log_message(
                            'Post upgrade action failed with error: {0}'.
                            format(ex), client.ip, 'error')

            # 4. Start services
            UpdateController._log_message('Starting services',
                                          client_ip=this_client.ip)
            UpdateController._change_services_state(
                services=services_to_restart,
                ssh_clients=ssh_clients,
                action='start')

            UpdateController._remove_lock_files([upgrade_ongoing_check_file],
                                                ssh_clients)
            UpdateController._log_message('+++ Finished updating +++')
        except RuntimeError as rte:
            if 'Could not acquire lock' in rte.message:
                UpdateController._log_message(
                    'Another volumedriver update is currently in progress!')
            else:
                UpdateController._log_message(
                    'Error during volumedriver update: {0}'.format(rte),
                    severity='error')
                UpdateController._remove_lock_files(
                    [upgrade_file, upgrade_ongoing_check_file], ssh_clients)
        except Exception as ex:
            UpdateController._log_message(
                'Error during volumedriver update: {0}'.format(ex),
                severity='error')
            UpdateController._remove_lock_files(
                [upgrade_file, upgrade_ongoing_check_file], ssh_clients)
        finally:
            file_mutex.release()
Esempio n. 50
0
    def demote_node(cluster_ip,
                    master_ip,
                    ip_client_map,
                    unique_id,
                    unconfigure_memcached,
                    unconfigure_rabbitmq,
                    offline_nodes=None):
        """
        Demotes a given node
        """
        from ovs.dal.lists.storagerouterlist import StorageRouterList

        Toolbox.log(logger=NodeTypeController._logger,
                    messages='Demoting node',
                    title=True)
        service_manager = ServiceFactory.get_manager()
        if offline_nodes is None:
            offline_nodes = []

        if unconfigure_memcached is True and len(offline_nodes) == 0:
            if NodeTypeController._validate_local_memcache_servers(
                    ip_client_map) is False:
                raise RuntimeError(
                    'Not all memcache nodes can be reached which is required for demoting a node.'
                )

        # Find other (arakoon) master nodes
        arakoon_cluster_name = str(
            Configuration.get('/ovs/framework/arakoon_clusters|ovsdb'))
        arakoon_metadata = ArakoonInstaller.get_arakoon_metadata_by_cluster_name(
            cluster_name=arakoon_cluster_name)
        config = ArakoonClusterConfig(cluster_id=arakoon_cluster_name)
        master_node_ips = [node.ip for node in config.nodes]
        shrink = False
        if cluster_ip in master_node_ips:
            shrink = True
            master_node_ips.remove(cluster_ip)
        if len(master_node_ips) == 0:
            raise RuntimeError(
                'There should be at least one other master node')

        storagerouter = StorageRouterList.get_by_machine_id(unique_id)
        storagerouter.node_type = 'EXTRA'
        storagerouter.save()

        offline_node_ips = [node.ip for node in offline_nodes]
        if arakoon_metadata['internal'] is True and shrink is True:
            Toolbox.log(logger=NodeTypeController._logger,
                        messages='Leaving Arakoon {0} cluster'.format(
                            arakoon_cluster_name))
            arakoon_installer = ArakoonInstaller(
                cluster_name=arakoon_cluster_name)
            arakoon_installer.load()
            arakoon_installer.shrink_cluster(removal_ip=cluster_ip,
                                             offline_nodes=offline_node_ips)
            arakoon_installer.restart_cluster_after_shrinking()
        try:
            external_config = Configuration.get(
                '/ovs/framework/external_config')
            if external_config is None and shrink is True:
                Toolbox.log(logger=NodeTypeController._logger,
                            messages='Leaving Arakoon config cluster')
                arakoon_installer = ArakoonInstaller(cluster_name='config')
                arakoon_installer.load(ip=master_node_ips[0])
                arakoon_installer.shrink_cluster(
                    removal_ip=cluster_ip, offline_nodes=offline_node_ips)
                arakoon_installer.restart_cluster_after_shrinking()
        except Exception as ex:
            Toolbox.log(
                logger=NodeTypeController._logger,
                messages=['\nFailed to leave configuration cluster', ex],
                loglevel='exception')

        Toolbox.log(logger=NodeTypeController._logger,
                    messages='Update configurations')
        try:
            if unconfigure_memcached is True:
                endpoints = Configuration.get(
                    '/ovs/framework/memcache|endpoints')
                endpoint = '{0}:{1}'.format(cluster_ip, 11211)
                if endpoint in endpoints:
                    endpoints.remove(endpoint)
                Configuration.set('/ovs/framework/memcache|endpoints',
                                  endpoints)
            if unconfigure_rabbitmq is True:
                endpoints = Configuration.get(
                    '/ovs/framework/messagequeue|endpoints')
                endpoint = '{0}:{1}'.format(cluster_ip, 5672)
                if endpoint in endpoints:
                    endpoints.remove(endpoint)
                Configuration.set('/ovs/framework/messagequeue|endpoints',
                                  endpoints)
        except Exception as ex:
            Toolbox.log(logger=NodeTypeController._logger,
                        messages=['\nFailed to update configurations', ex],
                        loglevel='exception')

        if arakoon_metadata['internal'] is True:
            Toolbox.log(logger=NodeTypeController._logger,
                        messages='Restarting master node services')
            remaining_nodes = ip_client_map.keys()[:]
            if cluster_ip in remaining_nodes:
                remaining_nodes.remove(cluster_ip)

            PersistentFactory.store = None
            VolatileFactory.store = None

            for service in storagerouter.services:
                if service.name == 'arakoon-ovsdb':
                    service.delete()

        target_client = None
        if storagerouter in offline_nodes:
            if unconfigure_rabbitmq is True:
                Toolbox.log(
                    logger=NodeTypeController._logger,
                    messages='Removing/unconfiguring offline RabbitMQ node')
                client = ip_client_map[master_ip]
                try:
                    client.run([
                        'rabbitmqctl', 'forget_cluster_node',
                        'rabbit@{0}'.format(storagerouter.name)
                    ])
                except Exception as ex:
                    Toolbox.log(logger=NodeTypeController._logger,
                                messages=[
                                    '\nFailed to forget RabbitMQ cluster node',
                                    ex
                                ],
                                loglevel='exception')
        else:
            target_client = ip_client_map[cluster_ip]
            if unconfigure_rabbitmq is True:
                Toolbox.log(logger=NodeTypeController._logger,
                            messages='Removing/unconfiguring RabbitMQ')
                try:
                    if service_manager.has_service('rabbitmq-server',
                                                   client=target_client):
                        ServiceFactory.change_service_state(
                            target_client, 'rabbitmq-server', 'stop',
                            NodeTypeController._logger)
                        target_client.run(['rabbitmq-server', '-detached'])
                        time.sleep(5)
                        target_client.run(['rabbitmqctl', 'stop_app'])
                        time.sleep(5)
                        target_client.run(['rabbitmqctl', 'reset'])
                        time.sleep(5)
                        target_client.run(['rabbitmqctl', 'stop'])
                        time.sleep(5)
                        target_client.file_unlink(
                            "/var/lib/rabbitmq/.erlang.cookie")
                        ServiceFactory.change_service_state(
                            target_client, 'rabbitmq-server', 'stop',
                            NodeTypeController._logger)  # To be sure
                except Exception as ex:
                    Toolbox.log(logger=NodeTypeController._logger,
                                messages=[
                                    '\nFailed to remove/unconfigure RabbitMQ',
                                    ex
                                ],
                                loglevel='exception')

            Toolbox.log(logger=NodeTypeController._logger,
                        messages='Stopping services')
            services = ['memcached', 'rabbitmq-server']
            if unconfigure_rabbitmq is False:
                services.remove('rabbitmq-server')
            if unconfigure_memcached is False:
                services.remove('memcached')
            for service in services:
                if service_manager.has_service(service, client=target_client):
                    Toolbox.log(
                        logger=NodeTypeController._logger,
                        messages='Stopping service {0}'.format(service))
                    try:
                        ServiceFactory.change_service_state(
                            target_client, service, 'stop',
                            NodeTypeController._logger)
                    except Exception as ex:
                        Toolbox.log(
                            logger=NodeTypeController._logger,
                            messages=[
                                '\nFailed to stop service'.format(service), ex
                            ],
                            loglevel='exception')

            Toolbox.log(logger=NodeTypeController._logger,
                        messages='Removing services')
            services = [
                'scheduled-tasks', 'webapp-api', 'volumerouter-consumer'
            ]
            for service in services:
                if service_manager.has_service(service, client=target_client):
                    Toolbox.log(
                        logger=NodeTypeController._logger,
                        messages='Removing service {0}'.format(service))
                    try:
                        ServiceFactory.change_service_state(
                            target_client, service, 'stop',
                            NodeTypeController._logger)
                        service_manager.remove_service(service,
                                                       client=target_client)
                    except Exception as ex:
                        Toolbox.log(
                            logger=NodeTypeController._logger,
                            messages=[
                                '\nFailed to remove service'.format(service),
                                ex
                            ],
                            loglevel='exception')

            if service_manager.has_service('workers', client=target_client):
                service_manager.add_service(
                    name='workers',
                    client=target_client,
                    params={'WORKER_QUEUE': '{0}'.format(unique_id)})
        try:
            NodeTypeController._configure_amqp_to_volumedriver()
        except Exception as ex:
            Toolbox.log(
                logger=NodeTypeController._logger,
                messages=['\nFailed to configure AMQP to Storage Driver', ex],
                loglevel='exception')

        Toolbox.log(logger=NodeTypeController._logger,
                    messages='Restarting services')
        NodeTypeController.restart_framework_and_memcache_services(
            clients=ip_client_map,
            logger=NodeTypeController._logger,
            offline_node_ips=offline_node_ips)

        if Toolbox.run_hooks(component='nodetype',
                             sub_component='demote',
                             logger=NodeTypeController._logger,
                             cluster_ip=cluster_ip,
                             master_ip=master_ip,
                             offline_node_ips=offline_node_ips):
            Toolbox.log(logger=NodeTypeController._logger,
                        messages='Restarting services')
            NodeTypeController.restart_framework_and_memcache_services(
                clients=ip_client_map,
                logger=NodeTypeController._logger,
                offline_node_ips=offline_node_ips)

        if storagerouter not in offline_nodes:
            target_client = ip_client_map[cluster_ip]
            node_name, _ = target_client.get_hostname()
            if NodeTypeController.avahi_installed(
                    client=target_client,
                    logger=NodeTypeController._logger) is True:
                NodeTypeController.configure_avahi(
                    client=target_client,
                    node_name=node_name,
                    node_type='extra',
                    logger=NodeTypeController._logger)
        Configuration.set(
            '/ovs/framework/hosts/{0}/type'.format(storagerouter.machine_id),
            'EXTRA')

        if target_client is not None and target_client.file_exists(
                '/tmp/ovs_rollback'):
            target_client.file_write('/tmp/ovs_rollback', 'rollback')

        Toolbox.log(logger=NodeTypeController._logger,
                    messages='Demote complete',
                    title=True)
Esempio n. 51
0
 def _can_remove(self):
     """
     Can be removed
     """
     return len(Toolbox.fetch_hooks('license', '{0}.remove'.format(self.component))) == 1
Esempio n. 52
0
    def update_framework():
        """
        Update the framework
        :return: None
        """
        filemutex = file_mutex('system_update', wait=2)
        upgrade_file = '/etc/ready_for_upgrade'
        upgrade_ongoing_check_file = '/etc/upgrade_ongoing'
        ssh_clients = []
        try:
            filemutex.acquire()
            UpdateController._log_message('+++ Starting framework update +++')

            from ovs.dal.lists.storagerouterlist import StorageRouterList

            UpdateController._log_message('Generating SSH client connections for each storage router')
            upgrade_file = '/etc/ready_for_upgrade'
            upgrade_ongoing_check_file = '/etc/upgrade_ongoing'
            storage_routers = StorageRouterList.get_storagerouters()
            ssh_clients = []
            master_ips = []
            extra_ips = []
            for sr in storage_routers:
                ssh_clients.append(SSHClient(sr.ip, username='******'))
                if sr.node_type == 'MASTER':
                    master_ips.append(sr.ip)
                elif sr.node_type == 'EXTRA':
                    extra_ips.append(sr.ip)
            this_client = [client for client in ssh_clients if client.is_local is True][0]

            # Create locks
            UpdateController._log_message('Creating lock files', client_ip=this_client.ip)
            for client in ssh_clients:
                client.run('touch {0}'.format(upgrade_file))  # Prevents manual install or upgrade individual packages
                client.run('touch {0}'.format(upgrade_ongoing_check_file))  # Prevents clicking x times on 'Update' btn

            # Check requirements
            packages_to_update = set()
            all_services_to_restart = []
            for client in ssh_clients:
                for function in Toolbox.fetch_hooks('update', 'metadata'):
                    UpdateController._log_message('Executing function {0}'.format(function.__name__),
                                                  client_ip=client.ip)
                    output = function(client)
                    for key, value in output.iteritems():
                        if key != 'framework':
                            continue
                        for package_info in value:
                            packages_to_update.update(package_info['packages'])
                            all_services_to_restart += package_info['services']

            services_to_restart = []
            for service in all_services_to_restart:
                if service not in services_to_restart:
                    services_to_restart.append(service)  # Filter out duplicates maintaining the order of services (eg: watcher-framework before memcached)

            UpdateController._log_message('Services which will be restarted --> {0}'.format(', '.join(services_to_restart)))
            UpdateController._log_message('Packages which will be installed --> {0}'.format(', '.join(packages_to_update)))

            # Stop services
            if UpdateController._change_services_state(services=services_to_restart,
                                                       ssh_clients=ssh_clients,
                                                       action='stop') is False:
                UpdateController._log_message('Stopping all services on every node failed, cannot continue',
                                              client_ip=this_client.ip, severity='warning')
                UpdateController._remove_lock_files([upgrade_file, upgrade_ongoing_check_file], ssh_clients)

                # Start services again if a service could not be stopped
                UpdateController._log_message('Attempting to start the services again', client_ip=this_client.ip)
                UpdateController._change_services_state(services=services_to_restart,
                                                        ssh_clients=ssh_clients,
                                                        action='start')

                UpdateController._log_message('Failed to stop all required services, aborting update',
                                              client_ip=this_client.ip, severity='error')
                return

            # Update packages
            failed_clients = []
            for client in ssh_clients:
                PackageManager.update(client=client)
                try:
                    UpdateController._log_message('Installing latest packages', client.ip)
                    for package in packages_to_update:
                        UpdateController._log_message('Installing {0}'.format(package), client.ip)
                        PackageManager.install(package_name=package,
                                               client=client,
                                               force=True)
                        UpdateController._log_message('Installed {0}'.format(package), client.ip)
                    client.file_delete(upgrade_file)
                except subprocess.CalledProcessError as cpe:
                    UpdateController._log_message('Upgrade failed with error: {0}'.format(cpe.output), client.ip,
                                                  'error')
                    failed_clients.append(client)
                    break

            if failed_clients:
                UpdateController._remove_lock_files([upgrade_file, upgrade_ongoing_check_file], ssh_clients)
                UpdateController._log_message('Error occurred. Attempting to start all services again',
                                              client_ip=this_client.ip, severity='error')
                UpdateController._change_services_state(services=services_to_restart,
                                                        ssh_clients=ssh_clients,
                                                        action='start')
                UpdateController._log_message('Failed to upgrade following nodes:\n - {0}\nPlease check /var/log/ovs/lib.log on {1} for more information'.format('\n - '.join([client.ip for client in failed_clients]), this_client.ip),
                                              this_client.ip,
                                              'error')
                return

            # Migrate code
            for client in ssh_clients:
                try:
                    UpdateController._log_message('Started code migration', client.ip)
                    try:
                        with remote(client.ip, [Migrator]) as rem:
                            rem.Migrator.migrate(master_ips, extra_ips)
                    except EOFError as eof:
                        UpdateController._log_message('EOFError during code migration, retrying {0}'.format(eof), client.ip, 'warning')
                        with remote(client.ip, [Migrator]) as rem:
                            rem.Migrator.migrate(master_ips, extra_ips)
                    UpdateController._log_message('Finished code migration', client.ip)
                except Exception as ex:
                    UpdateController._remove_lock_files([upgrade_ongoing_check_file], ssh_clients)
                    UpdateController._log_message('Code migration failed with error: {0}'.format(ex), client.ip, 'error')
                    return

            # Start services
            UpdateController._log_message('Starting services', client_ip=this_client.ip)
            model_services = []
            if 'arakoon-ovsdb' in services_to_restart:
                model_services.append('arakoon-ovsdb')
                services_to_restart.remove('arakoon-ovsdb')
            if 'memcached' in services_to_restart:
                model_services.append('memcached')
                services_to_restart.remove('memcached')
            UpdateController._change_services_state(services=model_services,
                                                    ssh_clients=ssh_clients,
                                                    action='start')

            # Migrate model
            UpdateController._log_message('Started model migration', client_ip=this_client.ip)
            try:
                from ovs.dal.helpers import Migration
                with remote(ssh_clients[0].ip, [Migration]) as rem:
                    rem.Migration.migrate()
                UpdateController._log_message('Finished model migration', client_ip=this_client.ip)
            except Exception as ex:
                UpdateController._remove_lock_files([upgrade_ongoing_check_file], ssh_clients)
                UpdateController._log_message('An unexpected error occurred: {0}'.format(ex), client_ip=this_client.ip,
                                              severity='error')
                return

            # Post upgrade actions
            UpdateController._log_message('Executing post upgrade actions', client_ip=this_client.ip)
            for client in ssh_clients:
                with remote(client.ip, [Toolbox, SSHClient]) as rem:
                    for function in rem.Toolbox.fetch_hooks('update', 'postupgrade'):
                        UpdateController._log_message('Executing action {0}'.format(function.__name__),
                                                      client_ip=client.ip)
                        try:
                            function(rem.SSHClient(client.ip, username='******'))
                            UpdateController._log_message('Executing action {0} completed'.format(function.__name__),
                                                          client_ip=client.ip)
                        except Exception as ex:
                            UpdateController._log_message('Post upgrade action failed with error: {0}'.format(ex),
                                                          client.ip, 'error')

            # Start watcher and restart support-agent
            UpdateController._change_services_state(services=services_to_restart,
                                                    ssh_clients=ssh_clients,
                                                    action='start')
            UpdateController._change_services_state(services=['support-agent'],
                                                    ssh_clients=ssh_clients,
                                                    action='restart')

            UpdateController._remove_lock_files([upgrade_ongoing_check_file], ssh_clients)
            UpdateController._log_message('+++ Finished updating +++')
        except RuntimeError as rte:
            UpdateController._log_message('Error during framework update: {0}'.format(rte), severity='error')
            UpdateController._remove_lock_files([upgrade_file, upgrade_ongoing_check_file], ssh_clients)
        except NoLockAvailableException:
            UpdateController._log_message('Another framework update is currently in progress!')
        except Exception as ex:
            UpdateController._log_message('Error during framework update: {0}'.format(ex), severity='error')
            UpdateController._remove_lock_files([upgrade_file, upgrade_ongoing_check_file], ssh_clients)
        finally:
            filemutex.release()
Esempio n. 53
0
    def remove_node(node_ip, silent=None):
        """
        Remove the node with specified IP from the cluster
        :param node_ip: IP of the node to remove
        :type node_ip: str
        :param silent: If silent == '--force-yes' no question will be asked to confirm the removal
        :type silent: str
        :return: None
        """
        from ovs.dal.lists.storagerouterlist import StorageRouterList
        from ovs.lib.storagedriver import StorageDriverController
        from ovs.lib.vpool import VPoolController

        Toolbox.log(logger=NodeRemovalController._logger,
                    messages='Remove node',
                    boxed=True)
        Toolbox.log(
            logger=NodeRemovalController._logger,
            messages=
            'WARNING: Some of these steps may take a very long time, please check the logs for more information\n\n'
        )
        service_manager = ServiceFactory.get_manager()

        ###############
        # VALIDATIONS #
        ###############
        try:
            node_ip = node_ip.strip()
            if not isinstance(node_ip, str):
                raise ValueError('Node IP must be a string')
            if not re.match(SSHClient.IP_REGEX, node_ip):
                raise ValueError('Invalid IP {0} specified'.format(node_ip))

            storage_router_all = sorted(StorageRouterList.get_storagerouters(),
                                        key=lambda k: k.name)
            storage_router_masters = StorageRouterList.get_masters()
            storage_router_all_ips = set(
                [storage_router.ip for storage_router in storage_router_all])
            storage_router_master_ips = set([
                storage_router.ip for storage_router in storage_router_masters
            ])
            storage_router_to_remove = StorageRouterList.get_by_ip(node_ip)
            offline_reasons = {}
            if node_ip not in storage_router_all_ips:
                raise ValueError(
                    'Unknown IP specified\nKnown in model:\n - {0}\nSpecified for removal:\n - {1}'
                    .format('\n - '.join(storage_router_all_ips), node_ip))

            if len(storage_router_all_ips) == 1:
                raise RuntimeError("Removing the only node is not possible")

            if node_ip in storage_router_master_ips and len(
                    storage_router_master_ips) == 1:
                raise RuntimeError(
                    "Removing the only master node is not possible")

            if System.get_my_storagerouter() == storage_router_to_remove:
                raise RuntimeError(
                    'The node to be removed cannot be identical to the node on which the removal is initiated'
                )

            Toolbox.log(
                logger=NodeRemovalController._logger,
                messages='Creating SSH connections to remaining master nodes')
            master_ip = None
            ip_client_map = {}
            storage_routers_offline = []
            storage_router_to_remove_online = True
            for storage_router in storage_router_all:
                try:
                    client = SSHClient(storage_router,
                                       username='******',
                                       timeout=10)
                except (UnableToConnectException, NotAuthenticatedException,
                        TimeOutException) as ex:
                    if isinstance(ex, UnableToConnectException):
                        msg = 'Unable to connect'
                    elif isinstance(ex, NotAuthenticatedException):
                        msg = 'Could not authenticate'
                    elif isinstance(ex, TimeOutException):
                        msg = 'Connection timed out'
                    Toolbox.log(
                        logger=NodeRemovalController._logger,
                        messages='  * Node with IP {0:<15}- {1}'.format(
                            storage_router.ip, msg))
                    offline_reasons[storage_router.ip] = msg
                    storage_routers_offline.append(storage_router)
                    if storage_router == storage_router_to_remove:
                        storage_router_to_remove_online = False
                    continue

                Toolbox.log(
                    logger=NodeRemovalController._logger,
                    messages='  * Node with IP {0:<15}- Successfully connected'
                    .format(storage_router.ip))
                ip_client_map[storage_router.ip] = client
                if storage_router != storage_router_to_remove and storage_router.node_type == 'MASTER':
                    master_ip = storage_router.ip

            if len(ip_client_map) == 0 or master_ip is None:
                raise RuntimeError(
                    'Could not connect to any master node in the cluster')

            storage_router_to_remove.invalidate_dynamics('vdisks_guids')
            if len(
                    storage_router_to_remove.vdisks_guids
            ) > 0:  # vDisks are supposed to be moved away manually before removing a node
                raise RuntimeError(
                    "Still vDisks attached to Storage Router {0}".format(
                        storage_router_to_remove.name))

            internal_memcached = Toolbox.is_service_internally_managed(
                service='memcached')
            internal_rabbit_mq = Toolbox.is_service_internally_managed(
                service='rabbitmq')
            memcached_endpoints = Configuration.get(
                key='/ovs/framework/memcache|endpoints')
            rabbit_mq_endpoints = Configuration.get(
                key='/ovs/framework/messagequeue|endpoints')
            copy_memcached_endpoints = list(memcached_endpoints)
            copy_rabbit_mq_endpoints = list(rabbit_mq_endpoints)
            for endpoint in memcached_endpoints:
                if endpoint.startswith(storage_router_to_remove.ip):
                    copy_memcached_endpoints.remove(endpoint)
            for endpoint in rabbit_mq_endpoints:
                if endpoint.startswith(storage_router_to_remove.ip):
                    copy_rabbit_mq_endpoints.remove(endpoint)
            if len(copy_memcached_endpoints
                   ) == 0 and internal_memcached is True:
                raise RuntimeError(
                    'Removal of provided nodes will result in a complete removal of the memcached service'
                )
            if len(copy_rabbit_mq_endpoints
                   ) == 0 and internal_rabbit_mq is True:
                raise RuntimeError(
                    'Removal of provided nodes will result in a complete removal of the messagequeue service'
                )

            Toolbox.run_hooks(component='noderemoval',
                              sub_component='validate_removal',
                              logger=NodeRemovalController._logger,
                              cluster_ip=storage_router_to_remove.ip)
        except KeyboardInterrupt:
            Toolbox.log(logger=NodeRemovalController._logger, messages='\n')
            Toolbox.log(
                logger=NodeRemovalController._logger,
                messages=
                'Removal has been aborted during the validation step. No changes have been applied.',
                boxed=True,
                loglevel='warning')
            sys.exit(1)
        except Exception as exception:
            Toolbox.log(logger=NodeRemovalController._logger,
                        messages=[str(exception)],
                        boxed=True,
                        loglevel='exception')
            sys.exit(1)

        #################
        # CONFIRMATIONS #
        #################
        try:
            interactive = silent != '--force-yes'
            remove_asd_manager = not interactive  # Remove ASD manager if non-interactive else ask
            if interactive is True:
                if len(storage_routers_offline) > 0:
                    Toolbox.log(
                        logger=NodeRemovalController._logger,
                        messages=
                        'Certain nodes appear to be offline. These will not fully removed and will cause issues if they are not really offline.'
                    )
                    Toolbox.log(
                        logger=NodeRemovalController._logger,
                        messages='Offline nodes: {0}'.format(''.join(
                            ('\n  * {0:<15}- {1}.'.format(ip, message)
                             for ip, message in offline_reasons.iteritems()))))
                    valid_node_info = Interactive.ask_yesno(
                        message=
                        'Continue the removal with these being presumably offline?',
                        default_value=False)
                    if valid_node_info is False:
                        Toolbox.log(
                            logger=NodeRemovalController._logger,
                            messages=
                            'Please validate the state of the nodes before removing.',
                            title=True)
                        sys.exit(1)
                proceed = Interactive.ask_yesno(
                    message='Are you sure you want to remove node {0}?'.format(
                        storage_router_to_remove.name),
                    default_value=False)
                if proceed is False:
                    Toolbox.log(logger=NodeRemovalController._logger,
                                messages='Abort removal',
                                title=True)
                    sys.exit(1)

                remove_asd_manager = True
                if storage_router_to_remove_online is True:
                    client = SSHClient(endpoint=storage_router_to_remove,
                                       username='******')
                    if service_manager.has_service(name='asd-manager',
                                                   client=client):
                        remove_asd_manager = Interactive.ask_yesno(
                            message=
                            'Do you also want to remove the ASD manager and related ASDs?',
                            default_value=False)

                if remove_asd_manager is True or storage_router_to_remove_online is False:
                    for fct in Toolbox.fetch_hooks('noderemoval',
                                                   'validate_asd_removal'):
                        validation_output = fct(storage_router_to_remove.ip)
                        if validation_output['confirm'] is True:
                            if Interactive.ask_yesno(
                                    message=validation_output['question'],
                                    default_value=False) is False:
                                remove_asd_manager = False
                                break
        except KeyboardInterrupt:
            Toolbox.log(logger=NodeRemovalController._logger, messages='\n')
            Toolbox.log(
                logger=NodeRemovalController._logger,
                messages=
                'Removal has been aborted during the confirmation step. No changes have been applied.',
                boxed=True,
                loglevel='warning')
            sys.exit(1)
        except Exception as exception:
            Toolbox.log(logger=NodeRemovalController._logger,
                        messages=[str(exception)],
                        boxed=True,
                        loglevel='exception')
            sys.exit(1)
        ###########
        # REMOVAL #
        ###########
        try:
            Toolbox.log(logger=NodeRemovalController._logger,
                        messages='Starting removal of node {0} - {1}'.format(
                            storage_router_to_remove.name,
                            storage_router_to_remove.ip))
            if storage_router_to_remove_online is False:
                Toolbox.log(
                    logger=NodeRemovalController._logger,
                    messages=
                    '  Marking all Storage Drivers served by Storage Router {0} as offline'
                    .format(storage_router_to_remove.ip))
                StorageDriverController.mark_offline(
                    storagerouter_guid=storage_router_to_remove.guid)

            # Remove vPools
            Toolbox.log(logger=NodeRemovalController._logger,
                        messages='  Removing vPools from node'.format(
                            storage_router_to_remove.ip))
            storage_routers_offline_guids = [
                sr.guid for sr in storage_routers_offline
                if sr.guid != storage_router_to_remove.guid
            ]
            for storage_driver in storage_router_to_remove.storagedrivers:
                Toolbox.log(logger=NodeRemovalController._logger,
                            messages='    Removing vPool {0} from node'.format(
                                storage_driver.vpool.name))
                VPoolController.shrink_vpool(
                    storagedriver_guid=storage_driver.guid,
                    offline_storage_router_guids=storage_routers_offline_guids)

            # Demote if MASTER
            if storage_router_to_remove.node_type == 'MASTER':
                NodeTypeController.demote_node(
                    cluster_ip=storage_router_to_remove.ip,
                    master_ip=master_ip,
                    ip_client_map=ip_client_map,
                    unique_id=storage_router_to_remove.machine_id,
                    unconfigure_memcached=internal_memcached,
                    unconfigure_rabbitmq=internal_rabbit_mq,
                    offline_nodes=storage_routers_offline)

            # Stop / remove services
            Toolbox.log(logger=NodeRemovalController._logger,
                        messages='Stopping and removing services')
            if storage_router_to_remove_online is True:
                client = SSHClient(endpoint=storage_router_to_remove,
                                   username='******')
                NodeRemovalController.remove_services(
                    client=client,
                    node_type=storage_router_to_remove.node_type.lower(),
                    logger=NodeRemovalController._logger)
                service = 'watcher-config'
                if service_manager.has_service(service, client=client):
                    Toolbox.log(
                        logger=NodeRemovalController._logger,
                        messages='Removing service {0}'.format(service))
                    service_manager.stop_service(service, client=client)
                    service_manager.remove_service(service, client=client)

            Toolbox.run_hooks(component='noderemoval',
                              sub_component='remove',
                              logger=NodeRemovalController._logger,
                              cluster_ip=storage_router_to_remove.ip,
                              complete_removal=remove_asd_manager)

            # Clean up model
            Toolbox.log(logger=NodeRemovalController._logger,
                        messages='Removing node from model')
            for service in storage_router_to_remove.services:
                service.delete()
            for disk in storage_router_to_remove.disks:
                for partition in disk.partitions:
                    partition.delete()
                disk.delete()
            for j_domain in storage_router_to_remove.domains:
                j_domain.delete()
            Configuration.delete('/ovs/framework/hosts/{0}'.format(
                storage_router_to_remove.machine_id))

            NodeTypeController.restart_framework_and_memcache_services(
                clients=ip_client_map,
                offline_node_ips=[node.ip for node in storage_routers_offline],
                logger=NodeRemovalController._logger)

            if storage_router_to_remove_online is True:
                client = SSHClient(endpoint=storage_router_to_remove,
                                   username='******')
                client.file_delete(filenames=[CACC_LOCATION])
                client.file_delete(filenames=[CONFIG_STORE_LOCATION])
            storage_router_to_remove.delete()
            Toolbox.log(logger=NodeRemovalController._logger,
                        messages='Successfully removed node\n')
        except Exception as exception:
            Toolbox.log(logger=NodeRemovalController._logger, messages='\n')
            Toolbox.log(
                logger=NodeRemovalController._logger,
                messages=['An unexpected error occurred:',
                          str(exception)],
                boxed=True,
                loglevel='exception')
            sys.exit(1)
        except KeyboardInterrupt:
            Toolbox.log(logger=NodeRemovalController._logger, messages='\n')
            Toolbox.log(
                logger=NodeRemovalController._logger,
                messages=
                'This setup was aborted. Open vStorage may be in an inconsistent state, make sure to validate the installation.',
                boxed=True,
                loglevel='error')
            sys.exit(1)

        if remove_asd_manager is True and storage_router_to_remove_online is True:
            Toolbox.log(logger=NodeRemovalController._logger,
                        messages='\nRemoving ASD Manager')
            with remote(storage_router_to_remove.ip, [os]) as rem:
                rem.os.system('asd-manager remove --force-yes')
        Toolbox.log(logger=NodeRemovalController._logger,
                    messages='Remove nodes finished',
                    title=True)
Esempio n. 54
0
    def update_framework():
        """
        Update the framework
        :return: None
        """
        file_mutex = FileMutex('system_update', wait=2)
        upgrade_file = '/etc/ready_for_upgrade'
        upgrade_ongoing_check_file = '/etc/upgrade_ongoing'
        ssh_clients = []
        try:
            file_mutex.acquire()
            UpdateController._log_message('+++ Starting framework update +++')

            from ovs.dal.lists.storagerouterlist import StorageRouterList

            UpdateController._log_message(
                'Generating SSH client connections for each storage router')
            upgrade_file = '/etc/ready_for_upgrade'
            upgrade_ongoing_check_file = '/etc/upgrade_ongoing'
            storage_routers = StorageRouterList.get_storagerouters()
            ssh_clients = []
            master_ips = []
            extra_ips = []
            for sr in storage_routers:
                ssh_clients.append(SSHClient(sr.ip, username='******'))
                if sr.node_type == 'MASTER':
                    master_ips.append(sr.ip)
                elif sr.node_type == 'EXTRA':
                    extra_ips.append(sr.ip)
            this_client = [
                client for client in ssh_clients if client.is_local is True
            ][0]

            # Create locks
            UpdateController._log_message('Creating lock files',
                                          client_ip=this_client.ip)
            for client in ssh_clients:
                client.run(
                    'touch {0}'.format(upgrade_file)
                )  # Prevents manual install or upgrade individual packages
                client.run('touch {0}'.format(upgrade_ongoing_check_file)
                           )  # Prevents clicking x times on 'Update' btn

            # Check requirements
            packages_to_update = set()
            all_services_to_restart = []
            for client in ssh_clients:
                for function in Toolbox.fetch_hooks('update', 'metadata'):
                    UpdateController._log_message(
                        'Executing function {0}'.format(function.__name__),
                        client_ip=client.ip)
                    output = function(client)
                    for key, value in output.iteritems():
                        if key != 'framework':
                            continue
                        for package_info in value:
                            packages_to_update.update(package_info['packages'])
                            all_services_to_restart += package_info['services']

            services_to_restart = []
            for service in all_services_to_restart:
                if service not in services_to_restart:
                    services_to_restart.append(
                        service
                    )  # Filter out duplicates maintaining the order of services (eg: watcher-framework before memcached)

            UpdateController._log_message(
                'Services which will be restarted --> {0}'.format(
                    ', '.join(services_to_restart)))
            UpdateController._log_message(
                'Packages which will be installed --> {0}'.format(
                    ', '.join(packages_to_update)))

            # Stop services
            if UpdateController._change_services_state(
                    services=services_to_restart,
                    ssh_clients=ssh_clients,
                    action='stop') is False:
                UpdateController._log_message(
                    'Stopping all services on every node failed, cannot continue',
                    client_ip=this_client.ip,
                    severity='warning')
                UpdateController._remove_lock_files(
                    [upgrade_file, upgrade_ongoing_check_file], ssh_clients)

                # Start services again if a service could not be stopped
                UpdateController._log_message(
                    'Attempting to start the services again',
                    client_ip=this_client.ip)
                UpdateController._change_services_state(
                    services=services_to_restart,
                    ssh_clients=ssh_clients,
                    action='start')

                UpdateController._log_message(
                    'Failed to stop all required services, aborting update',
                    client_ip=this_client.ip,
                    severity='error')
                return

            # Update packages
            failed_clients = []
            for client in ssh_clients:
                PackageManager.update(client=client)
                try:
                    UpdateController._log_message('Installing latest packages',
                                                  client.ip)
                    for package in packages_to_update:
                        UpdateController._log_message(
                            'Installing {0}'.format(package), client.ip)
                        PackageManager.install(package_name=package,
                                               client=client,
                                               force=True)
                        UpdateController._log_message(
                            'Installed {0}'.format(package), client.ip)
                    client.file_delete(upgrade_file)
                except subprocess.CalledProcessError as cpe:
                    UpdateController._log_message(
                        'Upgrade failed with error: {0}'.format(cpe.output),
                        client.ip, 'error')
                    failed_clients.append(client)
                    break

            if failed_clients:
                UpdateController._remove_lock_files(
                    [upgrade_file, upgrade_ongoing_check_file], ssh_clients)
                UpdateController._log_message(
                    'Error occurred. Attempting to start all services again',
                    client_ip=this_client.ip,
                    severity='error')
                UpdateController._change_services_state(
                    services=services_to_restart,
                    ssh_clients=ssh_clients,
                    action='start')
                UpdateController._log_message(
                    'Failed to upgrade following nodes:\n - {0}\nPlease check /var/log/ovs/lib.log on {1} for more information'
                    .format('\n - '.join([
                        client.ip for client in failed_clients
                    ])), this_client.ip, 'error')
                return

            # Migrate code
            for client in ssh_clients:
                try:
                    UpdateController._log_message('Started code migration',
                                                  client.ip)
                    try:
                        with Remote(client.ip, [Migrator]) as remote:
                            remote.Migrator.migrate(master_ips, extra_ips)
                    except EOFError as eof:
                        UpdateController._log_message(
                            'EOFError during code migration, retrying {0}'.
                            format(eof), client.ip, 'warning')
                        with Remote(client.ip, [Migrator]) as remote:
                            remote.Migrator.migrate(master_ips, extra_ips)
                    UpdateController._log_message('Finished code migration',
                                                  client.ip)
                except Exception as ex:
                    UpdateController._remove_lock_files(
                        [upgrade_ongoing_check_file], ssh_clients)
                    UpdateController._log_message(
                        'Code migration failed with error: {0}'.format(ex),
                        client.ip, 'error')
                    return

            # Start services
            UpdateController._log_message('Starting services',
                                          client_ip=this_client.ip)
            model_services = []
            if 'arakoon-ovsdb' in services_to_restart:
                model_services.append('arakoon-ovsdb')
                services_to_restart.remove('arakoon-ovsdb')
            if 'memcached' in services_to_restart:
                model_services.append('memcached')
                services_to_restart.remove('memcached')
            UpdateController._change_services_state(services=model_services,
                                                    ssh_clients=ssh_clients,
                                                    action='start')

            # Migrate model
            UpdateController._log_message('Started model migration',
                                          client_ip=this_client.ip)
            try:
                from ovs.dal.helpers import Migration
                Migration.migrate()
                UpdateController._log_message('Finished model migration',
                                              client_ip=this_client.ip)
            except Exception as ex:
                UpdateController._remove_lock_files(
                    [upgrade_ongoing_check_file], ssh_clients)
                UpdateController._log_message(
                    'An unexpected error occurred: {0}'.format(ex),
                    client_ip=this_client.ip,
                    severity='error')
                return

            # Post upgrade actions
            UpdateController._log_message('Executing post upgrade actions',
                                          client_ip=this_client.ip)
            for client in ssh_clients:
                with Remote(client.ip, [Toolbox, SSHClient]) as remote:
                    for function in remote.Toolbox.fetch_hooks(
                            'update', 'postupgrade'):
                        UpdateController._log_message(
                            'Executing action {0}'.format(function.__name__),
                            client_ip=client.ip)
                        try:
                            function(
                                remote.SSHClient(client.ip, username='******'))
                            UpdateController._log_message(
                                'Executing action {0} completed'.format(
                                    function.__name__),
                                client_ip=client.ip)
                        except Exception as ex:
                            UpdateController._log_message(
                                'Post upgrade action failed with error: {0}'.
                                format(ex), client.ip, 'error')

            # Start watcher and restart support-agent
            UpdateController._change_services_state(
                services=services_to_restart,
                ssh_clients=ssh_clients,
                action='start')
            UpdateController._change_services_state(services=['support-agent'],
                                                    ssh_clients=ssh_clients,
                                                    action='restart')

            UpdateController._remove_lock_files([upgrade_ongoing_check_file],
                                                ssh_clients)
            UpdateController._log_message('+++ Finished updating +++')
        except RuntimeError as rte:
            if 'Could not acquire lock' in rte.message:
                UpdateController._log_message(
                    'Another framework update is currently in progress!')
            else:
                UpdateController._log_message(
                    'Error during framework update: {0}'.format(rte),
                    severity='error')
                UpdateController._remove_lock_files(
                    [upgrade_file, upgrade_ongoing_check_file], ssh_clients)
        except Exception as ex:
            UpdateController._log_message(
                'Error during framework update: {0}'.format(ex),
                severity='error')
            UpdateController._remove_lock_files(
                [upgrade_file, upgrade_ongoing_check_file], ssh_clients)
        finally:
            file_mutex.release()
Esempio n. 55
0
    def check_if_proxies_work(result_handler):
        """
        Checks if all Alba Proxies work on a local machine, it creates a namespace and tries to put and object
        :param result_handler: logging object
        :type result_handler: ovs.extensions.healthcheck.result.HCResults
        :return: None
        :rtype: NoneType
        """
        namespace_params = {
            'bucket_count': (list, None),
            'logical': (int, None),
            'storage': (int, None),
            'storage_per_osd': (list, None)
        }

        result_handler.info('Checking the ALBA proxies.', add_to_result=False)

        amount_of_presets_not_working = []
        # ignore possible subprocess output
        fnull = open(os.devnull, 'w')
        # try put/get/verify on all available proxies on the local node
        local_proxies = ServiceHelper.get_local_proxy_services()
        if len(local_proxies) == 0:
            result_handler.info('Found no proxies.', add_to_result=False)
            return amount_of_presets_not_working
        for service in local_proxies:
            try:
                result_handler.info('Checking ALBA proxy {0}.'.format(
                    service.name),
                                    add_to_result=False)
                ip = service.alba_proxy.storagedriver.storage_ip
                # Encapsulating try to determine test output
                try:
                    # Determine what to what backend the proxy is connected
                    proxy_client_cfg = AlbaCLI.run(command='proxy-client-cfg',
                                                   named_params={
                                                       'host': ip,
                                                       'port': service.ports[0]
                                                   })
                except AlbaException:
                    result_handler.failure(
                        'Fetching proxy info has failed. Please verify if {0}:{1} is the correct address for proxy {2}.'
                        .format(ip, service.ports[0], service.name))
                    continue
                # Fetch arakoon information
                abm_name = proxy_client_cfg.get('cluster_id')
                # Check if proxy config is correctly setup
                if abm_name is None:
                    raise ConfigNotMatchedException(
                        'Proxy config for proxy {0} does not have the correct format on node {1} with port {2}.'
                        .format(service.name, ip, service.ports[0]))
                abm_config = Configuration.get_configuration_path(
                    '/ovs/vpools/{0}/proxies/{1}/config/abm'.format(
                        service.alba_proxy.storagedriver.vpool.guid,
                        service.alba_proxy.guid))

                # Determine presets / backend
                try:
                    presets = AlbaCLI.run(command='list-presets',
                                          config=abm_config)
                except AlbaException:
                    result_handler.failure(
                        'Listing the presets has failed. Please check the arakoon config path. We used {0}'
                        .format(abm_config))
                    continue

                for preset in presets:
                    # If preset is not in use, test will fail so add a skip
                    if preset['in_use'] is False:
                        result_handler.skip(
                            'Preset {0} is not in use and will not be checked'.
                            format(preset['name']))
                        continue
                    preset_name = preset['name']
                    # Encapsulation try for cleanup
                    try:
                        # Generate new namespace name using the preset
                        namespace_key_prefix = 'ovs-healthcheck-ns-{0}-{1}'.format(
                            preset_name, AlbaHealthCheck.LOCAL_ID)
                        namespace_key = '{0}_{1}'.format(
                            namespace_key_prefix, uuid.uuid4())
                        object_key = 'ovs-healthcheck-obj-{0}'.format(
                            str(uuid.uuid4()))
                        # Create namespace
                        AlbaCLI.run(command='proxy-create-namespace',
                                    named_params={
                                        'host': ip,
                                        'port': service.ports[0]
                                    },
                                    extra_params=[namespace_key, preset_name])
                        # Wait until fully created
                        namespace_start_time = time.time()
                        for index in xrange(2):
                            # Running twice because the first one could give a false positive as the osds will alert the nsm
                            # and the nsm would respond with got messages but these were not the ones we are after
                            AlbaCLI.run(command='deliver-messages',
                                        config=abm_config)
                        while True:
                            if time.time(
                            ) - namespace_start_time > AlbaHealthCheck.NAMESPACE_TIMEOUT:
                                raise RuntimeError(
                                    'Creation namespace has timed out after {0}s'
                                    .format(time.time() -
                                            namespace_start_time))
                            list_ns_osds_output = AlbaCLI.run(
                                command='list-ns-osds',
                                config=abm_config,
                                extra_params=[namespace_key])
                            # Example output: [[0, [u'Active']], [3, [u'Active']]]
                            namespace_ready = True
                            for osd_info in list_ns_osds_output:  # If there are no osd_info records, uploading will fail so covered by HC
                                osd_state = osd_info[1][0]
                                if osd_state != 'Active':
                                    namespace_ready = False
                            if namespace_ready is True:
                                break
                        result_handler.success(
                            'Namespace successfully created on proxy {0} with preset {1}!'
                            .format(service.name, preset_name))
                        namespace_info = AlbaCLI.run(
                            command='show-namespace',
                            config=abm_config,
                            extra_params=[namespace_key])
                        Toolbox.verify_required_params(
                            required_params=namespace_params,
                            actual_params=namespace_info)
                        result_handler.success(
                            'Namespace successfully fetched on proxy {0} with preset {1}!'
                            .format(service.name, preset_name))

                        # Put test object to given dir
                        with open(AlbaHealthCheck.TEMP_FILE_LOC,
                                  'wb') as output_file:
                            output_file.write(
                                os.urandom(AlbaHealthCheck.TEMP_FILE_SIZE))
                        AlbaCLI.run(command='proxy-upload-object',
                                    named_params={
                                        'host': ip,
                                        'port': service.ports[0]
                                    },
                                    extra_params=[
                                        namespace_key,
                                        AlbaHealthCheck.TEMP_FILE_LOC,
                                        object_key
                                    ])
                        result_handler.success(
                            'Successfully uploaded the object to namespace {0}'
                            .format(namespace_key))
                        # download object
                        AlbaCLI.run(command='proxy-download-object',
                                    named_params={
                                        'host': ip,
                                        'port': service.ports[0]
                                    },
                                    extra_params=[
                                        namespace_key, object_key,
                                        AlbaHealthCheck.TEMP_FILE_FETCHED_LOC
                                    ])
                        result_handler.success(
                            'Successfully downloaded the object to namespace {0}'
                            .format(namespace_key))
                        # check if files exists - issue #57
                        if not (os.path.isfile(
                                AlbaHealthCheck.TEMP_FILE_FETCHED_LOC) and
                                os.path.isfile(AlbaHealthCheck.TEMP_FILE_LOC)):
                            # creation of object failed
                            raise ObjectNotFoundException(
                                ValueError('Creation of object has failed'))
                        hash_original = hashlib.md5(
                            open(AlbaHealthCheck.TEMP_FILE_LOC,
                                 'rb').read()).hexdigest()
                        hash_fetched = hashlib.md5(
                            open(AlbaHealthCheck.TEMP_FILE_FETCHED_LOC,
                                 'rb').read()).hexdigest()

                        if hash_original == hash_fetched:
                            result_handler.success(
                                'Fetched object {0} from namespace {1} on proxy {2} with preset {3} matches the created object!'
                                .format(object_key, namespace_key,
                                        service.name, preset_name))
                        else:
                            result_handler.failure(
                                'Fetched object {0} from namespace {1} on proxy {2} with preset {3} does not match the created object!'
                                .format(object_key, namespace_key,
                                        service.name, preset_name))

                    except ObjectNotFoundException as ex:
                        amount_of_presets_not_working.append(preset_name)
                        result_handler.failure(
                            'Failed to put object on namespace {0} failed on proxy {1}with preset {2} With error {3}'
                            .format(namespace_key, service.name, preset_name,
                                    ex))
                    except AlbaException as ex:
                        if ex.alba_command == 'proxy-create-namespace':
                            result_handler.failure(
                                'Create namespace has failed with {0} on namespace {1} with proxy {2} with preset {3}'
                                .format(str(ex), namespace_key, service.name,
                                        preset_name))
                        elif ex.alba_command == 'show-namespace':
                            result_handler.failure(
                                'Show namespace has failed with {0} on namespace {1} with proxy {2} with preset {3}'
                                .format(str(ex), namespace_key, service.name,
                                        preset_name))
                        elif ex.alba_command == 'proxy-upload-object':
                            result_handler.failure(
                                'Uploading the object has failed with {0} on namespace {1} with proxy {2} with preset {3}'
                                .format(str(ex), namespace_key, service.name,
                                        preset_name))
                        elif ex.alba_command == 'proxy-download-object':
                            result_handler.failure(
                                'Downloading the object has failed with {0} on namespace {1} with proxy {2} with preset {3}'
                                .format(str(ex), namespace_key, service.name,
                                        preset_name))
                    finally:
                        # Delete the created namespace and preset
                        subprocess.call(
                            ['rm', str(AlbaHealthCheck.TEMP_FILE_LOC)],
                            stdout=fnull,
                            stderr=subprocess.STDOUT)
                        subprocess.call(
                            ['rm',
                             str(AlbaHealthCheck.TEMP_FILE_FETCHED_LOC)],
                            stdout=fnull,
                            stderr=subprocess.STDOUT)
                        namespaces = AlbaCLI.run(command='list-namespaces',
                                                 config=abm_config)
                        namespaces_to_remove = []
                        proxy_named_params = {
                            'host': ip,
                            'port': service.ports[0]
                        }
                        for namespace in namespaces:
                            if namespace['name'].startswith(
                                    namespace_key_prefix):
                                namespaces_to_remove.append(namespace['name'])
                        for namespace_name in namespaces_to_remove:
                            if namespace_name == namespace_key:
                                result_handler.info(
                                    'Deleting namespace {0}.'.format(
                                        namespace_name))
                            else:
                                result_handler.warning(
                                    'Deleting namespace {0} which was leftover from a previous run.'
                                    .format(namespace_name))

                            AlbaCLI.run(command='proxy-delete-namespace',
                                        named_params=proxy_named_params,
                                        extra_params=[namespace_name])

                            namespace_delete_start = time.time()
                            while True:
                                try:
                                    AlbaCLI.run(
                                        command='show-namespace',
                                        config=abm_config,
                                        extra_params=[namespace_name]
                                    )  # Will fail if the namespace does not exist
                                except AlbaException:
                                    result_handler.success(
                                        'Namespace {0} successfully removed.'.
                                        format(namespace_name))
                                    break
                                if time.time(
                                ) - namespace_delete_start > AlbaHealthCheck.NAMESPACE_TIMEOUT:
                                    raise RuntimeError(
                                        'Delete namespace has timed out after {0}s'
                                        .format(time.time() -
                                                namespace_start_time))

                            # be tidy, and make the proxy forget the namespace
                            try:
                                AlbaCLI.run(
                                    command='proxy-statistics',
                                    named_params=proxy_named_params,
                                    extra_params=['--forget', namespace_name])
                            except:
                                result_handler.warning(
                                    'Failed to make proxy forget namespace {0}.'
                                    .format(namespace_name))

            except subprocess.CalledProcessError as ex:
                # this should stay for the deletion of the remaining files
                amount_of_presets_not_working.append(service.name)
                result_handler.failure(
                    'Proxy {0} has some problems. Got {1} as error'.format(
                        service.name, ex))

            except ConfigNotMatchedException as ex:
                amount_of_presets_not_working.append(service.name)
                result_handler.failure(
                    'Proxy {0} has some problems. Got {1} as error'.format(
                        service.name, ex))
Esempio n. 56
0
    def promote_node(cluster_ip, master_ip, ip_client_map, unique_id,
                     configure_memcached, configure_rabbitmq):
        """
        Promotes a given node
        """
        from ovs.dal.lists.storagerouterlist import StorageRouterList
        from ovs.dal.lists.servicetypelist import ServiceTypeList
        from ovs.dal.lists.servicelist import ServiceList
        from ovs.dal.hybrids.service import Service

        Toolbox.log(logger=NodeTypeController._logger,
                    messages='Promoting node',
                    title=True)
        service_manager = ServiceFactory.get_manager()
        if configure_memcached is True:
            if NodeTypeController._validate_local_memcache_servers(
                    ip_client_map) is False:
                raise RuntimeError(
                    'Not all memcache nodes can be reached which is required for promoting a node.'
                )

        target_client = ip_client_map[cluster_ip]
        machine_id = System.get_my_machine_id(target_client)
        node_name, _ = target_client.get_hostname()
        master_client = ip_client_map[master_ip]

        storagerouter = StorageRouterList.get_by_machine_id(unique_id)
        storagerouter.node_type = 'MASTER'
        storagerouter.save()

        external_config = Configuration.get('/ovs/framework/external_config')
        if external_config is None:
            Toolbox.log(logger=NodeTypeController._logger,
                        messages='Joining Arakoon configuration cluster')
            arakoon_installer = ArakoonInstaller(cluster_name='config')
            arakoon_installer.load(ip=master_ip)
            arakoon_installer.extend_cluster(
                new_ip=cluster_ip,
                base_dir=Configuration.get('/ovs/framework/paths|ovsdb'))
            arakoon_installer.restart_cluster_after_extending(
                new_ip=cluster_ip)
            service_manager.register_service(
                node_name=machine_id,
                service_metadata=arakoon_installer.service_metadata[cluster_ip]
            )

        # Find other (arakoon) master nodes
        arakoon_cluster_name = str(
            Configuration.get('/ovs/framework/arakoon_clusters|ovsdb'))
        arakoon_metadata = ArakoonInstaller.get_arakoon_metadata_by_cluster_name(
            cluster_name=arakoon_cluster_name)
        config = ArakoonClusterConfig(cluster_id=arakoon_cluster_name)
        master_node_ips = [node.ip for node in config.nodes]
        if cluster_ip in master_node_ips:
            master_node_ips.remove(cluster_ip)
        if len(master_node_ips) == 0:
            raise RuntimeError(
                'There should be at least one other master node')

        arakoon_ports = []
        if arakoon_metadata['internal'] is True:
            Toolbox.log(logger=NodeTypeController._logger,
                        messages='Joining Arakoon OVS DB cluster')
            arakoon_installer = ArakoonInstaller(
                cluster_name=arakoon_cluster_name)
            arakoon_installer.load()
            arakoon_installer.extend_cluster(
                new_ip=cluster_ip,
                base_dir=Configuration.get('/ovs/framework/paths|ovsdb'))
            arakoon_installer.restart_cluster_after_extending(
                new_ip=cluster_ip)
            arakoon_ports = arakoon_installer.ports[cluster_ip]

        if configure_memcached is True:
            NodeTypeController.configure_memcached(
                client=target_client, logger=NodeTypeController._logger)
        NodeTypeController.add_services(client=target_client,
                                        node_type='master',
                                        logger=NodeTypeController._logger)

        Toolbox.log(logger=NodeTypeController._logger,
                    messages='Update configurations')
        if configure_memcached is True:
            endpoints = Configuration.get('/ovs/framework/memcache|endpoints')
            endpoint = '{0}:11211'.format(cluster_ip)
            if endpoint not in endpoints:
                endpoints.append(endpoint)
                Configuration.set('/ovs/framework/memcache|endpoints',
                                  endpoints)
        if configure_rabbitmq is True:
            endpoints = Configuration.get(
                '/ovs/framework/messagequeue|endpoints')
            endpoint = '{0}:5672'.format(cluster_ip)
            if endpoint not in endpoints:
                endpoints.append(endpoint)
                Configuration.set('/ovs/framework/messagequeue|endpoints',
                                  endpoints)

        if arakoon_metadata['internal'] is True:
            Toolbox.log(logger=NodeTypeController._logger,
                        messages='Restarting master node services')
            PersistentFactory.store = None
            VolatileFactory.store = None

            if 'arakoon-ovsdb' not in [
                    s.name for s in ServiceList.get_services() if
                    s.is_internal is False or s.storagerouter.ip == cluster_ip
            ]:
                service = Service()
                service.name = 'arakoon-ovsdb'
                service.type = ServiceTypeList.get_by_name(
                    ServiceType.SERVICE_TYPES.ARAKOON)
                service.ports = arakoon_ports
                service.storagerouter = storagerouter
                service.save()

        if configure_rabbitmq is True:
            NodeTypeController.configure_rabbitmq(
                client=target_client, logger=NodeTypeController._logger)
            # Copy rabbitmq cookie
            rabbitmq_cookie_file = '/var/lib/rabbitmq/.erlang.cookie'

            Toolbox.log(logger=NodeTypeController._logger,
                        messages='Copying RabbitMQ cookie')
            contents = master_client.file_read(rabbitmq_cookie_file)
            master_hostname, _ = master_client.get_hostname()
            target_client.dir_create(os.path.dirname(rabbitmq_cookie_file))
            target_client.file_write(rabbitmq_cookie_file, contents)
            target_client.file_chmod(rabbitmq_cookie_file, mode=0400)
            target_client.run(['rabbitmq-server', '-detached'])
            time.sleep(5)
            target_client.run(['rabbitmqctl', 'stop_app'])
            time.sleep(5)
            target_client.run([
                'rabbitmqctl', 'join_cluster',
                'rabbit@{0}'.format(master_hostname)
            ])
            time.sleep(5)
            target_client.run(['rabbitmqctl', 'stop'])
            time.sleep(5)

            # Enable HA for the rabbitMQ queues
            ServiceFactory.change_service_state(target_client,
                                                'rabbitmq-server', 'start',
                                                NodeTypeController._logger)
            NodeTypeController.check_rabbitmq_and_enable_ha_mode(
                client=target_client, logger=NodeTypeController._logger)

        NodeTypeController._configure_amqp_to_volumedriver()

        Toolbox.log(logger=NodeTypeController._logger,
                    messages='Starting services')
        services = ['memcached', 'arakoon-ovsdb', 'rabbitmq-server']
        if arakoon_metadata['internal'] is True:
            services.remove('arakoon-ovsdb')
        for service in services:
            if service_manager.has_service(service, client=target_client):
                ServiceFactory.change_service_state(target_client, service,
                                                    'start',
                                                    NodeTypeController._logger)

        Toolbox.log(logger=NodeTypeController._logger,
                    messages='Restarting services')
        NodeTypeController.restart_framework_and_memcache_services(
            clients=ip_client_map, logger=NodeTypeController._logger)

        if Toolbox.run_hooks(component='nodetype',
                             sub_component='promote',
                             logger=NodeTypeController._logger,
                             cluster_ip=cluster_ip,
                             master_ip=master_ip):
            Toolbox.log(logger=NodeTypeController._logger,
                        messages='Restarting services')
            NodeTypeController.restart_framework_and_memcache_services(
                clients=ip_client_map, logger=NodeTypeController._logger)

        if NodeTypeController.avahi_installed(
                client=target_client,
                logger=NodeTypeController._logger) is True:
            NodeTypeController.configure_avahi(
                client=target_client,
                node_name=node_name,
                node_type='master',
                logger=NodeTypeController._logger)
        Configuration.set('/ovs/framework/hosts/{0}/type'.format(machine_id),
                          'MASTER')
        target_client.run(
            ['chown', '-R', 'ovs:ovs', '/opt/OpenvStorage/config'])
        Configuration.set(
            '/ovs/framework/hosts/{0}/promotecompleted'.format(machine_id),
            True)

        if target_client.file_exists('/tmp/ovs_rollback'):
            target_client.file_delete('/tmp/ovs_rollback')

        Toolbox.log(logger=NodeTypeController._logger,
                    messages='Promote complete')
Esempio n. 57
0
    def update_volumedriver():
        """
        Update the volumedriver
        :return: None
        """
        filemutex = file_mutex('system_update', wait=2)
        upgrade_file = '/etc/ready_for_upgrade'
        upgrade_ongoing_check_file = '/etc/upgrade_ongoing'
        ssh_clients = []
        try:
            filemutex.acquire()
            UpdateController._log_message('+++ Starting volumedriver update +++')

            from ovs.dal.lists.storagerouterlist import StorageRouterList

            UpdateController._log_message('Generating SSH client connections for each storage router')
            storage_routers = StorageRouterList.get_storagerouters()
            ssh_clients = [SSHClient(storage_router.ip, 'root') for storage_router in storage_routers]
            this_client = [client for client in ssh_clients if client.is_local is True][0]

            # Commence update !!!!!!!
            # 0. Create locks
            UpdateController._log_message('Creating lock files', client_ip=this_client.ip)
            for client in ssh_clients:
                client.run('touch {0}'.format(upgrade_file))  # Prevents manual install or upgrade individual packages
                client.run('touch {0}'.format(upgrade_ongoing_check_file))  # Prevents clicking x times on 'Update' btn

            # 1. Check requirements
            packages_to_update = set()
            all_services_to_restart = []
            for client in ssh_clients:
                for function in Toolbox.fetch_hooks('update', 'metadata'):
                    UpdateController._log_message('Executing function {0}'.format(function.__name__),
                                                  client_ip=client.ip)
                    output = function(client)
                    for key, value in output.iteritems():
                        if key != 'volumedriver':
                            continue
                        for package_info in value:
                            packages_to_update.update(package_info['packages'])
                            all_services_to_restart += package_info['services']

            services_to_restart = []
            for service in all_services_to_restart:
                if service not in services_to_restart:
                    services_to_restart.append(service)  # Filter out duplicates keeping the order of services (eg: watcher-framework before memcached)

            UpdateController._log_message('Services which will be restarted --> {0}'.format(', '.join(services_to_restart)))
            UpdateController._log_message('Packages which will be installed --> {0}'.format(', '.join(packages_to_update)))

            # 1. Stop services
            if UpdateController._change_services_state(services=services_to_restart,
                                                       ssh_clients=ssh_clients,
                                                       action='stop') is False:
                UpdateController._log_message('Stopping all services on every node failed, cannot continue',
                                              client_ip=this_client.ip, severity='warning')
                UpdateController._remove_lock_files([upgrade_file, upgrade_ongoing_check_file], ssh_clients)

                UpdateController._log_message('Attempting to start the services again', client_ip=this_client.ip)
                UpdateController._change_services_state(services=services_to_restart,
                                                        ssh_clients=ssh_clients,
                                                        action='start')
                UpdateController._log_message('Failed to stop all required services, update aborted',
                                              client_ip=this_client.ip, severity='error')
                return

            # 2. Update packages
            failed_clients = []
            for client in ssh_clients:
                PackageManager.update(client=client)
                try:
                    for package_name in packages_to_update:
                        UpdateController._log_message('Installing {0}'.format(package_name), client.ip)
                        PackageManager.install(package_name=package_name,
                                               client=client,
                                               force=True)
                        UpdateController._log_message('Installed {0}'.format(package_name), client.ip)
                    client.file_delete(upgrade_file)
                except subprocess.CalledProcessError as cpe:
                    UpdateController._log_message('Upgrade failed with error: {0}'.format(cpe.output), client.ip,
                                                  'error')
                    failed_clients.append(client)
                    break

            if failed_clients:
                UpdateController._remove_lock_files([upgrade_file, upgrade_ongoing_check_file], ssh_clients)
                UpdateController._log_message('Error occurred. Attempting to start all services again',
                                              client_ip=this_client.ip, severity='error')
                UpdateController._change_services_state(services=services_to_restart,
                                                        ssh_clients=ssh_clients,
                                                        action='start')
                UpdateController._log_message('Failed to upgrade following nodes:\n - {0}\nPlease check /var/log/ovs/lib.log on {1} for more information'.format('\n - '.join([client.ip for client in failed_clients]), this_client.ip),
                                              this_client.ip,
                                              'error')
                return

            # 3. Post upgrade actions
            UpdateController._log_message('Executing post upgrade actions', client_ip=this_client.ip)
            for client in ssh_clients:
                for function in Toolbox.fetch_hooks('update', 'postupgrade'):
                    UpdateController._log_message('Executing action: {0}'.format(function.__name__), client_ip=client.ip)
                    try:
                        function(client)
                    except Exception as ex:
                        UpdateController._log_message('Post upgrade action failed with error: {0}'.format(ex),
                                                      client.ip, 'error')

            # 4. Start services
            UpdateController._log_message('Starting services', client_ip=this_client.ip)
            UpdateController._change_services_state(services=services_to_restart,
                                                    ssh_clients=ssh_clients,
                                                    action='start')

            UpdateController._remove_lock_files([upgrade_ongoing_check_file], ssh_clients)
            UpdateController._log_message('+++ Finished updating +++')
        except RuntimeError as rte:
            UpdateController._log_message('Error during volumedriver update: {0}'.format(rte), severity='error')
            UpdateController._remove_lock_files([upgrade_file, upgrade_ongoing_check_file], ssh_clients)
        except NoLockAvailableException:
            UpdateController._log_message('Another volumedriver update is currently in progress!')
        except Exception as ex:
            UpdateController._log_message('Error during volumedriver update: {0}'.format(ex), severity='error')
            UpdateController._remove_lock_files([upgrade_file, upgrade_ongoing_check_file], ssh_clients)
        finally:
            filemutex.release()
Esempio n. 58
0
    def configure_rabbitmq(client, logger):
        """
        Configure RabbitMQ
        :param client: Client on which to configure RabbitMQ
        :type client: ovs_extensions.generic.sshclient.SSHClient
        :param logger: Logger object used for logging
        :type logger: ovs.extensions.generic.logger.Logger
        :return: None
        """
        Toolbox.log(logger=logger, messages='Setting up RabbitMQ')
        service_manager = ServiceFactory.get_manager()
        rabbitmq_port = Configuration.get(
            '/ovs/framework/messagequeue|endpoints')[0].split(':')[1]
        rabbitmq_login = Configuration.get('/ovs/framework/messagequeue|user')
        rabbitmq_password = Configuration.get(
            '/ovs/framework/messagequeue|password')
        client.file_write(
            '/etc/rabbitmq/rabbitmq.config', """[
   {{rabbit, [{{tcp_listeners, [{0}]}},
              {{default_user, <<"{1}">>}},
              {{default_pass, <<"{2}">>}},
              {{cluster_partition_handling, autoheal}},
              {{log_levels, [{{connection, warning}}]}},
              {{vm_memory_high_watermark, 0.2}}]}}
].""".format(rabbitmq_port, rabbitmq_login, rabbitmq_password))

        rabbitmq_running, same_process = service_manager.is_rabbitmq_running(
            client=client)
        if rabbitmq_running is True:
            # Example output of 'list_users' command
            # Listing users ...
            # guest   [administrator]
            # ovs     []
            # ... done.
            users = [
                user.split('\t')[0] for user in client.run(
                    ['rabbitmqctl', 'list_users']).splitlines()
                if '\t' in user and '[' in user and ']' in user
            ]
            if 'ovs' in users:
                Toolbox.log(logger=logger,
                            messages='Already configured RabbitMQ')
                return
            ServiceFactory.change_service_state(client, 'rabbitmq-server',
                                                'stop', logger)

        client.run(['rabbitmq-server', '-detached'])
        time.sleep(5)

        # Sometimes/At random the rabbitmq server takes longer than 5 seconds to start,
        #  and the next command fails so the best solution is to retry several times
        # Also retry the add_user/set_permissions, and validate the result
        retry = 0
        while retry < 10:
            users = Toolbox.retry_client_run(
                client=client,
                command=['rabbitmqctl', 'list_users'],
                logger=logger).splitlines()
            users = [
                usr.split('\t')[0] for usr in users
                if '\t' in usr and '[' in usr and ']' in usr
            ]
            logger.debug('Rabbitmq users {0}'.format(users))
            if 'ovs' in users:
                logger.debug('User ovs configured in rabbitmq')
                break

            logger.debug(
                Toolbox.retry_client_run(client=client,
                                         command=[
                                             'rabbitmqctl', 'add_user',
                                             rabbitmq_login, rabbitmq_password
                                         ],
                                         logger=logger))
            logger.debug(
                Toolbox.retry_client_run(client=client,
                                         command=[
                                             'rabbitmqctl', 'set_permissions',
                                             rabbitmq_login, '.*', '.*', '.*'
                                         ],
                                         logger=logger))
            retry += 1
            time.sleep(1)
        client.run(['rabbitmqctl', 'stop'])
        time.sleep(5)