Ejemplo n.º 1
0
 def set_rdma_capability(storagerouter_guid):
     """
     Detect whether a StorageRouter exposes an active InfiniBand port and persist the outcome

     While walking /sys/class/infiniband on the StorageRouter, every directory that holds a 'ports'
     sub-directory is inspected: if one of the direct children of that 'ports' directory has a 'state' file
     whose content contains 'ACTIVE', the StorageRouter is flagged as RDMA capable.
     The flag is always stored and saved, also when no active port was found (False).
     :param storagerouter_guid: Guid of the StorageRouter to check and set
     :type storagerouter_guid: str
     :return: None
     :rtype: NoneType
     """
     storagerouter = StorageRouter(storagerouter_guid)
     client = SSHClient(storagerouter, username='******')
     active_port_found = False
     with remote(client.ip, [os], username='******') as rem:
         for device_root, device_dirs, _ in rem.os.walk('/sys/class/infiniband'):
             for device in device_dirs:
                 ports_dir = '/'.join([device_root, device, 'ports'])
                 if rem.os.path.exists(ports_dir):
                     for port_root, port_dirs, _ in rem.os.walk(ports_dir):
                         # Only the direct children of the ports directory are relevant, deeper levels are ignored
                         if port_root == ports_dir:
                             for port in port_dirs:
                                 state_file = '/'.join([port_root, port, 'state'])
                                 if not rem.os.path.exists(state_file):
                                     continue
                                 # The state file is read through the SSH client, not through the remote os module
                                 if 'ACTIVE' in client.run(['cat', state_file]):
                                     active_port_found = True
     storagerouter.rdma_capable = active_port_found
     storagerouter.save()
Ejemplo n.º 2
0
    def file_list(self, directory, abs_path=False, recursive=False):
        """
        Collect the names of the files found in a directory
        WARNING: If executed recursively while not locally, this can take quite some time

        :param directory: Directory to list the files in
        :param abs_path: When True every entry is '<walked directory>/<file name>', otherwise only the file name
        :param recursive: When False only the first level yielded by the walk is inspected
        :return: List of files in directory (empty when the walk yields nothing)
        """
        def _collect(walker):
            # Consumes an os.walk-like iterator, local or remote, and gathers the file entries
            collected = []
            for current_dir, _, file_names in walker:
                if abs_path is True:
                    collected.extend('/'.join([current_dir, name]) for name in file_names)
                else:
                    collected.extend(file_names)
                if recursive is False:
                    break
            return collected

        if self.is_local is True:
            return _collect(os.walk(directory))
        # The walk has to be consumed while the remote session is still open
        with remote(self.ip, [os], 'root') as rem:
            return _collect(rem.os.walk(directory))
Ejemplo n.º 3
0
 def test_ssh_connectivity():
     """
     Validates whether all nodes can SSH into each other

     For the IP of every StorageRouter and for both the 'root' and the 'ovs' user a remote session is opened.
     From within that session an SSH connection is made to every node (the node itself included), again as
     'root' and as 'ovs', and the output of 'whoami' is compared with the expected username.
     Every attempt is logged: info level on success, error level on failure. Failures are never raised.
     :return: None
     :rtype: NoneType
     """
     usernames = ['root', 'ovs']
     MonitoringController._logger.info('Starting SSH connectivity test...')
     ips = [sr.ip for sr in StorageRouterList.get_storagerouters()]
     for ip in ips:
         for primary_username in usernames:
             try:
                 with remote(ip, [SSHClient], username=primary_username) as rem:
                     for local_ip in ips:
                         for username in usernames:
                             # The escaped {{0}} survives this format call and is filled in with the outcome below
                             message = '* Connection from {0}@{1} to {2}@{3}... {{0}}'.format(
                                 primary_username, ip, username, local_ip)
                             try:
                                 c = rem.SSHClient(local_ip, username=username)
                                 reported_user = c.run(['whoami']).strip()
                                 # Explicit check instead of a bare assert: an AssertionError without message
                                 # would log an empty reason and asserts are stripped when running with -O
                                 if reported_user != username:
                                     raise RuntimeError('whoami returned "{0}" instead of "{1}"'.format(
                                         reported_user, username))
                                 message = message.format('OK')
                                 logger = MonitoringController._logger.info
                             except Exception as ex:
                                 # str(ex) instead of the deprecated (and often empty) ex.message
                                 message = message.format(str(ex))
                                 logger = MonitoringController._logger.error
                             logger(message)
             except Exception as ex:
                 MonitoringController._logger.error(
                     '* Could not connect to {0}@{1}: {2}'.format(
                         primary_username, ip, str(ex)))
     MonitoringController._logger.info('Finished')
Ejemplo n.º 4
0
    def execute_search_on_remote(since=None, until=None, search_locations=None, hosts=None, python_error=False,
                                 mode='search', username='******', password=None, suppress_return=False, search_patterns=None):
        """
        Run a log search on every host and gather the output in the local cache file
        (LogFileTimeParser.FILE_PATH_REMOTE). Can be used standalone on the execution machine.
        The cache file is truncated first, after which the stringified result of every host is appended to it.
        :param since: Starting date
        :type since: str / Datetime
        :param until: End date
        :type until: str / Datetime
        :param search_locations: list of paths of files / servicenames that will be searched on all nodes
        :type search_locations: list of str
        :param hosts: Ip of the nodes
        :type hosts: list of str
        :param python_error: Whether only python errors should be checked (only passed on in 'error-search' mode)
        :type python_error: Boolean
        :param mode: Search mode, must be one of LogFileTimeParser.POSSIBLE_MODES
        :type mode: str
        :param username: Username of the user to login
        :type username: str
        :param password: Password of the user to login
        :type password: str
        :param suppress_return: only write to file and not return contents
        :type suppress_return: Boolean
        :param search_patterns: What error patterns should be recognized (only passed on in 'search' mode)
        :type search_patterns: list of str
        :raises ValueError: When the requested mode is not supported
        :return: Content of the cache file as string, or None when suppress_return is set
        """
        # Refuse unknown modes before touching the cache
        if mode not in LogFileTimeParser.POSSIBLE_MODES:
            supported_modes = ', '.join(LogFileTimeParser.POSSIBLE_MODES)
            raise ValueError('Mode "{0}" is not supported. Possible modes are {1}'.format(mode, supported_modes))

        # Opening in write mode truncates the cache left behind by a previous run
        with open(LogFileTimeParser.FILE_PATH_REMOTE, 'w'):
            pass

        since, until, search_locations, hosts = LogFileTimeParser._default_starting_values(since, until, search_locations, hosts)

        # One remote instance per host
        with remote(hosts, [LogFileTimeParser], username=username, password=password) as remotes:
            for host in hosts:
                if mode == 'search':
                    host_output = remotes[host].LogFileTimeParser.get_lines_between_timestamps(
                        since=since, until=until, search_locations=search_locations,
                        search_patterns=search_patterns, host=host)
                elif mode == 'error-search':
                    host_output = remotes[host].LogFileTimeParser.search_for_errors(
                        since=since, until=until, search_locations=search_locations,
                        host=host, python_error=python_error)
                else:
                    # Any other supported mode contributes nothing to the cache
                    host_output = ''
                with open(LogFileTimeParser.FILE_PATH_REMOTE, 'a') as cache_file:
                    cache_file.write(str(host_output))

        if suppress_return:
            return None
        with open(LogFileTimeParser.FILE_PATH_REMOTE, 'r') as cache_file:
            return cache_file.read()
Ejemplo n.º 5
0
 def run_test(cls, storagedriver, vm_info, logger=LOGGER):
     """
     Deploy vdbench on every VM and run it, to see whether the following bug (or another data corruption bug)
     is triggered: https://github.com/openvstorage/integrationtests/issues/468
     The SSH client and the started screen names are stored inside the vm_info entries ('client' and
     'screen_names'); the screens are stopped again when the test ends, whether it succeeded or not.
     :param storagedriver: storagedriver to use for the VM its vdisks
     :type storagedriver: ovs.dal.hybrids.storagedriver.StorageDriver
     :param vm_info: information about all vms
     :type vm_info: dict
     :param logger: logging instance
     :type logger: ovs.log.log_handler.LogHandler
     :return: None
     :rtype: NoneType
     """
     with remote(storagedriver.storage_ip, [SSHClient]) as rem:
         try:
             # Phase 1: connect to every VM, create its data file and deploy vdbench
             for vm_name, vm_data in vm_info.iteritems():
                 data_file = '/mnt/data/{0}.raw'.format(vm_data['create_msg'])
                 ssh_client = rem.SSHClient(vm_data['ip'], cls.VM_USERNAME, cls.VM_PASSWORD)
                 ssh_client.file_create(data_file)
                 vm_data['client'] = ssh_client
                 logger.info('Installing vdbench on {0}.'.format(vm_name))
                 DataWriter.deploy_vdbench(client=vm_data['client'],
                                           zip_remote_location=cls.VDBENCH_ZIP,
                                           unzip_location=cls.VM_VDBENCH_ZIP,
                                           amount_of_errors=cls.AMOUNT_DATA_ERRORS,
                                           vdbench_config_path=cls.VM_VDBENCH_CFG_PATH,
                                           lun_location=cls.VM_FILENAME,
                                           thread_amount=cls.AMOUNT_THREADS,
                                           write_amount=cls.AMOUNT_TO_WRITE,
                                           xfersize=cls.XFERSIZE,
                                           read_percentage=cls.READ_PERCENTAGE,
                                           random_seek_percentage=cls.RANDOM_SEEK_PERCENTAGE,
                                           io_rate=cls.IO_RATE,
                                           duration=cls.VDBENCH_TIME,
                                           interval=cls.VDBENCH_INTERVAL)
             # Phase 2: start vdbench on every VM and remember the screens for the cleanup
             for vm_name, vm_data in vm_info.iteritems():
                 logger.info('Starting VDBENCH on {0}!'.format(vm_name))
                 started_screens, _ = DataWriter.write_data_vdbench(
                     client=vm_data['client'],
                     binary_location=cls.VM_VDBENCH_ZIP.replace('.zip', ''),
                     config_location=cls.VM_VDBENCH_CFG_PATH)
                 vm_data['screen_names'] = started_screens
             logger.info('Finished VDBENCH without errors!')
             logger.info('No data corruption detected!')
         finally:
             # Stop every screen that was registered and reset the bookkeeping
             for vm_name, vm_data in vm_info.iteritems():
                 for screen_name in vm_data.get('screen_names', []):
                     logger.debug('Stopping screen {0} on {1}.'.format(screen_name, vm_data['client'].ip))
                     vm_data['client'].run(['screen', '-S', screen_name, '-X', 'quit'])
                 vm_data['screen_names'] = []
 def __init__(self, host, port, message_queue=None, remote_ip=None):
     """
     Create a TCP socket, locally or through a remote session, and bind it to host:port

     :param host: Address to bind the socket to
     :param port: Port to bind the socket to
     :param message_queue: Queue stored as self.message_queue; a new Queue is created when None
     :param remote_ip: When given, the socket is created through a remote session towards this IP,
                       otherwise a local socket is used
     :raises Exception: Whatever creating, configuring or binding the socket raises. The socket and the
                        remote session opened by this constructor are released before re-raising
     """
     self.host = host
     self.port = port
     self.work_queue = Queue.Queue()
     self.work_queue.put('test')
     self.remote = None
     self.sock = None
     remote_entered = False
     try:
         if remote_ip is not None:
             self.remote = remote(remote_ip, [socket])
             # Entered manually because the session has to outlive this constructor
             self._remote = self.remote.__enter__()
             remote_entered = True
             self.sock = self._remote.socket.socket(socket.AF_INET,
                                                    socket.SOCK_STREAM)
         else:
             self.sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
         self.sock.settimeout(1)
         self.sock.bind((self.host, self.port))
     except Exception:
         # The caller never receives the instance when the constructor fails, so nobody else could
         # release the socket or the remote session: clean up here before propagating the error
         try:
             if self.sock is not None:
                 self.sock.close()
         finally:
             if remote_entered:
                 self.remote.__exit__(None, None, None)
         raise
     self.logger.debug('Bound to {0}:{1}'.format(*self.sock.getsockname()))
     self.listening_thread = None
     if message_queue is None:
         message_queue = Queue.Queue()
     self.message_queue = message_queue
     self.received_messages = []
Ejemplo n.º 7
0
 def get_free_port(listener_ip, logger=LOGGER):
     """
     Ask the node behind listener_ip for a port that is currently free

     A socket is created through a remote session, bound to port 0 and closed again after the port number it
     was bound to has been read.
     :param listener_ip: ip to listen on
     :type listener_ip: str
     :param logger: logging instance
     :type logger: ovs.extensions.generic.logger.Logger
     :raises socket.error: When binding fails (logged before being re-raised)
     :return: port number
     :rtype: int
     """
     with remote(listener_ip, [socket]) as rem:
         probe_socket = rem.socket.socket(socket.AF_INET, socket.SOCK_STREAM)
         try:
             # Port 0 binds to the first available port
             probe_socket.bind(('', 0))
             bound_address = probe_socket.getsockname()
             free_port = bound_address[1]
         except socket.error as ex:
             logger.error('Could not bind the socket. Got {0}'.format(str(ex)))
             raise
         else:
             return free_port
         finally:
             probe_socket.close()
Ejemplo n.º 8
0
 def retrieve_storagerouter_info_via_host(ip, password):
     """
     Retrieve the storagerouters from the model through a remote session on the given host

     Any error is logged (silently, at 'exception' level) instead of raised; whatever was collected up to
     that point is still returned.
     :param ip: IP of the host to open the remote session on
     :param password: Password used for the remote session
     :return: Dict keyed by StorageRouter name, each value holding the 'ip' and the lower-cased 'type'
     :rtype: dict
     """
     collected = {}
     try:
         # Imported here, inside the try, so that a failing import is logged as well
         from ovs.dal.lists.storagerouterlist import StorageRouterList
         with remote(ip_info=ip, modules=[StorageRouterList], username='******', password=password,
                     strict_host_key_checking=False) as rem:
             for storagerouter in rem.StorageRouterList.get_storagerouters():
                 collected[storagerouter.name] = dict([('ip', storagerouter.ip),
                                                       ('type', storagerouter.node_type.lower())])
     except Exception as ex:
         error_message = 'Error loading storagerouters: {0}'.format(ex)
         Toolbox.log(logger=NodeTypeController._logger, messages=error_message, loglevel='exception', silent=True)
     return collected
Ejemplo n.º 9
0
    def run_test(cls, vm_info, cluster_info, logger=LOGGER):
        """
        Tests the HA using a virtual machine which will write in his own filesystem

        Flow: start IO on every VM, stop the VM configured for the source storagedriver's storage IP, poll
        the IO to verify it keeps going and validate the outcome. The stopped VM is started again and all
        screens are stopped during cleanup, whether the test succeeded or not.
        :param vm_info: info about the vms. Every entry is extended with a 'client' and 'screen_names' key
        :type vm_info: dict
        :param cluster_info: information about the cluster, contains all dal objects
        :type cluster_info: dict
        :param logger: logging instance
        :raises AssertionError: When the chosen configuration failed
        :return: None
        :rtype: NoneType
        """
        compute_client = SSHClient(cluster_info['storagerouters']['compute'],
                                   username='******')
        failed_configurations = []

        destination_storagedriver = cluster_info['storagedrivers'][
            'destination']
        source_storagedriver = cluster_info['storagedrivers']['source']

        # Cache to validate properties
        values_to_check = {
            'source_std': source_storagedriver.serialize(),
            'target_std': destination_storagedriver.serialize()
        }

        vm_to_stop = cls.HYPERVISOR_INFO['vms'][
            source_storagedriver.storage_ip]['name']
        parent_hypervisor = HypervisorFactory().get()
        # Extract vdisk info from vm_info
        vdisk_info = {}
        disk_amount = 0
        for vm_name, vm_object in vm_info.iteritems():
            for vdisk in vm_object['vdisks']:
                # Ignore the cd vdisk as no IO will come from it
                if vdisk.name == vm_object['cd_path'].replace(
                        '.raw', '').split('/')[-1]:
                    continue
                disk_amount += 1
                vdisk_info.update({vdisk.name: vdisk})

        with remote(compute_client.ip, [SSHClient]) as rem:
            configuration = random.choice(cls.DATA_TEST_CASES)
            threads = {'evented': {'io': {'pairs': [], 'r_semaphore': None}}}
            output_files = []
            vm_downed = False
            try:
                logger.info('Starting the following configuration: {0}'.format(
                    configuration))
                for vm_name, vm_data in vm_info.iteritems():
                    vm_client = rem.SSHClient(vm_data['ip'], cls.VM_USERNAME,
                                              cls.VM_PASSWORD)
                    vm_client.file_create('/mnt/data/{0}.raw'.format(
                        vm_data['create_msg']))
                    vm_data['client'] = vm_client
                io_thread_pairs, monitoring_data, io_r_semaphore = ThreadingHandler.start_io_polling_threads(
                    volume_bundle=vdisk_info)
                # Registered so the cleanup below can stop these threads
                threads['evented']['io']['pairs'] = io_thread_pairs
                threads['evented']['io']['r_semaphore'] = io_r_semaphore
                for vm_name, vm_data in vm_info.iteritems():  # Write data
                    screen_names, output_files = DataWriter.write_data_fio(
                        client=vm_data['client'],
                        fio_configuration={
                            'io_size': cls.AMOUNT_TO_WRITE,
                            'configuration': configuration
                        },
                        file_locations=[
                            '/mnt/data/{0}.raw'.format(vm_data['create_msg'])
                        ])
                    vm_data['screen_names'] = screen_names
                logger.info(
                    'Doing IO for {0}s before bringing down the node.'.format(
                        cls.IO_TIME))
                ThreadingHandler.keep_threads_running(
                    r_semaphore=io_r_semaphore,
                    threads=io_thread_pairs,
                    shared_resource=monitoring_data,
                    duration=cls.IO_TIME)
                # Threads ready for monitoring at this point
                #########################
                # Bringing original owner of the volume down
                #########################
                try:
                    logger.info('Stopping {0}.'.format(vm_to_stop))
                    VMHandler.stop_vm(hypervisor=parent_hypervisor,
                                      vmid=vm_to_stop)
                    # Remembered so the cleanup knows the VM has to be started again
                    vm_downed = True
                except Exception as ex:
                    logger.error('Failed to stop. Got {0}'.format(str(ex)))
                    raise
                downed_time = time.time()
                time.sleep(cls.IO_REFRESH_RATE * 2)
                # Start IO polling to verify nothing went down
                ThreadingHandler.poll_io(
                    r_semaphore=io_r_semaphore,
                    required_thread_amount=len(io_thread_pairs),
                    shared_resource=monitoring_data,
                    downed_time=downed_time,
                    timeout=cls.HA_TIMEOUT,
                    output_files=output_files,
                    client=compute_client,
                    disk_amount=disk_amount)
                cls._validate(values_to_check, monitoring_data)
            except Exception as ex:
                # Failures are collected here and reported through the assertion at the end,
                # so the cleanup below always runs first
                logger.error(
                    'Running the test for configuration {0} has failed because {1}'
                    .format(configuration, str(ex)))
                failed_configurations.append({
                    'configuration': configuration,
                    'reason': str(ex)
                })
            finally:
                for thread_category, thread_collection in threads[
                        'evented'].iteritems():
                    ThreadHelper.stop_evented_threads(
                        thread_collection['pairs'],
                        thread_collection['r_semaphore'])
                if vm_downed is True:
                    VMHandler.start_vm(parent_hypervisor, vm_to_stop)
                    logger.debug('Started {0}'.format(vm_to_stop))
                    SystemHelper.idle_till_ovs_is_up(
                        source_storagedriver.storage_ip,
                        **cls.get_shell_user())
                    # @TODO: Remove when https://github.com/openvstorage/integrationtests/issues/540 is fixed
                    FwkHandler.restart_all()
                for vm_name, vm_data in vm_info.iteritems():
                    for screen_name in vm_data.get('screen_names', []):
                        logger.debug('Stopping screen {0} on {1}.'.format(
                            screen_name, vm_data['client'].ip))
                        vm_data['client'].run(
                            ['screen', '-S', screen_name, '-X', 'quit'])
                    vm_data['screen_names'] = []
        # The entries are dicts: str.join only accepts strings, so joining them directly would raise a
        # TypeError and hide the actual failure reason
        assert len(failed_configurations) == 0, 'Certain configuration failed: {0}'.format(
            ' '.join(str(failure) for failure in failed_configurations))
Ejemplo n.º 10
0
    def _execute_scrub(queue, vpool, scrub_info, scrub_dir, error_messages):
        """
        Scrub the vDisks whose guids are waiting in the queue, until the queue is empty

        For every vDisk: make sure the master MDS is local (an MDS handover is triggered when it is not),
        claim the vDisk in volatile memory so it is not scrubbed twice, then retrieve and apply the scrub work.
        Failures are logged and collected in error_messages, they never propagate to the caller.
        :param queue: Queue holding the guids of the vDisks to scrub, read without blocking
        :param vpool: vPool the vDisks belong to (its guid and name are used)
        :param scrub_info: Dict with the keys 'storage_router' and 'partition_guid'
        :type scrub_info: dict
        :param scrub_dir: Directory handed to the scrubber as scratch directory
        :param error_messages: List to which a message is appended for every failure
        :type error_messages: list
        :return: None
        :rtype: NoneType
        """
        def _verify_mds_config(current_vdisk):
            # Reload the vDisk info and return its MDS configuration, which must not be empty
            current_vdisk.invalidate_dynamics('info')
            vdisk_configs = current_vdisk.info['metadata_backend_config']
            if len(vdisk_configs) == 0:
                raise RuntimeError('Could not load MDS configuration')
            return vdisk_configs

        storagerouter = scrub_info['storage_router']
        partition_guid = scrub_info['partition_guid']
        volatile_client = VolatileFactory.get_client()
        backend_config_key = 'ovs/vpools/{0}/proxies/scrub/backend_config_{1}'.format(
            vpool.guid, partition_guid)
        try:
            # Empty the queue with vDisks to scrub
            with remote(storagerouter.ip, [VDisk]) as rem:
                while True:
                    vdisk = None
                    vdisk_guid = queue.get(
                        False
                    )  # Raises Empty Exception when queue is empty, so breaking the while True loop
                    volatile_key = 'ovs_scrubbing_vdisk_{0}'.format(vdisk_guid)
                    # Only a key added by this worker may be removed afterwards. Removing it unconditionally
                    # would release the claim of another scrubber working on the same vDisk
                    volatile_key_added = False
                    try:
                        # Check MDS master is local. Trigger MDS handover if necessary
                        vdisk = rem.VDisk(vdisk_guid)
                        GenericController._logger.info(
                            'Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - Started scrubbing at location {3}'
                            .format(vpool.name, storagerouter.name, vdisk.name,
                                    scrub_dir))
                        configs = _verify_mds_config(current_vdisk=vdisk)
                        storagedriver = StorageDriverList.get_by_storagedriver_id(
                            vdisk.storagedriver_id)
                        if configs[0].get(
                                'ip') != storagedriver.storagerouter.ip:
                            GenericController._logger.info(
                                'Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - MDS master is not local, trigger handover'
                                .format(vpool.name, storagerouter.name,
                                        vdisk.name))
                            MDSServiceController.ensure_safety(
                                VDisk(vdisk_guid)
                            )  # Do not use a remote VDisk instance here
                            configs = _verify_mds_config(current_vdisk=vdisk)
                            if configs[0].get(
                                    'ip') != storagedriver.storagerouter.ip:
                                GenericController._logger.warning(
                                    'Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - Skipping because master MDS still not local'
                                    .format(vpool.name, storagerouter.name,
                                            vdisk.name))
                                continue

                        # Check if vDisk is already being scrubbed
                        if volatile_client.add(key=volatile_key,
                                               value=volatile_key,
                                               time=24 * 60 * 60) is False:
                            GenericController._logger.warning(
                                'Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - Skipping because vDisk is already being scrubbed'
                                .format(vpool.name, storagerouter.name,
                                        vdisk.name))
                            continue
                        volatile_key_added = True

                        # Do the actual scrubbing
                        with vdisk.storagedriver_client.make_locked_client(
                                str(vdisk.volume_id)) as locked_client:
                            GenericController._logger.info(
                                'Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - Retrieve and apply scrub work'
                                .format(vpool.name, storagerouter.name,
                                        vdisk.name))
                            work_units = locked_client.get_scrubbing_workunits(
                            )
                            for work_unit in work_units:
                                res = locked_client.scrub(
                                    work_unit=work_unit,
                                    scratch_dir=scrub_dir,
                                    log_sinks=[
                                        LogHandler.get_sink_path(
                                            'scrubber_{0}'.format(vpool.name),
                                            allow_override=True,
                                            forced_target_type='file')
                                    ],
                                    backend_config=Configuration.
                                    get_configuration_path(backend_config_key))
                                locked_client.apply_scrubbing_result(
                                    scrubbing_work_result=res)
                            if work_units:
                                GenericController._logger.info(
                                    'Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - {3} work units successfully applied'
                                    .format(vpool.name, storagerouter.name,
                                            vdisk.name, len(work_units)))
                            else:
                                GenericController._logger.info(
                                    'Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - No scrubbing required'
                                    .format(vpool.name, storagerouter.name,
                                            vdisk.name))
                    except Exception:
                        # vdisk is still None when loading the remote VDisk object itself failed
                        if vdisk is None:
                            message = 'Scrubber - vPool {0} - StorageRouter {1} - vDisk with guid {2} could not be found'.format(
                                vpool.name, storagerouter.name, vdisk_guid)
                        else:
                            message = 'Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - Scrubbing failed'.format(
                                vpool.name, storagerouter.name, vdisk.name)
                        error_messages.append(message)
                        GenericController._logger.exception(message)
                    finally:
                        # Remove vDisk from volatile memory, but only when this worker claimed it
                        if volatile_key_added:
                            volatile_client.delete(volatile_key)

        except Empty:  # Raised when all items have been fetched from the queue
            GenericController._logger.info(
                'Scrubber - vPool {0} - StorageRouter {1} - Queue completely processed'
                .format(vpool.name, storagerouter.name))
        except Exception:
            message = 'Scrubber - vPool {0} - StorageRouter {1} - Scrubbing failed'.format(
                vpool.name, storagerouter.name)
            error_messages.append(message)
            GenericController._logger.exception(message)
Ejemplo n.º 11
0
    def live_migrate(cls,
                     vm_info,
                     cluster_info,
                     disk_amount,
                     hypervisor_info,
                     logger=LOGGER):
        """
        Execute the live migration test
        Migrates the vm away using libvirt migrate call
        Expects the DAL to be updated due to the IO causing volumedriver to move the volume

        All screens and IO threads are stopped during cleanup, whether the test succeeded or not.
        :param vm_info: info about the vms. Every entry is extended with a 'client' and 'screen_names' key
        :type vm_info: dict
        :param cluster_info: information about the cluster, the 'storagedrivers' key ('source' and
                             'destination') is used
        :type cluster_info: dict
        :param disk_amount: passed on to the IO polling as disk_amount
        :param hypervisor_info: hypervisor credentials, the keys 'user', 'password' and 'type' are used
        :type hypervisor_info: dict
        :param logger: logging instance
        :raises AssertionError: When the chosen configuration failed
        :return: None
        :rtype: NoneType
        """
        failed_configurations = []

        destination_storagedriver = cluster_info['storagedrivers'][
            'destination']
        source_storagedriver = cluster_info['storagedrivers']['source']

        hv_credentials = HypervisorCredentials(
            ip=source_storagedriver.storage_ip,
            user=hypervisor_info['user'],
            password=hypervisor_info['password'],
            type=hypervisor_info['type'])
        source_hypervisor = HypervisorFactory().get(
            hv_credentials=hv_credentials)
        client = SSHClient(source_storagedriver.storagerouter)
        # Cache to validate properties
        values_to_check = {
            'source_std': source_storagedriver.serialize(),
            'target_std': destination_storagedriver.serialize()
        }

        # Extract vdisk info from vm_info
        vdisk_info = {}
        for vm_name, vm_object in vm_info.iteritems():
            for vdisk in vm_object['vdisks']:
                vdisk_info.update({vdisk.name: vdisk})

        with remote(source_storagedriver.storage_ip, [SSHClient]) as rem:
            # NOTE(review): test_run_nr is never changed in this method, so the else branch below is
            # currently unreachable. It is kept as it documents how clients would be reused on a rerun
            test_run_nr = 0
            configuration = random.choice(cls.DATA_TEST_CASES)
            threads = {'evented': {'io': {'pairs': [], 'r_semaphore': None}}}
            output_files = []
            try:
                logger.info('Starting the following configuration: {0}'.format(
                    configuration))
                if test_run_nr == 0:  # Build reusable ssh clients
                    for vm_name, vm_data in vm_info.iteritems():
                        vm_client = rem.SSHClient(vm_data['ip'],
                                                  cls.VM_USERNAME,
                                                  cls.VM_PASSWORD)
                        vm_client.file_create('/mnt/data/{0}.raw'.format(
                            vm_data['create_msg']))
                        vm_data['client'] = vm_client
                else:
                    for vm_name, vm_data in vm_info.iteritems():
                        vm_data['client'].run([
                            'rm',
                            '/mnt/data/{0}.raw'.format(vm_data['create_msg'])
                        ])
                io_thread_pairs, monitoring_data, io_r_semaphore = ThreadingHandler.start_io_polling_threads(
                    volume_bundle=vdisk_info)
                # Registered so the cleanup below can stop these threads
                threads['evented']['io']['pairs'] = io_thread_pairs
                threads['evented']['io']['r_semaphore'] = io_r_semaphore
                for vm_name, vm_data in vm_info.iteritems():  # Write data
                    screen_names, output_files = DataWriter.write_data_fio(
                        client=vm_data['client'],
                        fio_configuration={
                            'io_size': cls.AMOUNT_TO_WRITE,
                            'configuration': configuration
                        },
                        file_locations=[
                            '/mnt/data/{0}.raw'.format(vm_data['create_msg'])
                        ])
                    vm_data['screen_names'] = screen_names
                logger.info(
                    'Doing IO for {0}s before bringing down the node.'.format(
                        cls.IO_TIME))
                ThreadingHandler.keep_threads_running(
                    r_semaphore=io_r_semaphore,
                    threads=io_thread_pairs,
                    shared_resource=monitoring_data,
                    duration=cls.IO_TIME)
                # Threads ready for monitoring at this point
                #########################
                # Migrate the VMs
                #########################
                try:
                    logger.info('Migrating the VM.')
                    for vm_name in vm_info:
                        source_hypervisor.sdk.migrate(
                            vm_name, destination_storagedriver.storage_ip,
                            hypervisor_info['user'])
                except Exception as ex:
                    logger.error('Failed to migrate. Got {0}'.format(str(ex)))
                    raise
                downed_time = time.time()
                time.sleep(cls.IO_REFRESH_RATE * 2)
                # Start IO polling to verify nothing went down
                ThreadingHandler.poll_io(
                    r_semaphore=io_r_semaphore,
                    required_thread_amount=len(io_thread_pairs),
                    shared_resource=monitoring_data,
                    downed_time=downed_time,
                    timeout=cls.FAILOVER_TIMEOUT,
                    output_files=output_files,
                    client=client,
                    disk_amount=disk_amount)
                # Do some more IO to trigger ownership migration
                ThreadingHandler.keep_threads_running(
                    r_semaphore=io_r_semaphore,
                    threads=io_thread_pairs,
                    shared_resource=monitoring_data,
                    duration=cls.IO_TIME)
                cls._validate_move(values_to_check)
            except Exception as ex:
                # Failures are collected here and reported through the assertion at the end,
                # so the cleanup below always runs first
                logger.error(
                    'Running the test for configuration {0} has failed because {1}'
                    .format(configuration, str(ex)))
                failed_configurations.append({
                    'configuration': configuration,
                    'reason': str(ex)
                })
            finally:
                for thread_category, thread_collection in threads[
                        'evented'].iteritems():
                    ThreadHelper.stop_evented_threads(
                        thread_collection['pairs'],
                        thread_collection['r_semaphore'])
                for vm_name, vm_data in vm_info.iteritems():
                    for screen_name in vm_data.get('screen_names', []):
                        logger.debug('Stopping screen {0} on {1}.'.format(
                            screen_name, vm_data['client'].ip))
                        vm_data['client'].run(
                            ['screen', '-S', screen_name, '-X', 'quit'])
                    vm_data['screen_names'] = []
        # Without this check the collected failures would never surface and the test could not fail
        assert len(failed_configurations) == 0, 'Certain configuration failed: {0}'.format(
            failed_configurations)
Ejemplo n.º 12
0
    def configure_disk(storagerouter_guid, disk_guid, partition_guid, offset,
                       size, roles):
        """
        Prepare a partition for use: create it when no partition is given, make sure it has a filesystem,
        make sure it is mounted (and registered in fstab) and finally store the requested roles on it
        :param storagerouter_guid: Guid of the StorageRouter the disk must belong to
        :type storagerouter_guid: str
        :param disk_guid: Guid of the disk holding (or about to hold) the partition
        :type disk_guid: str
        :param partition_guid: Guid of an existing partition, or None to create a new partition
        :type partition_guid: str
        :param offset: Offset in bytes at which a new partition starts
        :type offset: int
        :param size: Size in bytes of a new partition
        :type size: int
        :param roles: Roles the partition must end up with
        :type roles: list
        :raises RuntimeError: On an invalid role, a disk/partition mismatch, a disk used by a Backend,
                              an unsupported partition type, removal of a role which is in use,
                              or when the model does not reflect the expected result after a sync
        :raises ValueError: When the disk or the partition has no aliases
        :raises RoleDuplicationException: When the DB or DTL role is already present on another disk
        :return: None
        :rtype: NoneType
        """
        log = StorageRouterController._logger

        # Validations
        storagerouter = StorageRouter(storagerouter_guid)
        for requested_role in roles:
            if not (requested_role in DiskPartition.ROLES
                    and requested_role != DiskPartition.ROLES.BACKEND):
                raise RuntimeError(
                    'Invalid role specified: {0}'.format(requested_role))

        disk = Disk(disk_guid)
        if disk.storagerouter_guid != storagerouter_guid:
            raise RuntimeError(
                'The given Disk is not on the given StorageRouter')
        if any(DiskPartition.ROLES.BACKEND in existing.roles
               for existing in disk.partitions):
            raise RuntimeError('The given Disk is in use by a Backend')

        exclusive_roles = [DiskPartition.ROLES.DB, DiskPartition.ROLES.DTL]
        if set(exclusive_roles) & set(roles):
            roles_on_sr = StorageRouterController._get_roles_on_storagerouter(
                storagerouter.ip)
            for exclusive_role in exclusive_roles:
                if exclusive_role not in roles_on_sr or exclusive_role not in roles:
                    continue
                current_disk_name = roles_on_sr[exclusive_role][0]
                # Only a different disk is a conflict: DB and DTL roles still have to be unassignable
                if current_disk_name != disk.name:
                    raise RoleDuplicationException(
                        'Disk {0} cannot have the {1} role due to presence on disk {2}'
                        .format(disk.name, exclusive_role, current_disk_name))

        # Create partition (or validate the given one)
        if partition_guid is not None:
            log.debug('Using existing partition')
            partition = DiskPartition(partition_guid)
            if partition.disk_guid != disk_guid:
                raise RuntimeError(
                    'The given DiskPartition is not on the given Disk')
            if partition.filesystem in ('swap', 'linux_raid_member',
                                        'LVM2_member'):
                raise RuntimeError(
                    "It is not allowed to assign roles on partitions of type: ['swap', 'linux_raid_member', 'LVM2_member']"
                )
            partition_info = StorageRouterController.get_metadata(
                storagerouter_guid)['partitions']
            # Every role that would disappear must not be in use on this very partition
            used_roles = [
                removed_role
                for removed_role in set(partition.roles) - set(roles)
                for info in partition_info[removed_role]
                if info['in_use'] and info['guid'] == partition.guid
            ]
            if used_roles:
                raise RuntimeError(
                    'Roles in use cannot be removed. Used roles: {0}'.format(
                        ', '.join(used_roles)))
        else:
            log.debug(
                'Creating new partition - Offset: {0} bytes - Size: {1} bytes - Roles: {2}'
                .format(offset, size, roles))
            with remote(storagerouter.ip, [DiskTools], username='******') as rem:
                if len(disk.aliases) == 0:
                    raise ValueError(
                        'Disk {0} does not have any aliases'.format(disk.name))
                rem.DiskTools.create_partition(disk_alias=disk.aliases[0],
                                               disk_size=disk.size,
                                               partition_start=offset,
                                               partition_size=size)
            DiskController.sync_with_reality(storagerouter_guid)
            disk = Disk(disk_guid)
            end_point = offset + size
            # The new partition is the first modelled one overlapping the requested range
            partition = next(
                (candidate for candidate in disk.partitions
                 if offset < candidate.offset + candidate.size
                 and end_point > candidate.offset),
                None)
            if partition is None:
                raise RuntimeError(
                    'No new partition detected on disk {0} after having created 1'
                    .format(disk.name))
            log.debug('Partition created')

        # Add filesystem
        if partition.filesystem is None or partition_guid is None:
            log.debug('Creating filesystem')
            if len(partition.aliases) == 0:
                raise ValueError(
                    'Partition with offset {0} does not have any aliases'.
                    format(partition.offset))
            with remote(storagerouter.ip, [DiskTools], username='******') as rem:
                rem.DiskTools.make_fs(partition_alias=partition.aliases[0])
            DiskController.sync_with_reality(storagerouter_guid)
            partition = DiskPartition(partition.guid)
            if partition.filesystem not in ('ext4', 'xfs'):
                raise RuntimeError('Unexpected filesystem')
            log.debug('Filesystem created')

        # Mount the partition and add to FSTab
        if partition.mountpoint is None:
            log.debug('Configuring mount point')
            with remote(storagerouter.ip, [DiskTools], username='******') as rem:
                disk_type = 'ssd' if disk.is_ssd else 'hdd'
                counter = 1
                mountpoint = '/mnt/{0}{1}'.format(disk_type, counter)
                # Take the first /mnt/<type><n> which is not yet in use
                while rem.DiskTools.mountpoint_exists(mountpoint):
                    counter += 1
                    mountpoint = '/mnt/{0}{1}'.format(disk_type, counter)
                log.debug('Found mount point: {0}'.format(mountpoint))
                rem.DiskTools.add_fstab(partition_aliases=partition.aliases,
                                        mountpoint=mountpoint,
                                        filesystem=partition.filesystem)
                rem.DiskTools.mount(mountpoint)
            DiskController.sync_with_reality(storagerouter_guid)
            partition = DiskPartition(partition.guid)
            if partition.mountpoint != mountpoint:
                raise RuntimeError('Unexpected mount point')
            log.debug('Mount point configured')

        partition.roles = roles
        partition.save()
        log.debug('Partition configured')
Ejemplo n.º 13
0
    def test_collapse():
        """
        Validate arakoon collapsing on every StorageRouter

        Per node the internal services of type ARAKOON, NS_MGR and ALBA_MGR are collected, a collapse is
        triggered for each of them and afterwards it is asserted that at most 2 tlogs remain and that
        the modification timestamp of head.db differs from the one recorded before the collapse.

        :raises AssertionError: When more than 2 tlogs remain or head.db has an unchanged timestamp
        :return: None
        :rtype: NoneType
        """
        ArakoonCollapse.LOGGER.info("Starting validating arakoon collapse")
        node_ips = StoragerouterHelper.get_storagerouter_ips()
        node_ips.sort()
        for node_ip in node_ips:
            ArakoonCollapse.LOGGER.info(
                "Fetching arakoons on node `{0}`".format(node_ip))
            arakoon_clusters = []
            root_client = SSHClient(node_ip, username='******')

            # fetch arakoon clusters
            for service in ServiceList.get_services():
                if service.is_internal is True and service.storagerouter.ip == node_ip and \
                    service.type.name in (ServiceType.SERVICE_TYPES.ARAKOON,
                                          ServiceType.SERVICE_TYPES.NS_MGR,
                                          ServiceType.SERVICE_TYPES.ALBA_MGR):
                    # The cluster name is the service name without its 'arakoon-' part
                    arakoon_clusters.append(
                        service.name.replace('arakoon-', ''))

            # perform collapse
            ArakoonCollapse.LOGGER.info(
                "Starting arakoon collapse on node `{0}`".format(node_ip))
            for arakoon_cluster in arakoon_clusters:
                ArakoonCollapse.LOGGER.info(
                    "Fetching `{0}` arakoon on node `{1}`".format(
                        arakoon_cluster, node_ip))
                arakoon_config_path = Configuration.get_configuration_path(
                    '/ovs/arakoon/{0}/config'.format(arakoon_cluster))
                # Default location, overruled below when the config specifies a tlog_dir
                tlog_location = '/opt/OpenvStorage/db/arakoon/{0}/tlogs'.format(
                    arakoon_cluster)

                # read_tlog_dir
                with remote(node_ip, [Configuration]) as rem:
                    config_contents = rem.Configuration.get(
                        '/ovs/arakoon/{0}/config'.format(arakoon_cluster),
                        raw=True)
                for line in config_contents.splitlines():
                    if 'tlog_dir' in line:
                        tlog_location = line.split()[-1]

                head_db_path = '{0}/{1}'.format(tlog_location, 'head.db')
                nr_of_tlogs = ArakoonCollapse.get_nr_of_tlogs_in_folder(
                    root_client, tlog_location)
                # Stays 0 when there is no head.db yet, which never equals the timestamp read afterwards
                old_headdb_timestamp = 0
                if root_client.file_exists(head_db_path):
                    old_headdb_timestamp = root_client.run(
                        ['stat', '--format=%Y', head_db_path])
                if nr_of_tlogs <= 2:
                    # Presumably generates extra tlogs so there is something to collapse
                    benchmark_command = [
                        'arakoon', '--benchmark', '-n_clients', '1', '-max_n',
                        '5_000', '-config', arakoon_config_path
                    ]
                    root_client.run(benchmark_command)

                ArakoonCollapse.LOGGER.info(
                    "Collapsing arakoon `{0}` on node `{1}` ...".format(
                        arakoon_cluster, node_ip))
                GenericController.collapse_arakoon()

                nr_of_tlogs = ArakoonCollapse.get_nr_of_tlogs_in_folder(
                    root_client, tlog_location)
                new_headdb_timestamp = root_client.run(
                    ['stat', '--format=%Y', head_db_path])

                # perform assertion
                assert nr_of_tlogs <= 2,\
                    'Arakoon collapse left {0} tlogs on the environment, expecting at most 2 in `{1}` on node `{2}`'\
                    .format(nr_of_tlogs, arakoon_cluster, node_ip)
                assert old_headdb_timestamp != new_headdb_timestamp,\
                    'Timestamp of the head_db file was not changed ' \
                    'in the process of collapsing tlogs of arakoon `{0}` on node `{1}`'\
                    .format(arakoon_cluster, node_ip)

                ArakoonCollapse.LOGGER.info(
                    "Successfully collapsed arakoon `{0}` on node `{1}`".
                    format(arakoon_cluster, node_ip))

        ArakoonCollapse.LOGGER.info("Finished validating arakoon collapsing")
Ejemplo n.º 14
0
    def cluster_registry_checkup():
        """
        Verify whether changes have occurred in the cluster registry for each vPool

        For every vPool the registered node configs are compared with the freshly computed
        'cluster_node_config' of each of its StorageDrivers. When a difference is found (or nothing is
        registered yet), the new node configs are written to the cluster registry and every responsive
        StorageDriver is asked to reload them. Failures are recorded per vPool instead of being raised.

        :return: Information whether changes occurred, keyed by vPool guid:
                 {'changes': bool, 'success': bool} plus an 'error' string when success is False
        :rtype: dict
        """
        changed_vpools = {}
        for vpool in VPoolList.get_vpools():
            changed_vpools[vpool.guid] = {'changes': False, 'success': True}
            try:
                StorageDriverController._logger.info(
                    'Validating cluster registry settings for Vpool {0}'.
                    format(vpool.guid))

                current_configs = vpool.clusterregistry_client.get_node_configs(
                )
                # An empty registry always needs to be populated
                changes = len(current_configs) == 0
                node_configs = []
                for sd in vpool.storagedrivers:
                    sd.invalidate_dynamics(['cluster_node_config'])
                    new_config = sd.cluster_node_config
                    node_configs.append(ClusterNodeConfig(**new_config))
                    if changes is False:
                        current_node_configs = [
                            config for config in current_configs
                            if config.vrouter_id == sd.storagedriver_id
                        ]
                        # NOTE(review): a StorageDriver without exactly 1 registered config is not
                        # flagged as a change by this comparison - confirm this is intended
                        if len(current_node_configs) == 1:
                            current_node_config = current_node_configs[0]
                            for key in new_config:
                                if getattr(current_node_config,
                                           key) != new_config[key]:
                                    changes = True
                                    break
                changed_vpools[vpool.guid]['changes'] = changes

                if changes is True:
                    StorageDriverController._logger.info(
                        'Cluster registry settings for Vpool {0} needs to be updated'
                        .format(vpool.guid))
                    available_storagedrivers = []
                    for sd in vpool.storagedrivers:
                        storagerouter = sd.storagerouter
                        try:
                            # Client is only created to find out whether the StorageRouter is reachable
                            SSHClient(storagerouter, username='******')
                        except UnableToConnectException:
                            StorageDriverController._logger.warning(
                                'StorageRouter {0} not available.'.format(
                                    storagerouter.name))
                            continue

                        with remote(storagerouter.ip,
                                    [LocalStorageRouterClient]) as rem:
                            sd_key = '/ovs/vpools/{0}/hosts/{1}/config'.format(
                                vpool.guid, sd.storagedriver_id)
                            if Configuration.exists(sd_key) is True:
                                path = Configuration.get_configuration_path(
                                    sd_key)
                                try:
                                    lsrc = rem.LocalStorageRouterClient(path)
                                    lsrc.server_revision(
                                    )  # 'Cheap' call to verify whether volumedriver is responsive
                                    available_storagedrivers.append(sd)
                                except Exception as ex:
                                    # Unresponsive StorageDrivers are skipped, they do not fail the checkup
                                    if 'ClusterNotReachableException' in str(
                                            ex):
                                        StorageDriverController._logger.warning(
                                            'StorageDriver {0} on StorageRouter {1} not available.'
                                            .format(sd.guid,
                                                    storagerouter.name))
                                    else:
                                        StorageDriverController._logger.exception(
                                            'Got exception when validating StorageDriver {0} on StorageRouter {1}.'
                                            .format(sd.guid,
                                                    storagerouter.name))

                    StorageDriverController._logger.info(
                        'Updating cluster node configs for VPool {0}'.format(
                            vpool.guid))
                    vpool.clusterregistry_client.set_node_configs(node_configs)
                    for sd in available_storagedrivers:
                        StorageDriverController._logger.info(
                            'Trigger config reload for StorageDriver {0}'.
                            format(sd.guid))
                        vpool.storagedriver_client.update_cluster_node_configs(
                            str(sd.storagedriver_id), req_timeout_secs=10)
                    StorageDriverController._logger.info(
                        'Updating cluster node configs for Vpool {0} completed'
                        .format(vpool.guid))
                else:
                    StorageDriverController._logger.info(
                        'Cluster registry settings for Vpool {0} is up to date'
                        .format(vpool.guid))
            except Exception as ex:
                StorageDriverController._logger.exception(
                    'Got exception when validating cluster registry settings for Vpool {0}.'
                    .format(vpool.name))
                changed_vpools[vpool.guid]['success'] = False
                # str(ex) instead of the deprecated ex.message, which is empty for exceptions
                # created with several arguments and does not exist on Python 3
                changed_vpools[vpool.guid]['error'] = str(ex)
        return changed_vpools
Ejemplo n.º 15
0
    def stop_services(self):
        """
        Stop all services related to the Storagedriver

        Stops and removes the StorageDriver and DTL services, destroys the filesystem and erases the
        node configs when this is the last StorageDriver of the vPool (and its config still exists),
        and finally stops and removes the ALBA proxy services. Failures in any step are logged and
        reported through the return value, they do not interrupt the remaining steps.

        :raises RuntimeError: When no StorageRouterInstaller instance has been set
        :return: A boolean indicating whether something went wrong
        :rtype: bool
        """
        if self.sr_installer is None:
            raise RuntimeError('No StorageRouterInstaller instance found')

        root_client = self.sr_installer.root_client
        errors_found = False

        for service in [self.sd_service, self.dtl_service]:
            try:
                if self.service_manager.has_service(name=service, client=root_client):
                    self._logger.debug('StorageDriver {0} - Stopping service {1}'.format(self.storagedriver.guid, service))
                    self.service_manager.stop_service(name=service, client=root_client)
                    self._logger.debug('StorageDriver {0} - Removing service {1}'.format(self.storagedriver.guid, service))
                    self.service_manager.remove_service(name=service, client=root_client)
            except Exception:
                self._logger.exception('StorageDriver {0} - Disabling/stopping service {1} failed'.format(self.storagedriver.guid, service))
                errors_found = True

        sd_config_key = '/ovs/vpools/{0}/hosts/{1}/config'.format(self.vp_installer.vpool.guid, self.storagedriver.storagedriver_id)
        if self.vp_installer.storagedriver_amount <= 1 and Configuration.exists(sd_config_key):
            try:
                # The proxies are (re)started first, presumably because destroying the filesystem needs them
                for proxy in self.storagedriver.alba_proxies:
                    if self.service_manager.has_service(name=proxy.service.name, client=root_client):
                        self._logger.debug('StorageDriver {0} - Starting proxy {1}'.format(self.storagedriver.guid, proxy.service.name))
                        self.service_manager.start_service(name=proxy.service.name, client=root_client)
                        tries = 10
                        running = False
                        port = proxy.service.ports[0]
                        while running is False and tries > 0:
                            self._logger.debug('StorageDriver {0} - Waiting for the proxy {1} to start up'.format(self.storagedriver.guid, proxy.service.name))
                            tries -= 1
                            time.sleep(10 - tries)  # Back off a bit longer on every attempt: 1s, 2s, ..., 10s
                            try:
                                root_client.run(['alba', 'proxy-statistics', '--host', self.storagedriver.storage_ip, '--port', str(port)])
                                running = True
                            except CalledProcessError as ex:
                                self._logger.error('StorageDriver {0} - Fetching alba proxy-statistics failed with error (but ignoring): {1}'.format(self.storagedriver.guid, ex))
                        if running is False:
                            raise RuntimeError('Alba proxy {0} failed to start'.format(proxy.service.name))
                        self._logger.debug('StorageDriver {0} - Alba proxy {1} running'.format(self.storagedriver.guid, proxy.service.name))

                self._logger.debug('StorageDriver {0} - Destroying filesystem and erasing node configs'.format(self.storagedriver.guid))
                with remote(root_client.ip, [LocalStorageRouterClient], username='******') as rem:
                    path = Configuration.get_configuration_path(sd_config_key)
                    storagedriver_client = rem.LocalStorageRouterClient(path)
                    try:
                        storagedriver_client.destroy_filesystem()
                    except RuntimeError as rte:
                        # If backend has already been deleted, we cannot delete the filesystem anymore --> storage leak!!!
                        # str(rte) instead of the deprecated rte.message, which does not exist on Python 3
                        if 'MasterLookupResult.Error' not in str(rte):
                            raise

                self.vp_installer.vpool.clusterregistry_client.erase_node_configs()
            except RuntimeError:
                self._logger.exception('StorageDriver {0} - Destroying filesystem and erasing node configs failed'.format(self.storagedriver.guid))
                errors_found = True

        for proxy in self.storagedriver.alba_proxies:
            service_name = proxy.service.name
            try:
                if self.service_manager.has_service(name=service_name, client=root_client):
                    self._logger.debug('StorageDriver {0} - Stopping service {1}'.format(self.storagedriver.guid, service_name))
                    self.service_manager.stop_service(name=service_name, client=root_client)
                    self._logger.debug('StorageDriver {0} - Removing service {1}'.format(self.storagedriver.guid, service_name))
                    self.service_manager.remove_service(name=service_name, client=root_client)
            except Exception:
                self._logger.exception('StorageDriver {0} - Disabling/stopping service {1} failed'.format(self.storagedriver.guid, service_name))
                errors_found = True

        return errors_found
Ejemplo n.º 16
0
    def run_test(cls, cluster_info, compute_client, vm_info, vm_username=CIConstants.VM_USERNAME, vm_password=CIConstants.VM_PASSWORD,
                 timeout=TEST_TIMEOUT, data_test_cases=CIConstants.DATA_TEST_CASES, logger=LOGGER):
        """
        Runs the test as described in https://github.com/openvstorage/dev_ops/issues/64

        With automatic scrubbing disabled, IO is started on the data vDisks of all VMs while snapshots are
        being taken. Halfway the IO time the snapshots are deleted, scrubbing is started and an MDS issue
        is triggered. Afterwards IO is polled to verify nothing went down and the scrub result is checked.
        All threads, screens, the MDS safety and the scrubbing setting are restored/cleaned up afterwards.

        :param cluster_info: information about the cluster
        :param compute_client: SSHclient of the computenode
        :param vm_info: vm information
        :param vm_username: username to login on all vms
        :param vm_password: password to login on all vms
        :param timeout: timeout in seconds
        :param data_test_cases: data rw ratios to test
        :param logger: logging instance
        :raises Exception: Any failure during the test is logged and re-raised after cleanup
        :return: None
        """
        compute_str = cluster_info['storagerouters']['compute']
        destination_storagedriver = cluster_info['storagedrivers']['destination']
        source_storagedriver = cluster_info['storagedrivers']['source']

        # Cache to validate properties
        values_to_check = {
            'source_std': source_storagedriver.serialize(),
            'target_std': destination_storagedriver.serialize()
        }
        # Prep VM listener #
        failed_configurations = []
        # Extract vdisk info from vm_info - only get the data ones
        vdisk_info = {}
        disk_amount = 0
        for vm_name, vm_object in vm_info.iteritems():
            for vdisk in vm_object['vdisks']:
                if 'vdisk_data' in vdisk.name:
                    vdisk_info.update({vdisk.name: vdisk})
                    disk_amount += 1
        try:
            cls._adjust_automatic_scrubbing(disable=True)
            with remote(compute_str.ip, [SSHClient]) as rem:
                configuration = random.choice(data_test_cases)
                threads = {'evented': {'io': {'pairs': [], 'r_semaphore': None},
                                       'snapshots': {'pairs': [], 'r_semaphore': None}}}
                output_files = []
                safety_set = False  # Only restore the MDS safety when this test has changed it
                try:
                    logger.info('Starting the following configuration: {0}'.format(configuration))
                    for vm_name, vm_data in vm_info.iteritems():
                        vm_client = rem.SSHClient(vm_data['ip'], vm_username, vm_password)
                        vm_client.file_create('/mnt/data/{0}.raw'.format(vm_data['create_msg']))
                        vm_data['client'] = vm_client
                    cls._set_mds_safety(source_storagedriver.vpool, 1, checkup=True)  # Set the safety to trigger the mds
                    safety_set = True
                    io_thread_pairs, monitoring_data, io_r_semaphore = ThreadingHandler.start_io_polling_threads(volume_bundle=vdisk_info)
                    threads['evented']['io']['pairs'] = io_thread_pairs
                    threads['evented']['io']['r_semaphore'] = io_r_semaphore
                    # @todo snapshot every minute
                    threads['evented']['snapshots']['pairs'] = ThreadingHandler.start_snapshotting_threads(volume_bundle=vdisk_info, kwargs={'interval': 15})
                    for vm_name, vm_data in vm_info.iteritems():  # Write data
                        screen_names, output_files = DataWriter.write_data_fio(client=vm_data['client'],
                                                                               fio_configuration={
                                                                                   'io_size': cls.AMOUNT_TO_WRITE,
                                                                                   'configuration': configuration},
                                                                               file_locations=['/mnt/data/{0}.raw'.format(vm_data['create_msg'])])
                        vm_data['screen_names'] = screen_names
                    logger.info('Doing IO for {0}s before bringing down the node.'.format(cls.IO_TIME))
                    # First half of the IO time: IO + snapshotting
                    ThreadingHandler.keep_threads_running(r_semaphore=io_r_semaphore,
                                                          threads=io_thread_pairs,
                                                          shared_resource=monitoring_data,
                                                          duration=cls.IO_TIME / 2)
                    ThreadHelper.stop_evented_threads(threads['evented']['snapshots']['pairs'],
                                                      threads['evented']['snapshots']['r_semaphore'])  # Stop snapshotting
                    cls._delete_snapshots(volume_bundle=vdisk_info)
                    # Start scrubbing thread
                    async_scrubbing = cls.start_scrubbing(volume_bundle=vdisk_info)  # Starting to scrub
                    cls._trigger_mds_issue(cluster_info['vpool'], vdisk_info, destination_storagedriver.storagerouter.guid)  # Trigger mds failover while scrubber is busy
                    # Second half of the IO time: keep monitoring while scrubbing runs
                    ThreadingHandler.keep_threads_running(r_semaphore=io_r_semaphore,
                                                          threads=io_thread_pairs,
                                                          shared_resource=monitoring_data,
                                                          duration=cls.IO_TIME / 2)
                    time.sleep(cls.IO_REFRESH_RATE * 2)
                    downed_time = time.time()
                    # Start IO polling to verify nothing went down
                    ThreadingHandler.poll_io(r_semaphore=io_r_semaphore,
                                             required_thread_amount=len(io_thread_pairs),
                                             shared_resource=monitoring_data,
                                             downed_time=downed_time,
                                             timeout=timeout,
                                             output_files=output_files,
                                             client=compute_client,
                                             disk_amount=disk_amount)
                    possible_scrub_errors = async_scrubbing.get()  # Wait until scrubbing calls have given a result
                    assert len(possible_scrub_errors) == 0, 'Scrubbing has encountered some errors: {0}'.format(', '.join(possible_scrub_errors))
                    cls._validate(values_to_check, monitoring_data)
                except Exception as ex:
                    logger.error('Running the test for configuration {0} has failed because {1}'.format(configuration, str(ex)))
                    failed_configurations.append({'configuration': configuration, 'reason': str(ex)})
                    raise
                finally:
                    for thread_category, thread_collection in threads['evented'].iteritems():
                        ThreadHelper.stop_evented_threads(thread_collection['pairs'], thread_collection['r_semaphore'])
                    for vm_name, vm_data in vm_info.iteritems():
                        for screen_name in vm_data.get('screen_names', []):
                            logger.debug('Stopping screen {0} on {1}.'.format(screen_name, vm_data['client'].ip))
                            vm_data['client'].run(['screen', '-S', screen_name, '-X', 'quit'])
                        vm_data['screen_names'] = []
                    if safety_set is True:
                        cls._set_mds_safety(source_storagedriver.vpool, len(StorageRouterList.get_masters()), checkup=True)
        finally:
            cls._adjust_automatic_scrubbing(disable=False)
        # The entries are dicts, so they have to be stringified before joining them into the message.
        # Failures are re-raised above, which makes this a safety net rather than the main failure path.
        assert len(failed_configurations) == 0, 'Certain configuration failed: {0}'.format(' '.join(str(failed) for failed in failed_configurations))
Ejemplo n.º 17
0
    def shrink_vpool(cls,
                     storagedriver_guid,
                     offline_storage_router_guids=list()):
        """
        Removes a StorageDriver (if its the last StorageDriver for a vPool, the vPool is removed as well)

        The removal goes through the following phases:
          1. Validation: the set of unreachable StorageRouters must match ``offline_storage_router_guids``,
             the vPool mount point must not be in use and at least 1 StorageDriver must be responsive
          2. The vPool status is set to SHRINKING or DELETING, stale vDisks are removed and MDS safety is ensured
          3. Services, cluster registry, node distance map, MDS services, configuration, model and directories
             are cleaned up. Failures in these steps are accumulated and only reported at the end
          4. The vPool (and the internally managed 'voldrv' Arakoon cluster if no vPools are left) is removed
             when this was the last StorageDriver
        :param storagedriver_guid: Guid of the StorageDriver to remove
        :type storagedriver_guid: str
        :param offline_storage_router_guids: Guids of StorageRouters which are offline and will be removed from cluster.
                                             WHETHER VPOOL WILL BE DELETED DEPENDS ON THIS
        :type offline_storage_router_guids: list
        :raises RuntimeError: - If the unreachable StorageRouters do not match the specified offline StorageRouters
                              - If processes keep the vPool mount point occupied
                              - If no responsive StorageDriver could be found
                              - If not all MDS Services have been migrated away
                              - If 1 or more errors occurred during the actual removal
        :return: None
        :rtype: NoneType
        """
        # TODO: Add logging
        # TODO: Unit test individual pieces of code
        # Validations
        storagedriver = StorageDriver(storagedriver_guid)
        storagerouter = storagedriver.storagerouter
        cls._logger.info(
            'StorageDriver {0} - Deleting StorageDriver {1}'.format(
                storagedriver.guid, storagedriver.name))

        vp_installer = VPoolInstaller(name=storagedriver.vpool.name)
        vp_installer.validate(storagedriver=storagedriver)

        sd_installer = StorageDriverInstaller(vp_installer=vp_installer,
                                              storagedriver=storagedriver)

        cls._logger.info(
            'StorageDriver {0} - Checking availability of related StorageRouters'
            .format(storagedriver.guid, storagedriver.name))
        sr_client_map = SSHClient.get_clients(endpoints=[
            sd.storagerouter for sd in vp_installer.vpool.storagedrivers
        ],
                                              user_names=['root'])
        # The root client is None when the StorageRouter of this StorageDriver could not be reached
        sr_installer = StorageRouterInstaller(root_client=sr_client_map.get(
            storagerouter, {}).get('root'),
                                              storagerouter=storagerouter,
                                              vp_installer=vp_installer,
                                              sd_installer=sd_installer)

        # After popping 'offline', the map only contains the reachable StorageRouters
        offline_srs = sr_client_map.pop('offline')
        if sorted([sr.guid for sr in offline_srs
                   ]) != sorted(offline_storage_router_guids):
            raise RuntimeError('Not all StorageRouters are reachable')

        if storagerouter not in offline_srs:
            # Single quotes in the vPool name are escaped since the name ends up inside a quoted shell argument
            mtpt_pids = sr_installer.root_client.run(
                "lsof -t +D '/mnt/{0}' || true".format(
                    vp_installer.name.replace(r"'", r"'\''")),
                allow_insecure=True).splitlines()
            if len(mtpt_pids) > 0:
                raise RuntimeError(
                    'vPool cannot be deleted. Following processes keep the vPool mount point occupied: {0}'
                    .format(', '.join(mtpt_pids)))

        # Retrieve reachable StorageDrivers
        reachable_storagedrivers = []
        for sd in vp_installer.vpool.storagedrivers:
            if sd.storagerouter not in sr_client_map:
                # StorageRouter is offline
                continue

            sd_key = '/ovs/vpools/{0}/hosts/{1}/config'.format(
                vp_installer.vpool.guid, sd.storagedriver_id)
            if Configuration.exists(sd_key) is True:
                path = Configuration.get_configuration_path(sd_key)
                with remote(sd.storagerouter.ip,
                            [LocalStorageRouterClient]) as rem:
                    try:
                        lsrc = rem.LocalStorageRouterClient(path)
                        lsrc.server_revision(
                        )  # 'Cheap' call to verify whether volumedriver is responsive
                        cls._logger.info(
                            'StorageDriver {0} - Responsive StorageDriver {1} on node with IP {2}'
                            .format(storagedriver.guid, sd.name,
                                    sd.storagerouter.ip))
                        reachable_storagedrivers.append(sd)
                    except Exception as exception:
                        # Connection failures only mean the StorageDriver is not responsive, anything else is unexpected
                        if not is_connection_failure(exception):
                            raise

        if len(reachable_storagedrivers) == 0:
            raise RuntimeError(
                'Could not find any responsive node in the cluster')

        # Start removal
        if vp_installer.storagedriver_amount > 1:
            vp_installer.update_status(status=VPool.STATUSES.SHRINKING)
        else:
            vp_installer.update_status(status=VPool.STATUSES.DELETING)

        # Clean up stale vDisks
        cls._logger.info('StorageDriver {0} - Removing stale vDisks'.format(
            storagedriver.guid))
        VDiskController.remove_stale_vdisks(vpool=vp_installer.vpool)

        # Reconfigure the MDSes
        cls._logger.info('StorageDriver {0} - Reconfiguring MDSes'.format(
            storagedriver.guid))
        for vdisk_guid in storagerouter.vdisks_guids:
            try:
                MDSServiceController.ensure_safety(
                    vdisk_guid=vdisk_guid,
                    excluded_storagerouter_guids=[storagerouter.guid] +
                    offline_storage_router_guids)
            except Exception:
                cls._logger.exception(
                    'StorageDriver {0} - vDisk {1} - Ensuring MDS safety failed'
                    .format(storagedriver.guid, vdisk_guid))

        # Validate that all MDSes on current StorageRouter have been moved away
        # Ensure safety does not always throw an error, that's why we perform this check here instead of in the Exception clause of above code
        vdisks = []
        for mds in vp_installer.mds_services:
            for junction in mds.vdisks:
                vdisk = junction.vdisk
                if vdisk in vdisks:
                    continue
                vdisks.append(vdisk)
                cls._logger.critical(
                    'StorageDriver {0} - vDisk {1} {2} - MDS Services have not been migrated away'
                    .format(storagedriver.guid, vdisk.guid, vdisk.name))
        if len(vdisks) > 0:
            # Put back in RUNNING, so it can be used again. Errors keep on displaying in GUI now anyway
            vp_installer.update_status(status=VPool.STATUSES.RUNNING)
            raise RuntimeError(
                'Not all MDS Services have been successfully migrated away')

        # Start with actual removal
        # Errors are accumulated with '|=' so that a single failing step is remembered until the end.
        # (Accumulating with '&=' on an initial False would always yield False and hide every reported error.)
        # NOTE(review): assumes every installer helper below returns True when something went wrong - confirm
        errors_found = False
        if storagerouter not in offline_srs:
            errors_found |= sd_installer.stop_services()

        errors_found |= vp_installer.configure_cluster_registry(
            exclude=[storagedriver], apply_on=reachable_storagedrivers)
        errors_found |= vp_installer.update_node_distance_map()
        errors_found |= vp_installer.remove_mds_services()
        errors_found |= sd_installer.clean_config_management()
        errors_found |= sd_installer.clean_model()

        if storagerouter not in offline_srs:
            errors_found |= sd_installer.clean_directories(
                mountpoints=StorageRouterController.get_mountpoints(
                    client=sr_installer.root_client))

            try:
                DiskController.sync_with_reality(
                    storagerouter_guid=storagerouter.guid)
            except Exception:
                cls._logger.exception(
                    'StorageDriver {0} - Synchronizing disks with reality failed'
                    .format(storagedriver.guid))
                errors_found = True

        if vp_installer.storagedriver_amount > 1:
            # Update the vPool metadata and run DTL checkup
            vp_installer.vpool.metadata['caching_info'].pop(
                sr_installer.storagerouter.guid, None)
            vp_installer.vpool.save()

            try:
                VDiskController.dtl_checkup(vpool_guid=vp_installer.vpool.guid,
                                            ensure_single_timeout=600)
            except Exception:
                # A failing DTL checkup is logged only, it does not mark the removal as failed
                cls._logger.exception(
                    'StorageDriver {0} - DTL checkup failed for vPool {1} with guid {2}'
                    .format(storagedriver.guid, vp_installer.name,
                            vp_installer.vpool.guid))
        else:
            cls._logger.info(
                'StorageDriver {0} - Removing vPool from model'.format(
                    storagedriver.guid))
            # Clean up model
            try:
                vp_installer.vpool.delete()
            except Exception:
                errors_found = True
                cls._logger.exception(
                    'StorageDriver {0} - Cleaning up vPool from the model failed'
                    .format(storagedriver.guid))
            Configuration.delete('/ovs/vpools/{0}'.format(
                vp_installer.vpool.guid))

        cls._logger.info('StorageDriver {0} - Running MDS checkup'.format(
            storagedriver.guid))
        try:
            MDSServiceController.mds_checkup()
        except Exception:
            # A failing MDS checkup is logged only, it does not mark the removal as failed
            cls._logger.exception(
                'StorageDriver {0} - MDS checkup failed'.format(
                    storagedriver.guid))

        # Update vPool status
        if errors_found is True:
            if vp_installer.storagedriver_amount > 1:
                vp_installer.update_status(status=VPool.STATUSES.FAILURE)
            raise RuntimeError(
                '1 or more errors occurred while trying to remove the StorageDriver. Please check the logs for more information'
            )

        if vp_installer.storagedriver_amount > 1:
            vp_installer.update_status(status=VPool.STATUSES.RUNNING)
        cls._logger.info(
            'StorageDriver {0} - Deleted StorageDriver {1}'.format(
                storagedriver.guid, storagedriver.name))

        # When no vPools are left, an internally managed 'voldrv' Arakoon cluster is removed too
        if len(VPoolList.get_vpools()) == 0:
            cluster_name = ArakoonInstaller.get_cluster_name('voldrv')
            if ArakoonInstaller.get_arakoon_metadata_by_cluster_name(
                    cluster_name=cluster_name)['internal'] is True:
                cls._logger.debug(
                    'StorageDriver {0} - Removing Arakoon cluster {1}'.format(
                        storagedriver.guid, cluster_name))
                try:
                    installer = ArakoonInstaller(cluster_name=cluster_name)
                    installer.load()
                    installer.delete_cluster()
                except Exception:
                    cls._logger.exception(
                        'StorageDriver {0} - Delete voldrv Arakoon cluster failed'
                        .format(storagedriver.guid))
                service_type = ServiceTypeList.get_by_name(
                    ServiceType.SERVICE_TYPES.ARAKOON)
                service_name = ArakoonInstaller.get_service_name_for_cluster(
                    cluster_name=cluster_name)
                # Iterate over a copy since services get deleted while looping
                for service in list(service_type.services):
                    if service.name == service_name:
                        service.delete()

        # Remove watcher volumedriver service if last StorageDriver on current StorageRouter
        if len(
                storagerouter.storagedrivers
        ) == 0 and storagerouter not in offline_srs:  # ensure client is initialized for StorageRouter
            try:
                if cls._service_manager.has_service(
                        ServiceFactory.SERVICE_WATCHER_VOLDRV,
                        client=sr_installer.root_client):
                    cls._service_manager.stop_service(
                        ServiceFactory.SERVICE_WATCHER_VOLDRV,
                        client=sr_installer.root_client)
                    cls._service_manager.remove_service(
                        ServiceFactory.SERVICE_WATCHER_VOLDRV,
                        client=sr_installer.root_client)
            except Exception:
                cls._logger.exception(
                    'StorageDriver {0} - {1} service deletion failed'.format(
                        storagedriver.guid,
                        ServiceFactory.SERVICE_WATCHER_VOLDRV))
Ejemplo n.º 18
0
    def remove_node(node_ip, silent=None):
        """
        Remove the node with specified IP from the cluster

        The removal consists of 3 phases, each guarded by its own error handling which exits the process with
        exit code 1 when something goes wrong or when the user interrupts:
          1. Validations: no changes are applied to the cluster yet
          2. Confirmations: only asked when not running with '--force-yes'
          3. Removal: vPools, services, model and configuration related to the node are removed
        :param node_ip: IP of the node to remove
        :type node_ip: str
        :param silent: If silent == '--force-yes' no question will be asked to confirm the removal
        :type silent: str
        :return: None
        """
        from ovs.dal.lists.storagerouterlist import StorageRouterList
        from ovs.lib.storagedriver import StorageDriverController
        from ovs.lib.vpool import VPoolController

        Toolbox.log(logger=NodeRemovalController._logger,
                    messages='Remove node',
                    boxed=True)
        Toolbox.log(
            logger=NodeRemovalController._logger,
            messages=
            'WARNING: Some of these steps may take a very long time, please check the logs for more information\n\n'
        )
        service_manager = ServiceFactory.get_manager()

        ###############
        # VALIDATIONS #
        ###############
        try:
            # Verify the type before calling string methods, otherwise the type check can never reject anything
            if not isinstance(node_ip, str):
                raise ValueError('Node IP must be a string')
            node_ip = node_ip.strip()
            if not re.match(SSHClient.IP_REGEX, node_ip):
                raise ValueError('Invalid IP {0} specified'.format(node_ip))

            storage_router_all = sorted(StorageRouterList.get_storagerouters(),
                                        key=lambda k: k.name)
            storage_router_masters = StorageRouterList.get_masters()
            storage_router_all_ips = set(
                [storage_router.ip for storage_router in storage_router_all])
            storage_router_master_ips = set([
                storage_router.ip for storage_router in storage_router_masters
            ])
            storage_router_to_remove = StorageRouterList.get_by_ip(node_ip)
            offline_reasons = {}
            if node_ip not in storage_router_all_ips:
                raise ValueError(
                    'Unknown IP specified\nKnown in model:\n - {0}\nSpecified for removal:\n - {1}'
                    .format('\n - '.join(storage_router_all_ips), node_ip))

            if len(storage_router_all_ips) == 1:
                raise RuntimeError("Removing the only node is not possible")

            if node_ip in storage_router_master_ips and len(
                    storage_router_master_ips) == 1:
                raise RuntimeError(
                    "Removing the only master node is not possible")

            if System.get_my_storagerouter() == storage_router_to_remove:
                raise RuntimeError(
                    'The node to be removed cannot be identical to the node on which the removal is initiated'
                )

            Toolbox.log(
                logger=NodeRemovalController._logger,
                messages='Creating SSH connections to remaining master nodes')
            master_ip = None
            ip_client_map = {}
            storage_routers_offline = []
            storage_router_to_remove_online = True
            for storage_router in storage_router_all:
                try:
                    client = SSHClient(storage_router,
                                       username='******',
                                       timeout=10)
                except (UnableToConnectException, NotAuthenticatedException,
                        TimeOutException) as ex:
                    # Unreachable nodes are tracked as offline together with the reason, they do not abort the validation
                    if isinstance(ex, UnableToConnectException):
                        msg = 'Unable to connect'
                    elif isinstance(ex, NotAuthenticatedException):
                        msg = 'Could not authenticate'
                    elif isinstance(ex, TimeOutException):
                        msg = 'Connection timed out'
                    Toolbox.log(
                        logger=NodeRemovalController._logger,
                        messages='  * Node with IP {0:<15}- {1}'.format(
                            storage_router.ip, msg))
                    offline_reasons[storage_router.ip] = msg
                    storage_routers_offline.append(storage_router)
                    if storage_router == storage_router_to_remove:
                        storage_router_to_remove_online = False
                    continue

                Toolbox.log(
                    logger=NodeRemovalController._logger,
                    messages='  * Node with IP {0:<15}- Successfully connected'
                    .format(storage_router.ip))
                ip_client_map[storage_router.ip] = client
                # Remember a reachable master which is not the node being removed
                if storage_router != storage_router_to_remove and storage_router.node_type == 'MASTER':
                    master_ip = storage_router.ip

            if len(ip_client_map) == 0 or master_ip is None:
                raise RuntimeError(
                    'Could not connect to any master node in the cluster')

            storage_router_to_remove.invalidate_dynamics('vdisks_guids')
            if len(
                    storage_router_to_remove.vdisks_guids
            ) > 0:  # vDisks are supposed to be moved away manually before removing a node
                raise RuntimeError(
                    "Still vDisks attached to Storage Router {0}".format(
                        storage_router_to_remove.name))

            internal_memcached = Toolbox.is_service_internally_managed(
                service='memcached')
            internal_rabbit_mq = Toolbox.is_service_internally_managed(
                service='rabbitmq')
            memcached_endpoints = Configuration.get(
                key='/ovs/framework/memcache|endpoints')
            rabbit_mq_endpoints = Configuration.get(
                key='/ovs/framework/messagequeue|endpoints')
            # Work on copies to determine which endpoints would remain once the node is removed
            copy_memcached_endpoints = list(memcached_endpoints)
            copy_rabbit_mq_endpoints = list(rabbit_mq_endpoints)
            for endpoint in memcached_endpoints:
                if endpoint.startswith(storage_router_to_remove.ip):
                    copy_memcached_endpoints.remove(endpoint)
            for endpoint in rabbit_mq_endpoints:
                if endpoint.startswith(storage_router_to_remove.ip):
                    copy_rabbit_mq_endpoints.remove(endpoint)
            if len(copy_memcached_endpoints
                   ) == 0 and internal_memcached is True:
                raise RuntimeError(
                    'Removal of provided nodes will result in a complete removal of the memcached service'
                )
            if len(copy_rabbit_mq_endpoints
                   ) == 0 and internal_rabbit_mq is True:
                raise RuntimeError(
                    'Removal of provided nodes will result in a complete removal of the messagequeue service'
                )

            Toolbox.run_hooks(component='noderemoval',
                              sub_component='validate_removal',
                              logger=NodeRemovalController._logger,
                              cluster_ip=storage_router_to_remove.ip)
        except KeyboardInterrupt:
            Toolbox.log(logger=NodeRemovalController._logger, messages='\n')
            Toolbox.log(
                logger=NodeRemovalController._logger,
                messages=
                'Removal has been aborted during the validation step. No changes have been applied.',
                boxed=True,
                loglevel='warning')
            sys.exit(1)
        except Exception as exception:
            Toolbox.log(logger=NodeRemovalController._logger,
                        messages=[str(exception)],
                        boxed=True,
                        loglevel='exception')
            sys.exit(1)

        #################
        # CONFIRMATIONS #
        #################
        try:
            interactive = silent != '--force-yes'
            remove_asd_manager = not interactive  # Remove ASD manager if non-interactive else ask
            if interactive is True:
                if len(storage_routers_offline) > 0:
                    Toolbox.log(
                        logger=NodeRemovalController._logger,
                        messages=
                        'Certain nodes appear to be offline. These will not fully removed and will cause issues if they are not really offline.'
                    )
                    Toolbox.log(
                        logger=NodeRemovalController._logger,
                        messages='Offline nodes: {0}'.format(''.join(
                            ('\n  * {0:<15}- {1}.'.format(ip, message)
                             for ip, message in offline_reasons.iteritems()))))
                    valid_node_info = Interactive.ask_yesno(
                        message=
                        'Continue the removal with these being presumably offline?',
                        default_value=False)
                    if valid_node_info is False:
                        Toolbox.log(
                            logger=NodeRemovalController._logger,
                            messages=
                            'Please validate the state of the nodes before removing.',
                            title=True)
                        # SystemExit does not inherit from Exception, so it is not caught by the handlers below
                        sys.exit(1)
                proceed = Interactive.ask_yesno(
                    message='Are you sure you want to remove node {0}?'.format(
                        storage_router_to_remove.name),
                    default_value=False)
                if proceed is False:
                    Toolbox.log(logger=NodeRemovalController._logger,
                                messages='Abort removal',
                                title=True)
                    sys.exit(1)

                # Default to removing the ASD manager, only ask when the node is online and has the service
                remove_asd_manager = True
                if storage_router_to_remove_online is True:
                    client = SSHClient(endpoint=storage_router_to_remove,
                                       username='******')
                    if service_manager.has_service(name='asd-manager',
                                                   client=client):
                        remove_asd_manager = Interactive.ask_yesno(
                            message=
                            'Do you also want to remove the ASD manager and related ASDs?',
                            default_value=False)

                if remove_asd_manager is True or storage_router_to_remove_online is False:
                    for fct in Toolbox.fetch_hooks('noderemoval',
                                                   'validate_asd_removal'):
                        validation_output = fct(storage_router_to_remove.ip)
                        if validation_output['confirm'] is True:
                            # The first declined question cancels the ASD manager removal
                            if Interactive.ask_yesno(
                                    message=validation_output['question'],
                                    default_value=False) is False:
                                remove_asd_manager = False
                                break
        except KeyboardInterrupt:
            Toolbox.log(logger=NodeRemovalController._logger, messages='\n')
            Toolbox.log(
                logger=NodeRemovalController._logger,
                messages=
                'Removal has been aborted during the confirmation step. No changes have been applied.',
                boxed=True,
                loglevel='warning')
            sys.exit(1)
        except Exception as exception:
            Toolbox.log(logger=NodeRemovalController._logger,
                        messages=[str(exception)],
                        boxed=True,
                        loglevel='exception')
            sys.exit(1)
        ###########
        # REMOVAL #
        ###########
        try:
            Toolbox.log(logger=NodeRemovalController._logger,
                        messages='Starting removal of node {0} - {1}'.format(
                            storage_router_to_remove.name,
                            storage_router_to_remove.ip))
            if storage_router_to_remove_online is False:
                Toolbox.log(
                    logger=NodeRemovalController._logger,
                    messages=
                    '  Marking all Storage Drivers served by Storage Router {0} as offline'
                    .format(storage_router_to_remove.ip))
                StorageDriverController.mark_offline(
                    storagerouter_guid=storage_router_to_remove.guid)

            # Remove vPools
            Toolbox.log(logger=NodeRemovalController._logger,
                        messages='  Removing vPools from node'.format(
                            storage_router_to_remove.ip))
            # The node being removed is excluded from the offline guids handed to the vPool shrink
            storage_routers_offline_guids = [
                sr.guid for sr in storage_routers_offline
                if sr.guid != storage_router_to_remove.guid
            ]
            for storage_driver in storage_router_to_remove.storagedrivers:
                Toolbox.log(logger=NodeRemovalController._logger,
                            messages='    Removing vPool {0} from node'.format(
                                storage_driver.vpool.name))
                VPoolController.shrink_vpool(
                    storagedriver_guid=storage_driver.guid,
                    offline_storage_router_guids=storage_routers_offline_guids)

            # Demote if MASTER
            if storage_router_to_remove.node_type == 'MASTER':
                NodeTypeController.demote_node(
                    cluster_ip=storage_router_to_remove.ip,
                    master_ip=master_ip,
                    ip_client_map=ip_client_map,
                    unique_id=storage_router_to_remove.machine_id,
                    unconfigure_memcached=internal_memcached,
                    unconfigure_rabbitmq=internal_rabbit_mq,
                    offline_nodes=storage_routers_offline)

            # Stop / remove services
            Toolbox.log(logger=NodeRemovalController._logger,
                        messages='Stopping and removing services')
            if storage_router_to_remove_online is True:
                client = SSHClient(endpoint=storage_router_to_remove,
                                   username='******')
                NodeRemovalController.remove_services(
                    client=client,
                    node_type=storage_router_to_remove.node_type.lower(),
                    logger=NodeRemovalController._logger)
                service = 'watcher-config'
                if service_manager.has_service(service, client=client):
                    Toolbox.log(
                        logger=NodeRemovalController._logger,
                        messages='Removing service {0}'.format(service))
                    service_manager.stop_service(service, client=client)
                    service_manager.remove_service(service, client=client)

            Toolbox.run_hooks(component='noderemoval',
                              sub_component='remove',
                              logger=NodeRemovalController._logger,
                              cluster_ip=storage_router_to_remove.ip,
                              complete_removal=remove_asd_manager)

            # Clean up model
            Toolbox.log(logger=NodeRemovalController._logger,
                        messages='Removing node from model')
            for service in storage_router_to_remove.services:
                service.delete()
            for disk in storage_router_to_remove.disks:
                for partition in disk.partitions:
                    partition.delete()
                disk.delete()
            for j_domain in storage_router_to_remove.domains:
                j_domain.delete()
            Configuration.delete('/ovs/framework/hosts/{0}'.format(
                storage_router_to_remove.machine_id))

            NodeTypeController.restart_framework_and_memcache_services(
                clients=ip_client_map,
                offline_node_ips=[node.ip for node in storage_routers_offline],
                logger=NodeRemovalController._logger)

            if storage_router_to_remove_online is True:
                client = SSHClient(endpoint=storage_router_to_remove,
                                   username='******')
                client.file_delete(filenames=[CACC_LOCATION])
                client.file_delete(filenames=[CONFIG_STORE_LOCATION])
            storage_router_to_remove.delete()
            Toolbox.log(logger=NodeRemovalController._logger,
                        messages='Successfully removed node\n')
        except Exception as exception:
            Toolbox.log(logger=NodeRemovalController._logger, messages='\n')
            Toolbox.log(
                logger=NodeRemovalController._logger,
                messages=['An unexpected error occurred:',
                          str(exception)],
                boxed=True,
                loglevel='exception')
            sys.exit(1)
        except KeyboardInterrupt:
            # Unlike the previous phases, changes may already have been applied at this point
            Toolbox.log(logger=NodeRemovalController._logger, messages='\n')
            Toolbox.log(
                logger=NodeRemovalController._logger,
                messages=
                'This setup was aborted. Open vStorage may be in an inconsistent state, make sure to validate the installation.',
                boxed=True,
                loglevel='error')
            sys.exit(1)

        # The ASD manager can only be removed through the node itself, so the node must be reachable
        if remove_asd_manager is True and storage_router_to_remove_online is True:
            Toolbox.log(logger=NodeRemovalController._logger,
                        messages='\nRemoving ASD Manager')
            with remote(storage_router_to_remove.ip, [os]) as rem:
                rem.os.system('asd-manager remove --force-yes')
        Toolbox.log(logger=NodeRemovalController._logger,
                    messages='Remove nodes finished',
                    title=True)
Ejemplo n.º 19
0
    def run_test(cls, vm_info, cluster_info, logger=LOGGER):
        """
        Tests the DTL using a virtual machine which will write in his own filesystem
        Expects last data to be pulled from the DTL and not backend
        :param cluster_info: information about the cluster, contains all dal objects
        :type cluster_info: dict
        :param vm_info: info about the vms
        :param logger: logging instance
        :return: None
        :rtype: NoneType
        """
        source_std = cluster_info['storagedrivers']['source']
        source_client = SSHClient(source_std.storagerouter, username='******')

        compute_str = cluster_info['storagerouters']['compute']
        compute_client = SSHClient(compute_str)

        # setup hypervisor details
        parent_hypervisor = HypervisorFactory().get()
        vm_to_stop = cls.HYPERVISOR_INFO['vms'][source_std.storage_ip]['name']

        vdisk_info = {}
        disk_amount = 0
        for vm_name, vm_object in vm_info.iteritems():
            for vdisk in vm_object['vdisks']:
                # Ignore the cd vdisk as no IO will come from it
                if vdisk.name == vm_object['cd_path'].replace(
                        '.raw', '').split('/')[-1]:
                    continue
                disk_amount += 1
                vdisk_info.update({vdisk.name: vdisk})

        # Cache to validate properties
        values_to_check = {
            'source_std': source_std.serialize(),
            'vdisks': vdisk_info.values()
        }

        with remote(compute_str.ip, [SSHClient]) as rem:
            threads = {'evented': {'io': {'pairs': [], 'r_semaphore': None}}}
            vm_downed = False
            output_files = []
            try:
                for vm_name, vm_data in vm_info.iteritems():
                    vm_client = rem.SSHClient(vm_data['ip'], cls.VM_USERNAME,
                                              cls.VM_PASSWORD)
                    vm_client.file_create('/mnt/data/{0}.raw'.format(
                        vm_data['create_msg']))
                    vm_data['client'] = vm_client
                    # Load dd, md5sum, screen & fio in memory
                    vm_data['client'].run([
                        'dd', 'if=/dev/urandom',
                        'of={0}'.format(cls.VM_RANDOM), 'bs=1M', 'count=2'
                    ])
                    vm_data['client'].run(['md5sum', cls.VM_RANDOM])

                logger.info("Stopping proxy services")
                service_manager = ServiceFactory.get_manager()

                for proxy in source_std.alba_proxies:
                    service_manager.restart_service(proxy.service.name,
                                                    client=source_client)

                logger.info(
                    'Starting to WRITE file while proxy is offline. All data should be stored in the DTL!'
                )
                for vm_name, vm_data in vm_info.iteritems():
                    vm_data['client'].run(
                        'dd if=/dev/urandom of={0} bs=1M count=2'.format(
                            cls.VM_FILENAME).split())
                    original_md5sum = ' '.join(vm_data['client'].run(
                        ['md5sum', cls.VM_FILENAME]).split())
                    vm_data['original_md5sum'] = original_md5sum
                    logger.info('Original MD5SUM for VM {0}: {1}.'.format(
                        vm_name, original_md5sum))
                logger.info('Finished to WRITE file while proxy is offline!')
                logger.info(
                    "Starting fio to generate IO for failing over.".format(
                        cls.IO_TIME))
                io_thread_pairs, monitoring_data, io_r_semaphore = ThreadingHandler.start_io_polling_threads(
                    volume_bundle=vdisk_info)
                threads['evented']['io']['pairs'] = io_thread_pairs
                threads['evented']['io']['r_semaphore'] = io_r_semaphore
                for vm_name, vm_data in vm_info.iteritems():  # Write data
                    screen_names, output_files = DataWriter.write_data_fio(
                        client=vm_data['client'],
                        fio_configuration={
                            'io_size': cls.AMOUNT_TO_WRITE,
                            'configuration': cls.IO_PATTERN
                        },
                        file_locations=[
                            '/mnt/data/{0}.raw'.format(vm_data['create_msg'])
                        ])
                    vm_data['screen_names'] = screen_names
                logger.info(
                    'Doing IO for {0}s before bringing down the node.'.format(
                        cls.IO_TIME))
                ThreadingHandler.keep_threads_running(
                    r_semaphore=io_r_semaphore,
                    threads=io_thread_pairs,
                    shared_resource=monitoring_data,
                    duration=cls.IO_TIME)
                ##############################################
                # Bringing original owner of the volume down #
                ##############################################
                VMHandler.stop_vm(hypervisor=parent_hypervisor,
                                  vmid=vm_to_stop)
                vm_downed = True
                downed_time = time.time()
                time.sleep(cls.IO_REFRESH_RATE * 2)
                # Start IO polling to verify nothing went down
                ThreadingHandler.poll_io(
                    r_semaphore=io_r_semaphore,
                    required_thread_amount=len(io_thread_pairs),
                    shared_resource=monitoring_data,
                    downed_time=downed_time,
                    timeout=cls.HA_TIMEOUT,
                    output_files=output_files,
                    client=compute_client,
                    disk_amount=disk_amount)
                logger.info('Starting to validate move...')
                cls._validate_move(values_to_check)
                logger.info('Finished validating move!')
                logger.info('Validate if DTL is working correctly!')
                unmatching_checksum_vms = []
                for vm_name, vm_data in vm_info.iteritems():
                    current_md5sum = ' '.join(vm_data['client'].run(
                        ['md5sum', cls.VM_FILENAME]).split())
                    if vm_data['original_md5sum'] != current_md5sum:
                        unmatching_checksum_vms.append(vm_name)
                assert len(
                    unmatching_checksum_vms
                ) == 0, 'Not all data was read from the DTL. Checksums do not line up for {}'.format(
                    ', '.join(unmatching_checksum_vms))
                logger.info('DTL is working correctly!')
            finally:
                for thread_category, thread_collection in threads[
                        'evented'].iteritems():
                    ThreadHelper.stop_evented_threads(
                        thread_collection['pairs'],
                        thread_collection['r_semaphore'])
                if vm_downed is True:
                    VMHandler.start_vm(parent_hypervisor, vm_to_stop)
                    logger.debug('Started {0}'.format(vm_to_stop))
                    SystemHelper.idle_till_ovs_is_up(source_std.storage_ip,
                                                     **cls.get_shell_user())
                    # @TODO: Remove when https://github.com/openvstorage/integrationtests/issues/540 is fixed
                    FwkHandler.restart_all()
                for vm_name, vm_data in vm_info.iteritems():
                    for screen_name in vm_data.get('screen_names', []):
                        logger.debug('Stopping screen {0} on {1}.'.format(
                            screen_name, vm_data['client'].ip))
                        vm_data['client'].run(
                            ['screen', '-S', screen_name, '-X', 'quit'])
                    vm_data['screen_names'] = []
Ejemplo n.º 20
0
    def save(self, client=None, force_reload=False):
        """
        Store the configuration in the configuration management and ask the volumedriver to apply it.

        The configuration is always written (as indented JSON) under ``self.key``. The volumedriver is only
        contacted when there are dirty entries or when a reload is forced. A failure whose message contains
        'ClusterNotReachableException' is tolerated: in that case no changes are reported and a warning is logged.
        In every return path ``self.is_new`` is reset to False; ``self.dirty_entries`` is cleared whenever the
        volumedriver was contacted (or was attempted to be contacted).

        :param client: If provided, apply the configuration on the node this client points to instead of locally
        :type client: ovs_extensions.generic.sshclient.SSHClient
        :param force_reload: Make sure the 'update_configuration' gets triggered. Should be used when configuration changes have been applied from 'outside'
        :type force_reload: bool
        :raises RuntimeError: When 'update_configuration' returns an entry which is not a dict, misses one of the
                              expected keys, or (without force_reload) reports a parameter that was not marked dirty
        :return: Changes to the configuration
        :rtype: list
        """
        serialized_config = json.dumps(self.configuration, indent=4)
        Configuration.set(self.key, serialized_config, raw=True)

        # Nothing was modified and nobody asked for a reload: no need to bother the volumedriver
        if len(self.dirty_entries) == 0 and force_reload is False:
            self._logger.debug('No need to apply changes, nothing changed')
            self.is_new = False
            return []

        if client is None:
            location_suffix = ''
        else:
            location_suffix = ' on {0}'.format(client.ip)
        self._logger.info(
            'Applying local storagedriver configuration changes{0}'.format(location_suffix))

        # Ask the volumedriver (local or remote) to reload its configuration and report what changed
        applied_changes = []
        driver_reached = False
        try:
            if client is not None:
                with remote(client.ip, [LocalStorageRouterClient]) as rem:
                    remote_lsrc = rem.LocalStorageRouterClient(self.remote_path)
                    # Copy the result so it remains usable once the remote context is closed
                    applied_changes = copy.deepcopy(
                        remote_lsrc.update_configuration(self.remote_path))
            else:
                local_lsrc = LocalStorageRouterClient(self.remote_path)
                applied_changes = local_lsrc.update_configuration(self.remote_path)
            driver_reached = True
        except Exception as ex:
            # An unreachable cluster is tolerated, anything else is a real failure
            if 'ClusterNotReachableException' not in str(ex):
                raise

        # The volumedriver reported nothing (or could not be reached)
        if len(applied_changes) == 0:
            if driver_reached is not True:
                self._logger.warning(
                    'Changes were not applied since StorageDriver is unavailable')
            elif len(self.dirty_entries) > 0:
                pending = ', '.join(self.dirty_entries)
                self._logger.warning(
                    'Following changes were not applied: {0}'.format(pending))
            self.is_new = False
            self.dirty_entries = []
            return applied_changes

        # Validate every reported change and tick it off against the dirty entries
        mandatory_keys = ('param_name', 'old_value', 'new_value')
        for entry in applied_changes:
            if not isinstance(entry, dict):
                raise RuntimeError('Unexpected update_configuration output')
            if any(mandatory_key not in entry for mandatory_key in mandatory_keys):
                received_keys = ', '.join(entry.keys())
                raise RuntimeError(
                    'Unexpected update_configuration output. Expected different keys, but got {0}'.format(received_keys))

            changed_param = entry['param_name']
            if force_reload is False:
                # Without a forced reload, only parameters that were marked dirty are allowed to change
                if changed_param not in self.dirty_entries:
                    raise RuntimeError(
                        'Unexpected configuration change: {0}'.format(changed_param))
                self.dirty_entries.remove(changed_param)
            self._logger.info('Changed {0} from "{1}" to "{2}"'.format(
                changed_param, entry['old_value'], entry['new_value']))

        self._logger.info('Changes applied')
        if len(self.dirty_entries) > 0:
            leftovers = ', '.join(self.dirty_entries)
            self._logger.warning(
                'Following changes were not applied: {0}'.format(leftovers))
        self.is_new = False
        self.dirty_entries = []
        return applied_changes
Ejemplo n.º 21
0
    def _get_update_information_cluster_alba(cls, client, update_info, package_info):
        """
        In this function the services for each component / package combination are defined
        This service information consists out of:
            * Services to stop (before update) and start (after update of packages) -> 'services_stop_start'
            * Services to restart after update (post-update logic)                  -> 'services_post_update'
            * Down-times which will be caused due to service restarts               -> 'downtime'
            * Prerequisites that have not been met                                  -> 'prerequisites'

        Verify whether all relevant services have the correct binary active
        Whether a service has the correct binary version in use, we use the ServiceFactory.get_service_update_versions functionality
        When a service has an older binary version running, we add this information to the 'update_info'

        This combined information is then stored in the 'package_information' of the StorageRouter DAL object

        The passed in 'update_info' is modified in place (under the key client.ip). Any exception raised while
        gathering the information is logged and appended to update_info[client.ip]['errors'] instead of being raised

        :param client: SSHClient on which to retrieve the service information required for an update
        :type client: ovs.extensions.generic.sshclient.SSHClient
        :param update_info: Dictionary passed in by the thread calling this function used to store all update information
        :type update_info: dict
        :param package_info: Dictionary containing the components and packages which have an update available for current SSHClient
        :type package_info: dict
        :return: None
        :rtype: NoneType
        """
        cls._logger.info('StorageRouter {0}: Refreshing ALBA update information'.format(client.ip))
        try:
            binaries = cls._package_manager.get_binary_versions(client=client)
            storagerouter = StorageRouterList.get_by_ip(ip=client.ip)
            cls._logger.debug('StorageRouter {0}: Binary versions: {1}'.format(client.ip, binaries))

            # Retrieve Arakoon information
            # Maps the Arakoon service name to its downtime entry (['backend', <ALBA Backend name>]) or None when no downtime is expected
            arakoon_info = {}
            for service in storagerouter.services:
                if service.type.name not in [ServiceType.SERVICE_TYPES.ALBA_MGR, ServiceType.SERVICE_TYPES.NS_MGR]:
                    continue

                if service.type.name == ServiceType.SERVICE_TYPES.ALBA_MGR:
                    cluster_name = service.abm_service.abm_cluster.name
                    alba_backend_name = service.abm_service.abm_cluster.alba_backend.name
                else:
                    cluster_name = service.nsm_service.nsm_cluster.name
                    alba_backend_name = service.nsm_service.nsm_cluster.alba_backend.name

                cls._logger.debug('StorageRouter {0}: Retrieving update information for Arakoon cluster {1}'.format(client.ip, cluster_name))
                arakoon_update_info = ArakoonInstaller.get_arakoon_update_info(cluster_name=cluster_name)
                cls._logger.debug('StorageRouter {0}: Arakoon update information for cluster {1}: {2}'.format(client.ip, cluster_name, arakoon_update_info))
                # Only Arakoon clusters flagged as 'internal' are taken into account
                if arakoon_update_info['internal'] is True:
                    arakoon_info[arakoon_update_info['service_name']] = ['backend', alba_backend_name] if arakoon_update_info['downtime'] is True else None

            for component, package_names in PackageFactory.get_package_info()['names'].iteritems():
                package_names = sorted(package_names)
                cls._logger.debug('StorageRouter {0}: Validating component {1} and related packages: {2}'.format(client.ip, component, package_names))

                # Make sure an entry exists for this component before filling it in
                if component not in update_info[client.ip]:
                    update_info[client.ip][component] = copy.deepcopy(ServiceFactory.DEFAULT_UPDATE_ENTRY)
                svc_component_info = update_info[client.ip][component]
                pkg_component_info = package_info.get(component, {})

                for package_name in package_names:
                    cls._logger.debug('StorageRouter {0}: Validating ALBA plugin related package {1}'.format(client.ip, package_name))
                    if package_name == PackageFactory.PKG_OVS_BACKEND and package_name in pkg_component_info:
                        # An update of the backend plugin package impacts GUI and API
                        if ['gui', None] not in svc_component_info['downtime']:
                            svc_component_info['downtime'].append(['gui', None])
                        if ['api', None] not in svc_component_info['downtime']:
                            svc_component_info['downtime'].append(['api', None])
                        svc_component_info['services_stop_start'][10].append('ovs-watcher-framework')
                        svc_component_info['services_stop_start'][20].append('memcached')
                        cls._logger.debug('StorageRouter {0}: Added services "ovs-watcher-framework" and "memcached" to stop-start services'.format(client.ip))
                        cls._logger.debug('StorageRouter {0}: Added GUI and API to downtime'.format(client.ip))

                    elif package_name in [PackageFactory.PKG_ALBA, PackageFactory.PKG_ALBA_EE]:
                        # Retrieve proxy service information
                        for service in storagerouter.services:
                            if service.type.name != ServiceType.SERVICE_TYPES.ALBA_PROXY or service.alba_proxy is None:
                                continue

                            # Only look up the running binary version when the package itself has no update available
                            service_version = None
                            if package_name not in pkg_component_info:
                                service_version = ServiceFactory.get_service_update_versions(client=client,
                                                                                             service_name=service.name,
                                                                                             binary_versions=binaries)

                            cls._logger.debug('StorageRouter {0}: Service {1} is running version {2}'.format(client.ip, service.name, service_version))
                            if package_name in pkg_component_info or service_version is not None:
                                if service_version is not None and package_name not in svc_component_info['packages']:
                                    svc_component_info['packages'][package_name] = service_version
                                svc_component_info['services_post_update'][10].append('ovs-{0}'.format(service.name))
                                cls._logger.debug('StorageRouter {0}: Added service {1} to post-update services'.format(client.ip, 'ovs-{0}'.format(service.name)))

                                downtime = ['proxy', service.alba_proxy.storagedriver.vpool.name]
                                if downtime not in svc_component_info['downtime']:
                                    svc_component_info['downtime'].append(downtime)
                                    cls._logger.debug('StorageRouter {0}: Added ALBA proxy downtime for vPool {1} to downtime'.format(client.ip, service.alba_proxy.storagedriver.vpool.name))

                    # Deliberately a separate 'if': ALBA packages are handled both above (proxies) and here (Arakoon services)
                    if package_name in [PackageFactory.PKG_ALBA, PackageFactory.PKG_ALBA_EE, PackageFactory.PKG_ARAKOON]:
                        for service_name, downtime in arakoon_info.iteritems():
                            service_version = ServiceFactory.get_service_update_versions(client=client,
                                                                                         service_name=service_name,
                                                                                         binary_versions=binaries,
                                                                                         package_name=package_name)
                            cls._logger.debug('StorageRouter {0}: Arakoon service {1} information: {2}'.format(client.ip, service_name, service_version))

                            if package_name in pkg_component_info or service_version is not None:
                                svc_component_info['services_post_update'][10].append('ovs-{0}'.format(service_name))
                                cls._logger.debug('StorageRouter {0}: Added service {1} to post-update services'.format(client.ip, 'ovs-{0}'.format(service_name)))
                                if service_version is not None and package_name not in svc_component_info['packages']:
                                    svc_component_info['packages'][package_name] = service_version
                                if downtime is not None and downtime not in svc_component_info['downtime']:
                                    svc_component_info['downtime'].append(downtime)
                                    cls._logger.debug('StorageRouter {0}: Added Arakoon cluster for ALBA Backend {1} to downtime'.format(client.ip, downtime[1]))

                    # Extend the service information with the package information related to this repository for current StorageRouter
                    if package_name in pkg_component_info and package_name not in svc_component_info['packages']:
                        cls._logger.debug('StorageRouter {0}: Adding package {1} because it has an update available'.format(client.ip, package_name))
                        svc_component_info['packages'][package_name] = pkg_component_info[package_name]

                if component == PackageFactory.COMP_ALBA:
                    # Every ALBA node which fails to return its metadata is registered as an unmet prerequisite
                    for alba_node in AlbaNodeList.get_albanodes():
                        try:
                            alba_node.client.get_metadata()
                        except Exception:
                            # Only regular errors mark the node as unresponsive. A bare 'except' would also swallow
                            # KeyboardInterrupt / SystemExit and wrongly report them as an unresponsive node
                            svc_component_info['prerequisites'].append(['alba_node_unresponsive', alba_node.ip])
                            cls._logger.debug('StorageRouter {0}: Added unresponsive ALBA Node {1} to prerequisites'.format(client.ip, alba_node.ip))

                # Verify whether migration (DAL and extension) code needs to be executed (only if no packages have an update available so far)
                elif component == PackageFactory.COMP_FWK and PackageFactory.PKG_OVS_BACKEND not in svc_component_info['packages']:
                    cls._logger.debug('StorageRouter {0}: No updates detected, checking for required migrations'.format(client.ip))
                    # Extension migration check
                    key = '/ovs/framework/hosts/{0}/versions'.format(System.get_my_machine_id(client=client))
                    old_version = Configuration.get(key, default={}).get(PackageFactory.COMP_MIGRATION_ALBA)
                    installed_version = str(cls._package_manager.get_installed_versions(client=client,
                                                                                        package_names=[PackageFactory.PKG_OVS_BACKEND])[PackageFactory.PKG_OVS_BACKEND])
                    migrations_detected = False
                    if old_version is not None:
                        cls._logger.debug('StorageRouter {0}: Current running version for {1} extension migrations: {2}'.format(client.ip, PackageFactory.COMP_ALBA, old_version))
                        with remote(client.ip, [ExtensionMigrator]) as rem:
                            cls._logger.debug('StorageRouter {0}: Available version for {1} extension migrations: {2}'.format(client.ip, PackageFactory.COMP_ALBA, rem.ExtensionMigrator.THIS_VERSION))
                            if rem.ExtensionMigrator.THIS_VERSION > old_version:
                                # Pending migrations are reported as a pseudo package update
                                migrations_detected = True
                                svc_component_info['packages'][PackageFactory.PKG_OVS_BACKEND] = {'installed': 'migrations',
                                                                                                  'candidate': installed_version}

                    # DAL migration check
                    if migrations_detected is False:
                        persistent_client = PersistentFactory.get_client()
                        old_version = persistent_client.get('ovs_model_version').get(PackageFactory.COMP_MIGRATION_ALBA) if persistent_client.exists('ovs_model_version') else None
                        if old_version is not None:
                            cls._logger.debug('StorageRouter {0}: Current running version for {1} DAL migrations: {2}'.format(client.ip, PackageFactory.COMP_ALBA, old_version))
                            with remote(client.ip, [DALMigrator]) as rem:
                                cls._logger.debug('StorageRouter {0}: Available version for {1} DAL migrations: {2}'.format(client.ip, PackageFactory.COMP_ALBA, rem.DALMigrator.THIS_VERSION))
                                if rem.DALMigrator.THIS_VERSION > old_version:
                                    svc_component_info['packages'][PackageFactory.PKG_OVS_BACKEND] = {'installed': 'migrations',
                                                                                                      'candidate': installed_version}

            cls._logger.info('StorageRouter {0}: Refreshed ALBA update information'.format(client.ip))
        except Exception as ex:
            # Failures are reported back to the caller through 'update_info' rather than raised
            cls._logger.exception('StorageRouter {0}: Refreshing ALBA update information failed'.format(client.ip))
            if 'errors' not in update_info[client.ip]:
                update_info[client.ip]['errors'] = []
            update_info[client.ip]['errors'].append(ex)