Exemple #1
0
    def get_mgmtcenter(pmachine=None, mgmt_center=None):
        """
        @param pmachine: pmachine hybrid from DAL
        @param mgmt_center: mgmtcenter hybrid from DAL
        Returns the appropriate sdk client for the management center of the node
        """
        if not ((pmachine is None) ^ (mgmt_center is None)):
            raise ValueError('Either a pMachine or a Management center should be passed')
        if pmachine is not None:
            mgmt_center = pmachine.mgmtcenter
            if mgmt_center is None:
                return None

        mgmtcenter_type = mgmt_center.type
        ip = mgmt_center.ip
        username = mgmt_center.username
        password = mgmt_center.password
        key = '{0}_{1}'.format(ip, username)
        if key not in Factory.mgmtcenters:
            mutex = file_mutex('mgmtcenter_{0}'.format(key))
            try:
                mutex.acquire(30)
                if key not in Factory.mgmtcenters:
                    if mgmtcenter_type == 'VCENTER':
                        from mgmtcenters.vcenter import VCenter
                        mgmtcenter = VCenter(ip, username, password)
                    elif mgmtcenter_type == 'OPENSTACK':
                        from mgmtcenters.openstack import OpenStack
                        mgmtcenter = OpenStack(ip, username, password)
                    else:
                        raise NotImplementedError('Management center for {0} is not yet supported'.format(mgmtcenter_type))
                    Factory.mgmtcenters[key] = mgmtcenter
            finally:
                mutex.release()
        return Factory.mgmtcenters[key]
Exemple #2
0
    def get_mgmtcenter(pmachine=None, mgmt_center=None):
        """
        @param pmachine: pmachine hybrid from DAL
        @param mgmt_center: mgmtcenter hybrid from DAL
        Returns the appropriate sdk client for the management center of the node
        """
        if not ((pmachine is None) ^ (mgmt_center is None)):
            raise ValueError('Either a pMachine or a Management center should be passed')
        if pmachine is not None:
            mgmt_center = pmachine.mgmtcenter
            if mgmt_center is None:
                return None

        mgmtcenter_type = mgmt_center.type
        ip = mgmt_center.ip
        username = mgmt_center.username
        password = mgmt_center.password
        key = '{0}_{1}'.format(ip, username)
        if key not in Factory.mgmtcenters:
            mutex = file_mutex('mgmtcenter_{0}'.format(key))
            try:
                mutex.acquire(30)
                if key not in Factory.mgmtcenters:
                    if mgmtcenter_type == 'VCENTER':
                        from mgmtcenters.vcenter import VCenter
                        mgmtcenter = VCenter(ip, username, password)
                    elif mgmtcenter_type == 'OPENSTACK':
                        from mgmtcenters.openstack import OpenStack
                        mgmtcenter = OpenStack(ip, username, password)
                    else:
                        raise NotImplementedError('Management center for {0} is not yet supported'.format(mgmtcenter_type))
                    Factory.mgmtcenters[key] = mgmtcenter
            finally:
                mutex.release()
        return Factory.mgmtcenters[key]
Exemple #3
0
 def get(pmachine):
     """
     Returns the appropriate hypervisor client class for a given PMachine
     """
     hvtype = pmachine.hvtype
     ip = pmachine.ip
     username = pmachine.username
     password = pmachine.password
     key = '{0}_{1}'.format(ip, username)
     if key not in Factory.hypervisors:
         mutex = file_mutex('hypervisor_{0}'.format(key))
         try:
             mutex.acquire(30)
             if key not in Factory.hypervisors:
                 if hvtype == 'VMWARE':
                     from hypervisors.vmware import VMware
                     hypervisor = VMware(ip, username, password)
                 elif hvtype == 'KVM':
                     from hypervisors.kvm import KVM
                     hypervisor = KVM(ip, username, password)
                 else:
                     raise NotImplementedError('Hypervisor {0} is not yet supported'.format(hvtype))
                 Factory.hypervisors[key] = hypervisor
         finally:
             mutex.release()
     return Factory.hypervisors[key]
Exemple #4
0
 def get(pmachine):
     """
     Returns the appropriate hypervisor client class for a given PMachine
     """
     hvtype = pmachine.hvtype
     ip = pmachine.ip
     username = pmachine.username
     password = pmachine.password
     key = '{0}_{1}'.format(ip, username)
     if key not in Factory.hypervisors:
         mutex = file_mutex('hypervisor_{0}'.format(key))
         try:
             mutex.acquire(30)
             if key not in Factory.hypervisors:
                 if hvtype == 'VMWARE':
                     from hypervisors.vmware import VMware
                     hypervisor = VMware(ip, username, password)
                 elif hvtype == 'KVM':
                     from hypervisors.kvm import KVM
                     hypervisor = KVM(ip, username, password)
                 else:
                     raise NotImplementedError('Hypervisor {0} is not yet supported'.format(hvtype))
                 Factory.hypervisors[key] = hypervisor
         finally:
             mutex.release()
     return Factory.hypervisors[key]
Exemple #5
0
    def add_fstab(partition_aliases, mountpoint, filesystem):
        """
        Add entry to /etc/fstab for mountpoint
        :param partition_aliases: Possible aliases of the partition to add
        :type partition_aliases: list
        :param mountpoint: Mountpoint on which device is mounted
        :type mountpoint: str
        :param filesystem: Filesystem used
        :type filesystem: str
        :return: None
        """
        if len(partition_aliases) == 0:
            raise ValueError('No partition aliases provided')

        with open('/etc/fstab', 'r') as fstab_file:
            lines = [line.strip() for line in fstab_file.readlines()]

        used_path = None
        used_index = None
        mount_line = None
        for device_alias in partition_aliases:
            for index, line in enumerate(lines):
                if line.startswith('#'):
                    continue
                if line.startswith(device_alias) and re.match(
                        '^{0}\s+'.format(re.escape(device_alias)), line):
                    used_path = device_alias
                    used_index = index
                if len(line.split()) == 6 and line.split(
                )[1] == mountpoint:  # Example line: 'UUID=40d99523-a1e7-4374-84f2-85b5d14b516e  /  swap  sw  0  0'
                    mount_line = line
            if used_path is not None:
                break

        if used_path is None:  # Partition not yet present with any of its possible aliases
            lines.append(
                OSManager.get_fstab_entry(partition_aliases[0], mountpoint,
                                          filesystem))
        else:  # Partition present, update information
            lines.pop(used_index)
            lines.insert(
                used_index,
                OSManager.get_fstab_entry(used_path, mountpoint, filesystem))

        if mount_line is not None:  # Mountpoint already in use by another device (potentially same device, but other device_path)
            lines.remove(mount_line)

        with file_mutex('ovs-fstab-lock'):
            with open('/etc/fstab', 'w') as fstab_file:
                fstab_file.write('{0}\n'.format('\n'.join(lines)))
Exemple #6
0
 def new_function(*args, **kw):
     """
     Executes the decorated function in a locked context
     """
     filemutex = file_mutex('messaging')
     try:
         filemutex.acquire(wait=60)
         mutex = volatile_mutex('messaging')
         try:
             mutex.acquire(wait=60)
             return f(*args, **kw)
         finally:
             mutex.release()
     finally:
         filemutex.release()
Exemple #7
0
 def new_function(*args, **kw):
     """
     Executes the decorated function in a locked context
     """
     filemutex = file_mutex('messaging')
     try:
         filemutex.acquire(wait=60)
         mutex = volatile_mutex('messaging')
         try:
             mutex.acquire(wait=60)
             return f(*args, **kw)
         finally:
             mutex.release()
     finally:
         filemutex.release()
Exemple #8
0
    def add_fstab(partition_aliases, mountpoint, filesystem):
        """
        Add entry to /etc/fstab for mountpoint
        :param partition_aliases: Possible aliases of the partition to add
        :type partition_aliases: list
        :param mountpoint: Mountpoint on which device is mounted
        :type mountpoint: str
        :param filesystem: Filesystem used
        :type filesystem: str
        :return: None
        """
        if len(partition_aliases) == 0:
            raise ValueError('No partition aliases provided')

        with open('/etc/fstab', 'r') as fstab_file:
            lines = [line.strip() for line in fstab_file.readlines()]

        used_path = None
        used_index = None
        mount_line = None
        for device_alias in partition_aliases:
            for index, line in enumerate(lines):
                if line.startswith('#'):
                    continue
                if line.startswith(device_alias) and re.match('^{0}\s+'.format(re.escape(device_alias)), line):
                    used_path = device_alias
                    used_index = index
                if len(line.split()) == 6 and line.split()[1] == mountpoint:  # Example line: 'UUID=40d99523-a1e7-4374-84f2-85b5d14b516e  /  swap  sw  0  0'
                    mount_line = line
            if used_path is not None:
                break

        if used_path is None:  # Partition not yet present with any of its possible aliases
            lines.append(OSManager.get_fstab_entry(partition_aliases[0], mountpoint, filesystem))
        else:  # Partition present, update information
            lines.pop(used_index)
            lines.insert(used_index, OSManager.get_fstab_entry(used_path, mountpoint, filesystem))

        if mount_line is not None:  # Mountpoint already in use by another device (potentially same device, but other device_path)
            lines.remove(mount_line)

        with file_mutex('ovs-fstab-lock'):
            with open('/etc/fstab', 'w') as fstab_file:
                fstab_file.write('{0}\n'.format('\n'.join(lines)))
Exemple #9
0
    def update_framework():
        """
        Update the framework
        :return: None
        """
        filemutex = file_mutex('system_update', wait=2)
        upgrade_file = '/etc/ready_for_upgrade'
        upgrade_ongoing_check_file = '/etc/upgrade_ongoing'
        ssh_clients = []
        try:
            filemutex.acquire()
            UpdateController._log_message('+++ Starting framework update +++')

            from ovs.dal.lists.storagerouterlist import StorageRouterList

            UpdateController._log_message('Generating SSH client connections for each storage router')
            upgrade_file = '/etc/ready_for_upgrade'
            upgrade_ongoing_check_file = '/etc/upgrade_ongoing'
            storage_routers = StorageRouterList.get_storagerouters()
            ssh_clients = []
            master_ips = []
            extra_ips = []
            for sr in storage_routers:
                ssh_clients.append(SSHClient(sr.ip, username='******'))
                if sr.node_type == 'MASTER':
                    master_ips.append(sr.ip)
                elif sr.node_type == 'EXTRA':
                    extra_ips.append(sr.ip)
            this_client = [client for client in ssh_clients if client.is_local is True][0]

            # Create locks
            UpdateController._log_message('Creating lock files', client_ip=this_client.ip)
            for client in ssh_clients:
                client.run('touch {0}'.format(upgrade_file))  # Prevents manual install or upgrade individual packages
                client.run('touch {0}'.format(upgrade_ongoing_check_file))  # Prevents clicking x times on 'Update' btn

            # Check requirements
            packages_to_update = set()
            all_services_to_restart = []
            for client in ssh_clients:
                for function in Toolbox.fetch_hooks('update', 'metadata'):
                    UpdateController._log_message('Executing function {0}'.format(function.__name__),
                                                  client_ip=client.ip)
                    output = function(client)
                    for key, value in output.iteritems():
                        if key != 'framework':
                            continue
                        for package_info in value:
                            packages_to_update.update(package_info['packages'])
                            all_services_to_restart += package_info['services']

            services_to_restart = []
            for service in all_services_to_restart:
                if service not in services_to_restart:
                    services_to_restart.append(service)  # Filter out duplicates maintaining the order of services (eg: watcher-framework before memcached)

            UpdateController._log_message('Services which will be restarted --> {0}'.format(', '.join(services_to_restart)))
            UpdateController._log_message('Packages which will be installed --> {0}'.format(', '.join(packages_to_update)))

            # Stop services
            if UpdateController._change_services_state(services=services_to_restart,
                                                       ssh_clients=ssh_clients,
                                                       action='stop') is False:
                UpdateController._log_message('Stopping all services on every node failed, cannot continue',
                                              client_ip=this_client.ip, severity='warning')
                UpdateController._remove_lock_files([upgrade_file, upgrade_ongoing_check_file], ssh_clients)

                # Start services again if a service could not be stopped
                UpdateController._log_message('Attempting to start the services again', client_ip=this_client.ip)
                UpdateController._change_services_state(services=services_to_restart,
                                                        ssh_clients=ssh_clients,
                                                        action='start')

                UpdateController._log_message('Failed to stop all required services, aborting update',
                                              client_ip=this_client.ip, severity='error')
                return

            # Update packages
            failed_clients = []
            for client in ssh_clients:
                PackageManager.update(client=client)
                try:
                    UpdateController._log_message('Installing latest packages', client.ip)
                    for package in packages_to_update:
                        UpdateController._log_message('Installing {0}'.format(package), client.ip)
                        PackageManager.install(package_name=package,
                                               client=client,
                                               force=True)
                        UpdateController._log_message('Installed {0}'.format(package), client.ip)
                    client.file_delete(upgrade_file)
                except subprocess.CalledProcessError as cpe:
                    UpdateController._log_message('Upgrade failed with error: {0}'.format(cpe.output), client.ip,
                                                  'error')
                    failed_clients.append(client)
                    break

            if failed_clients:
                UpdateController._remove_lock_files([upgrade_file, upgrade_ongoing_check_file], ssh_clients)
                UpdateController._log_message('Error occurred. Attempting to start all services again',
                                              client_ip=this_client.ip, severity='error')
                UpdateController._change_services_state(services=services_to_restart,
                                                        ssh_clients=ssh_clients,
                                                        action='start')
                UpdateController._log_message('Failed to upgrade following nodes:\n - {0}\nPlease check /var/log/ovs/lib.log on {1} for more information'.format('\n - '.join([client.ip for client in failed_clients]), this_client.ip),
                                              this_client.ip,
                                              'error')
                return

            # Migrate code
            for client in ssh_clients:
                try:
                    UpdateController._log_message('Started code migration', client.ip)
                    try:
                        with remote(client.ip, [Migrator]) as rem:
                            rem.Migrator.migrate(master_ips, extra_ips)
                    except EOFError as eof:
                        UpdateController._log_message('EOFError during code migration, retrying {0}'.format(eof), client.ip, 'warning')
                        with remote(client.ip, [Migrator]) as rem:
                            rem.Migrator.migrate(master_ips, extra_ips)
                    UpdateController._log_message('Finished code migration', client.ip)
                except Exception as ex:
                    UpdateController._remove_lock_files([upgrade_ongoing_check_file], ssh_clients)
                    UpdateController._log_message('Code migration failed with error: {0}'.format(ex), client.ip, 'error')
                    return

            # Start services
            UpdateController._log_message('Starting services', client_ip=this_client.ip)
            model_services = []
            if 'arakoon-ovsdb' in services_to_restart:
                model_services.append('arakoon-ovsdb')
                services_to_restart.remove('arakoon-ovsdb')
            if 'memcached' in services_to_restart:
                model_services.append('memcached')
                services_to_restart.remove('memcached')
            UpdateController._change_services_state(services=model_services,
                                                    ssh_clients=ssh_clients,
                                                    action='start')

            # Migrate model
            UpdateController._log_message('Started model migration', client_ip=this_client.ip)
            try:
                from ovs.dal.helpers import Migration
                with remote(ssh_clients[0].ip, [Migration]) as rem:
                    rem.Migration.migrate()
                UpdateController._log_message('Finished model migration', client_ip=this_client.ip)
            except Exception as ex:
                UpdateController._remove_lock_files([upgrade_ongoing_check_file], ssh_clients)
                UpdateController._log_message('An unexpected error occurred: {0}'.format(ex), client_ip=this_client.ip,
                                              severity='error')
                return

            # Post upgrade actions
            UpdateController._log_message('Executing post upgrade actions', client_ip=this_client.ip)
            for client in ssh_clients:
                with remote(client.ip, [Toolbox, SSHClient]) as rem:
                    for function in rem.Toolbox.fetch_hooks('update', 'postupgrade'):
                        UpdateController._log_message('Executing action {0}'.format(function.__name__),
                                                      client_ip=client.ip)
                        try:
                            function(rem.SSHClient(client.ip, username='******'))
                            UpdateController._log_message('Executing action {0} completed'.format(function.__name__),
                                                          client_ip=client.ip)
                        except Exception as ex:
                            UpdateController._log_message('Post upgrade action failed with error: {0}'.format(ex),
                                                          client.ip, 'error')

            # Start watcher and restart support-agent
            UpdateController._change_services_state(services=services_to_restart,
                                                    ssh_clients=ssh_clients,
                                                    action='start')
            UpdateController._change_services_state(services=['support-agent'],
                                                    ssh_clients=ssh_clients,
                                                    action='restart')

            UpdateController._remove_lock_files([upgrade_ongoing_check_file], ssh_clients)
            UpdateController._log_message('+++ Finished updating +++')
        except RuntimeError as rte:
            UpdateController._log_message('Error during framework update: {0}'.format(rte), severity='error')
            UpdateController._remove_lock_files([upgrade_file, upgrade_ongoing_check_file], ssh_clients)
        except NoLockAvailableException:
            UpdateController._log_message('Another framework update is currently in progress!')
        except Exception as ex:
            UpdateController._log_message('Error during framework update: {0}'.format(ex), severity='error')
            UpdateController._remove_lock_files([upgrade_file, upgrade_ongoing_check_file], ssh_clients)
        finally:
            filemutex.release()
Exemple #10
0
    def update_volumedriver():
        """
        Update the volumedriver
        :return: None
        """
        filemutex = file_mutex('system_update', wait=2)
        upgrade_file = '/etc/ready_for_upgrade'
        upgrade_ongoing_check_file = '/etc/upgrade_ongoing'
        ssh_clients = []
        try:
            filemutex.acquire()
            UpdateController._log_message('+++ Starting volumedriver update +++')

            from ovs.dal.lists.storagerouterlist import StorageRouterList

            UpdateController._log_message('Generating SSH client connections for each storage router')
            storage_routers = StorageRouterList.get_storagerouters()
            ssh_clients = [SSHClient(storage_router.ip, 'root') for storage_router in storage_routers]
            this_client = [client for client in ssh_clients if client.is_local is True][0]

            # Commence update !!!!!!!
            # 0. Create locks
            UpdateController._log_message('Creating lock files', client_ip=this_client.ip)
            for client in ssh_clients:
                client.run('touch {0}'.format(upgrade_file))  # Prevents manual install or upgrade individual packages
                client.run('touch {0}'.format(upgrade_ongoing_check_file))  # Prevents clicking x times on 'Update' btn

            # 1. Check requirements
            packages_to_update = set()
            all_services_to_restart = []
            for client in ssh_clients:
                for function in Toolbox.fetch_hooks('update', 'metadata'):
                    UpdateController._log_message('Executing function {0}'.format(function.__name__),
                                                  client_ip=client.ip)
                    output = function(client)
                    for key, value in output.iteritems():
                        if key != 'volumedriver':
                            continue
                        for package_info in value:
                            packages_to_update.update(package_info['packages'])
                            all_services_to_restart += package_info['services']

            services_to_restart = []
            for service in all_services_to_restart:
                if service not in services_to_restart:
                    services_to_restart.append(service)  # Filter out duplicates keeping the order of services (eg: watcher-framework before memcached)

            UpdateController._log_message('Services which will be restarted --> {0}'.format(', '.join(services_to_restart)))
            UpdateController._log_message('Packages which will be installed --> {0}'.format(', '.join(packages_to_update)))

            # 1. Stop services
            if UpdateController._change_services_state(services=services_to_restart,
                                                       ssh_clients=ssh_clients,
                                                       action='stop') is False:
                UpdateController._log_message('Stopping all services on every node failed, cannot continue',
                                              client_ip=this_client.ip, severity='warning')
                UpdateController._remove_lock_files([upgrade_file, upgrade_ongoing_check_file], ssh_clients)

                UpdateController._log_message('Attempting to start the services again', client_ip=this_client.ip)
                UpdateController._change_services_state(services=services_to_restart,
                                                        ssh_clients=ssh_clients,
                                                        action='start')
                UpdateController._log_message('Failed to stop all required services, update aborted',
                                              client_ip=this_client.ip, severity='error')
                return

            # 2. Update packages
            failed_clients = []
            for client in ssh_clients:
                PackageManager.update(client=client)
                try:
                    for package_name in packages_to_update:
                        UpdateController._log_message('Installing {0}'.format(package_name), client.ip)
                        PackageManager.install(package_name=package_name,
                                               client=client,
                                               force=True)
                        UpdateController._log_message('Installed {0}'.format(package_name), client.ip)
                    client.file_delete(upgrade_file)
                except subprocess.CalledProcessError as cpe:
                    UpdateController._log_message('Upgrade failed with error: {0}'.format(cpe.output), client.ip,
                                                  'error')
                    failed_clients.append(client)
                    break

            if failed_clients:
                UpdateController._remove_lock_files([upgrade_file, upgrade_ongoing_check_file], ssh_clients)
                UpdateController._log_message('Error occurred. Attempting to start all services again',
                                              client_ip=this_client.ip, severity='error')
                UpdateController._change_services_state(services=services_to_restart,
                                                        ssh_clients=ssh_clients,
                                                        action='start')
                UpdateController._log_message('Failed to upgrade following nodes:\n - {0}\nPlease check /var/log/ovs/lib.log on {1} for more information'.format('\n - '.join([client.ip for client in failed_clients]), this_client.ip),
                                              this_client.ip,
                                              'error')
                return

            # 3. Post upgrade actions
            UpdateController._log_message('Executing post upgrade actions', client_ip=this_client.ip)
            for client in ssh_clients:
                for function in Toolbox.fetch_hooks('update', 'postupgrade'):
                    UpdateController._log_message('Executing action: {0}'.format(function.__name__), client_ip=client.ip)
                    try:
                        function(client)
                    except Exception as ex:
                        UpdateController._log_message('Post upgrade action failed with error: {0}'.format(ex),
                                                      client.ip, 'error')

            # 4. Start services
            UpdateController._log_message('Starting services', client_ip=this_client.ip)
            UpdateController._change_services_state(services=services_to_restart,
                                                    ssh_clients=ssh_clients,
                                                    action='start')

            UpdateController._remove_lock_files([upgrade_ongoing_check_file], ssh_clients)
            UpdateController._log_message('+++ Finished updating +++')
        except RuntimeError as rte:
            UpdateController._log_message('Error during volumedriver update: {0}'.format(rte), severity='error')
            UpdateController._remove_lock_files([upgrade_file, upgrade_ongoing_check_file], ssh_clients)
        except NoLockAvailableException:
            UpdateController._log_message('Another volumedriver update is currently in progress!')
        except Exception as ex:
            UpdateController._log_message('Error during volumedriver update: {0}'.format(ex), severity='error')
            UpdateController._remove_lock_files([upgrade_file, upgrade_ongoing_check_file], ssh_clients)
        finally:
            filemutex.release()
Exemple #11
0
    def execute_update(components):
        """
        Update the specified components on all StorageRouters
        This is called upon by 'at'
        :return: None
        """
        filemutex = file_mutex('system_update', wait=2)
        ssh_clients = []
        services_stop_start = set()
        try:
            filemutex.acquire()
            UpdateController._logger.debug('+++ Starting update +++')

            from ovs.dal.lists.storagerouterlist import StorageRouterList

            # Create SSHClients to all nodes
            UpdateController._logger.debug('Generating SSH client connections for each storage router')
            storage_routers = StorageRouterList.get_storagerouters()
            master_ips = []
            extra_ips = []
            for sr in storage_routers:
                try:
                    ssh_clients.append(SSHClient(sr.ip, username='******'))
                    if sr.node_type == 'MASTER':
                        master_ips.append(sr.ip)
                    elif sr.node_type == 'EXTRA':
                        extra_ips.append(sr.ip)
                except UnableToConnectException:
                    raise Exception('Update is only allowed on systems where all nodes are online and fully functional')

            # Create locks
            for client in ssh_clients:
                UpdateController._logger.debug('{0}: Creating lock files'.format(client.ip))
                client.run(['touch', UpdateController._update_file])  # Prevents manual install or update individual packages
                client.run(['touch', UpdateController._update_ongoing_file])

            # Check requirements
            packages_to_update = {}
            services_post_update = set()
            update_information = UpdateController.get_update_information_all()
            for component, component_info in update_information.iteritems():
                if component in components:
                    UpdateController._logger.debug('Verifying update information for component: {0}'.format(component.upper()))
                    Toolbox.verify_required_params(actual_params=component_info,
                                                   required_params={'downtime': (list, None),
                                                                    'packages': (dict, None),
                                                                    'prerequisites': (list, None),
                                                                    'services_stop_start': (set, None),
                                                                    'services_post_update': (set, None)})
                    if len(component_info['prerequisites']) > 0:
                        raise Exception('Update is only allowed when all prerequisites have been met')

                    packages_to_update.update(component_info['packages'])
                    services_stop_start.update(component_info['services_stop_start'])
                    services_post_update.update(component_info['services_post_update'])
            if len(packages_to_update) > 0:
                UpdateController._logger.debug('Packages to be updated: {0}'.format(', '.join(sorted(packages_to_update.keys()))))
            if len(services_stop_start) > 0:
                UpdateController._logger.debug('Services to stop before package update: {0}'.format(', '.join(sorted(services_stop_start))))
            if len(services_post_update) > 0:
                UpdateController._logger.debug('Services which will be restarted after update: {0}'.format(', '.join(sorted(services_post_update))))

            # Stop services
            if UpdateController.change_services_state(services=services_stop_start,
                                                      ssh_clients=ssh_clients,
                                                      action='stop') is False:
                raise Exception('Stopping all services on every node failed, cannot continue')

            # Install packages
            # First install packages on all StorageRouters individually
            if packages_to_update:
                failures = False
                for client in ssh_clients:
                    UpdateController._logger.debug('{0}: Installing packages'.format(client.ip))
                    for function in Toolbox.fetch_hooks('update', 'package_install_multi'):
                        try:
                            function(client=client, package_info=packages_to_update, components=components)
                        except Exception as ex:
                            UpdateController._logger.error('{0}: Package installation hook {1} failed with error: {2}'.format(client.ip, function.__name__, ex))
                            failures = True

                if set(components).difference({'framework', 'storagedriver'}):
                    # Second install packages on all ALBA nodes
                    for function in Toolbox.fetch_hooks('update', 'package_install_single'):
                        try:
                            function(package_info=packages_to_update, components=components)
                        except Exception as ex:
                            UpdateController._logger.exception('Package installation hook {0} failed with error: {1}'.format(function.__name__, ex))
                            failures = True

                if failures is True:
                    raise Exception('Installing the packages failed on 1 or more nodes')

            # Remove update file
            for client in ssh_clients:
                client.file_delete(UpdateController._update_file)

            # Migrate code
            if 'framework' in components:
                failures = []
                for client in ssh_clients:
                    UpdateController._logger.debug('{0}: Verifying extensions code migration is required'.format(client.ip))
                    try:
                        key = '/ovs/framework/hosts/{0}/versions'.format(System.get_my_machine_id(client=client))
                        old_versions = Configuration.get(key) if Configuration.exists(key) else {}
                        try:
                            with remote(client.ip, [Migrator]) as rem:
                                rem.Migrator.migrate(master_ips, extra_ips)
                        except EOFError as eof:
                            UpdateController._logger.warning('{0}: EOFError during code migration, retrying {1}'.format(client.ip, eof))
                            with remote(client.ip, [Migrator]) as rem:
                                rem.Migrator.migrate(master_ips, extra_ips)
                        new_versions = Configuration.get(key) if Configuration.exists(key) else {}
                        if old_versions != new_versions:
                            UpdateController._logger.debug('{0}: Finished extensions code migration. Old versions: {1} --> New versions: {2}'.format(client.ip, old_versions, new_versions))
                    except Exception as ex:
                        failures.append('{0}: {1}'.format(client.ip, str(ex)))
                if len(failures) > 0:
                    raise Exception('Failed to run the extensions migrate code on all nodes. Errors found:\n\n{0}'.format('\n\n'.join(failures)))

            # Start memcached
            if 'memcached' in services_stop_start:
                services_stop_start.remove('memcached')
                UpdateController._logger.debug('Starting memcached')
                UpdateController.change_services_state(services=['memcached'],
                                                       ssh_clients=ssh_clients,
                                                       action='start')

            # Migrate model
            if 'framework' in components:
                UpdateController._logger.debug('Verifying DAL code migration is required')
                old_versions = PersistentFactory.get_client().get('ovs_model_version') if PersistentFactory.get_client().exists('ovs_model_version') else {}

                from ovs.dal.helpers import Migration
                with remote(ssh_clients[0].ip, [Migration]) as rem:
                    rem.Migration.migrate()

                new_versions = PersistentFactory.get_client().get('ovs_model_version') if PersistentFactory.get_client().exists('ovs_model_version') else {}
                if old_versions != new_versions:
                    UpdateController._logger.debug('Finished DAL code migration. Old versions: {0} --> New versions: {1}'.format(old_versions, new_versions))

            # Post update actions
            for client in ssh_clients:
                UpdateController._logger.debug('{0}: Executing post-update actions'.format(client.ip))
                for function in Toolbox.fetch_hooks('update', 'post_update_multi'):
                    try:
                        function(client=client, components=components)
                    except Exception as ex:
                        UpdateController._logger.exception('{0}: Post update hook {1} failed with error: {2}'.format(client.ip, function.__name__, ex))

            for function in Toolbox.fetch_hooks('update', 'post_update_single'):
                try:
                    function(components=components)
                except Exception as ex:
                    UpdateController._logger.exception('Post update hook {0} failed with error: {1}'.format(function.__name__, ex))

            # Start services
            UpdateController.change_services_state(services=services_stop_start,
                                                   ssh_clients=ssh_clients,
                                                   action='start')

            UpdateController._refresh_package_information()
            UpdateController._logger.debug('+++ Finished updating +++')
        except NoLockAvailableException:
            UpdateController._logger.debug('Another update is currently in progress!')
        except Exception as ex:
            UpdateController._logger.exception('Error during update: {0}'.format(ex))
            if len(ssh_clients) > 0:
                UpdateController.change_services_state(services=services_stop_start,
                                                       ssh_clients=ssh_clients,
                                                       action='start')
                UpdateController._refresh_package_information()
                UpdateController._logger.error('Failed to update. Please check all the logs for more information')
        finally:
            filemutex.release()
            for ssh_client in ssh_clients:
                for file_name in [UpdateController._update_file, UpdateController._update_ongoing_file]:
                    try:
                        if ssh_client.file_exists(file_name):
                            ssh_client.file_delete(file_name)
                    except:
                        UpdateController._logger.warning('[0}: Failed to remove lock file {1}'.format(ssh_client.ip, file_name))
Exemple #12
0
    def execute_scrub_work(queue, vpool, scrub_info, error_messages):
        """
        Executes scrub work for a given vDisk queue and vPool, based on scrub_info
        :param queue: a Queue with vDisk guids that need to be scrubbed (they should only be member of a single vPool)
        :type queue: Queue
        :param vpool: the vPool object of the vDisks
        :type vpool: VPool
        :param scrub_info: A dict containing scrub information: `scrub_path` with the path where to scrub and `storage_router` with the StorageRouter
                           that needs to do the work
        :type scrub_info: dict
        :param error_messages: A list of error messages to be filled
        :type error_messages: list
        :return: a list of error messages
        :rtype: list
        """

        def _verify_mds_config(current_vdisk):
            current_vdisk.invalidate_dynamics('info')
            vdisk_configs = current_vdisk.info['metadata_backend_config']
            if len(vdisk_configs) == 0:
                raise RuntimeError('Could not load MDS configuration')
            return vdisk_configs

        client = None
        lock_time = 5 * 60
        storagerouter = scrub_info['storage_router']
        scrub_directory = '{0}/scrub_work_{1}_{2}'.format(scrub_info['scrub_path'], vpool.name, storagerouter.name)
        scrub_config_key = 'ovs/vpools/{0}/proxies/scrub/scrub_config_{1}'.format(vpool.guid, storagerouter.guid)
        backend_config_key = 'ovs/vpools/{0}/proxies/scrub/backend_config_{1}'.format(vpool.guid, storagerouter.guid)
        alba_proxy_service = 'ovs-albaproxy_{0}_{1}_scrub'.format(vpool.name, storagerouter.name)

        # Deploy a proxy
        try:
            with file_mutex(name='ovs_albaproxy_scrub', wait=lock_time):
                ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Deploying ALBA proxy {2}'.format(vpool.name, storagerouter.name, alba_proxy_service))
                client = SSHClient(storagerouter, 'root')
                client.dir_create(scrub_directory)
                client.dir_chmod(scrub_directory, 0777)  # Celery task executed by 'ovs' user and should be able to write in it
                if ServiceManager.has_service(name=alba_proxy_service, client=client) is True and ServiceManager.get_service_status(name=alba_proxy_service, client=client) is True:
                    ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Re-using existing proxy service {2}'.format(vpool.name, storagerouter.name, alba_proxy_service))
                    scrub_config = Configuration.get(scrub_config_key)
                else:
                    machine_id = System.get_my_machine_id(client)
                    port_range = Configuration.get('/ovs/framework/hosts/{0}/ports|storagedriver'.format(machine_id))
                    port = System.get_free_ports(selected_range=port_range, nr=1, client=client)[0]
                    # Scrub config
                    # {u'albamgr_cfg_url': u'arakoon://config/ovs/vpools/71e2f717-f270-4a41-bbb0-d4c8c084d43e/proxies/64759516-3471-4321-b912-fb424568fc5b/config/abm?ini=%2Fopt%2FOpenvStorage%2Fconfig%2Farakoon_cacc.ini',
                    #  u'fragment_cache': [u'none'],
                    #  u'ips': [u'127.0.0.1'],
                    #  u'log_level': u'info',
                    #  u'manifest_cache_size': 17179869184,
                    #  u'port': 0,
                    #  u'transport': u'tcp'}

                    # Backend config
                    # {u'alba_connection_host': u'10.100.193.155',
                    #  u'alba_connection_port': 26204,
                    #  u'alba_connection_preset': u'preset',
                    #  u'alba_connection_timeout': 15,
                    #  u'alba_connection_transport': u'TCP',
                    #  u'backend_interface_retries_on_error': 5,
                    #  u'backend_interface_retry_backoff_multiplier': 2.0,
                    #  u'backend_interface_retry_interval_secs': 1,
                    #  u'backend_type': u'ALBA'}
                    scrub_config = Configuration.get('ovs/vpools/{0}/proxies/scrub/generic_scrub'.format(vpool.guid))
                    scrub_config['port'] = port
                    scrub_config['transport'] = 'tcp'
                    Configuration.set(scrub_config_key, json.dumps(scrub_config, indent=4), raw=True)

                    params = {'VPOOL_NAME': vpool.name,
                              'LOG_SINK': LogHandler.get_sink_path('alba_proxy'),
                              'CONFIG_PATH': Configuration.get_configuration_path(scrub_config_key)}
                    ServiceManager.add_service(name='ovs-albaproxy', params=params, client=client, target_name=alba_proxy_service)
                    ServiceManager.start_service(name=alba_proxy_service, client=client)
                    ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Deployed ALBA proxy {2}'.format(vpool.name, storagerouter.name, alba_proxy_service))

                backend_config = Configuration.get('ovs/vpools/{0}/hosts/{1}/config'.format(vpool.guid, vpool.storagedrivers[0].storagedriver_id))['backend_connection_manager']
                backend_config['alba_connection_host'] = '127.0.0.1'
                backend_config['alba_connection_port'] = scrub_config['port']
                Configuration.set(backend_config_key, json.dumps({"backend_connection_manager": backend_config}, indent=4), raw=True)
        except Exception:
            message = 'Scrubber - vPool {0} - StorageRouter {1} - An error occurred deploying ALBA proxy {2}'.format(vpool.name, storagerouter.name, alba_proxy_service)
            error_messages.append(message)
            ScheduledTaskController._logger.exception(message)
            if client is not None and ServiceManager.has_service(name=alba_proxy_service, client=client) is True:
                if ServiceManager.get_service_status(name=alba_proxy_service, client=client) is True:
                    ServiceManager.stop_service(name=alba_proxy_service, client=client)
                ServiceManager.remove_service(name=alba_proxy_service, client=client)
            if Configuration.exists(scrub_config_key):
                Configuration.delete(scrub_config_key)

        try:
            # Empty the queue with vDisks to scrub
            with remote(storagerouter.ip, [VDisk]) as rem:
                while True:
                    vdisk = None
                    vdisk_guid = queue.get(False)
                    try:
                        # Check MDS master is local. Trigger MDS handover if necessary
                        vdisk = rem.VDisk(vdisk_guid)
                        ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - Started scrubbing at location {3}'.format(vpool.name, storagerouter.name, vdisk.name, scrub_directory))
                        configs = _verify_mds_config(current_vdisk=vdisk)
                        storagedriver = StorageDriverList.get_by_storagedriver_id(vdisk.storagedriver_id)
                        if configs[0].get('ip') != storagedriver.storagerouter.ip:
                            ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - MDS master is not local, trigger handover'.format(vpool.name, storagerouter.name, vdisk.name))
                            MDSServiceController.ensure_safety(VDisk(vdisk_guid))  # Do not use a remote VDisk instance here
                            configs = _verify_mds_config(current_vdisk=vdisk)
                            if configs[0].get('ip') != storagedriver.storagerouter.ip:
                                ScheduledTaskController._logger.warning('Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - Skipping because master MDS still not local'.format(vpool.name, storagerouter.name, vdisk.name))
                                continue

                        # Do the actual scrubbing
                        with vdisk.storagedriver_client.make_locked_client(str(vdisk.volume_id)) as locked_client:
                            ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - Retrieve and apply scrub work'.format(vpool.name, storagerouter.name, vdisk.name))
                            work_units = locked_client.get_scrubbing_workunits()
                            for work_unit in work_units:
                                res = locked_client.scrub(work_unit=work_unit,
                                                          scratch_dir=scrub_directory,
                                                          log_sinks=[LogHandler.get_sink_path('scrubber', allow_override=True)],
                                                          backend_config=Configuration.get_configuration_path(backend_config_key))
                                locked_client.apply_scrubbing_result(scrubbing_work_result=res)
                            if work_units:
                                ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - {3} work units successfully applied'.format(vpool.name, storagerouter.name, vdisk.name, len(work_units)))
                            else:
                                ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - No scrubbing required'.format(vpool.name, storagerouter.name, vdisk.name))
                    except Exception:
                        if vdisk is None:
                            message = 'Scrubber - vPool {0} - StorageRouter {1} - vDisk with guid {2} could not be found'.format(vpool.name, storagerouter.name, vdisk_guid)
                        else:
                            message = 'Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - Scrubbing failed'.format(vpool.name, storagerouter.name, vdisk.name)
                        error_messages.append(message)
                        ScheduledTaskController._logger.exception(message)

        except Empty:  # Raised when all items have been fetched from the queue
            ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Queue completely processed'.format(vpool.name, storagerouter.name))
        except Exception:
            message = 'Scrubber - vPool {0} - StorageRouter {1} - Scrubbing failed'.format(vpool.name, storagerouter.name)
            error_messages.append(message)
            ScheduledTaskController._logger.exception(message)

        # Delete the proxy again
        try:
            with file_mutex(name='ovs_albaproxy_scrub', wait=lock_time):
                ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Removing service {2}'.format(vpool.name, storagerouter.name, alba_proxy_service))
                client = SSHClient(storagerouter, 'root')
                client.dir_delete(scrub_directory)
                if ServiceManager.has_service(alba_proxy_service, client=client):
                    ServiceManager.stop_service(alba_proxy_service, client=client)
                    ServiceManager.remove_service(alba_proxy_service, client=client)
                if Configuration.exists(scrub_config_key):
                    Configuration.delete(scrub_config_key)
                ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Removed service {2}'.format(vpool.name, storagerouter.name, alba_proxy_service))
        except Exception:
            message = 'Scrubber - vPool {0} - StorageRouter {1} - Removing service {2} failed'.format(vpool.name, storagerouter.name, alba_proxy_service)
            error_messages.append(message)
            ScheduledTaskController._logger.exception(message)
Exemple #13
0
    def update_framework():
        """
        Update the framework
        :return: None
        """
        filemutex = file_mutex('system_update', wait=2)
        upgrade_file = '/etc/ready_for_upgrade'
        upgrade_ongoing_check_file = '/etc/upgrade_ongoing'
        ssh_clients = []
        try:
            filemutex.acquire()
            UpdateController._log_message('+++ Starting framework update +++')

            from ovs.dal.lists.storagerouterlist import StorageRouterList

            UpdateController._log_message(
                'Generating SSH client connections for each storage router')
            upgrade_file = '/etc/ready_for_upgrade'
            upgrade_ongoing_check_file = '/etc/upgrade_ongoing'
            storage_routers = StorageRouterList.get_storagerouters()
            ssh_clients = []
            master_ips = []
            extra_ips = []
            for sr in storage_routers:
                ssh_clients.append(SSHClient(sr.ip, username='******'))
                if sr.node_type == 'MASTER':
                    master_ips.append(sr.ip)
                elif sr.node_type == 'EXTRA':
                    extra_ips.append(sr.ip)
            this_client = [
                client for client in ssh_clients if client.is_local is True
            ][0]

            # Create locks
            UpdateController._log_message('Creating lock files',
                                          client_ip=this_client.ip)
            for client in ssh_clients:
                client.run(
                    'touch {0}'.format(upgrade_file)
                )  # Prevents manual install or upgrade individual packages
                client.run('touch {0}'.format(upgrade_ongoing_check_file)
                           )  # Prevents clicking x times on 'Update' btn

            # Check requirements
            packages_to_update = set()
            all_services_to_restart = []
            for client in ssh_clients:
                for function in Toolbox.fetch_hooks('update', 'metadata'):
                    UpdateController._log_message(
                        'Executing function {0}'.format(function.__name__),
                        client_ip=client.ip)
                    output = function(client)
                    for key, value in output.iteritems():
                        if key != 'framework':
                            continue
                        for package_info in value:
                            packages_to_update.update(package_info['packages'])
                            all_services_to_restart += package_info['services']

            services_to_restart = []
            for service in all_services_to_restart:
                if service not in services_to_restart:
                    services_to_restart.append(
                        service
                    )  # Filter out duplicates maintaining the order of services (eg: watcher-framework before memcached)

            UpdateController._log_message(
                'Services which will be restarted --> {0}'.format(
                    ', '.join(services_to_restart)))
            UpdateController._log_message(
                'Packages which will be installed --> {0}'.format(
                    ', '.join(packages_to_update)))

            # Stop services
            if UpdateController._change_services_state(
                    services=services_to_restart,
                    ssh_clients=ssh_clients,
                    action='stop') is False:
                UpdateController._log_message(
                    'Stopping all services on every node failed, cannot continue',
                    client_ip=this_client.ip,
                    severity='warning')
                UpdateController._remove_lock_files(
                    [upgrade_file, upgrade_ongoing_check_file], ssh_clients)

                # Start services again if a service could not be stopped
                UpdateController._log_message(
                    'Attempting to start the services again',
                    client_ip=this_client.ip)
                UpdateController._change_services_state(
                    services=services_to_restart,
                    ssh_clients=ssh_clients,
                    action='start')

                UpdateController._log_message(
                    'Failed to stop all required services, aborting update',
                    client_ip=this_client.ip,
                    severity='error')
                return

            # Update packages
            failed_clients = []
            for client in ssh_clients:
                PackageManager.update(client=client)
                try:
                    UpdateController._log_message('Installing latest packages',
                                                  client.ip)
                    for package in packages_to_update:
                        UpdateController._log_message(
                            'Installing {0}'.format(package), client.ip)
                        PackageManager.install(package_name=package,
                                               client=client,
                                               force=True)
                        UpdateController._log_message(
                            'Installed {0}'.format(package), client.ip)
                    client.file_delete(upgrade_file)
                except subprocess.CalledProcessError as cpe:
                    UpdateController._log_message(
                        'Upgrade failed with error: {0}'.format(cpe.output),
                        client.ip, 'error')
                    failed_clients.append(client)
                    break

            if failed_clients:
                UpdateController._remove_lock_files(
                    [upgrade_file, upgrade_ongoing_check_file], ssh_clients)
                UpdateController._log_message(
                    'Error occurred. Attempting to start all services again',
                    client_ip=this_client.ip,
                    severity='error')
                UpdateController._change_services_state(
                    services=services_to_restart,
                    ssh_clients=ssh_clients,
                    action='start')
                UpdateController._log_message(
                    'Failed to upgrade following nodes:\n - {0}\nPlease check /var/log/ovs/lib.log on {1} for more information'
                    .format(
                        '\n - '.join([client.ip for client in failed_clients]),
                        this_client.ip), this_client.ip, 'error')
                return

            # Migrate code
            for client in ssh_clients:
                try:
                    UpdateController._log_message('Started code migration',
                                                  client.ip)
                    try:
                        with remote(client.ip, [Migrator]) as rem:
                            rem.Migrator.migrate(master_ips, extra_ips)
                    except EOFError as eof:
                        UpdateController._log_message(
                            'EOFError during code migration, retrying {0}'.
                            format(eof), client.ip, 'warning')
                        with remote(client.ip, [Migrator]) as rem:
                            rem.Migrator.migrate(master_ips, extra_ips)
                    UpdateController._log_message('Finished code migration',
                                                  client.ip)
                except Exception as ex:
                    UpdateController._remove_lock_files(
                        [upgrade_ongoing_check_file], ssh_clients)
                    UpdateController._log_message(
                        'Code migration failed with error: {0}'.format(ex),
                        client.ip, 'error')
                    return

            # Start services
            UpdateController._log_message('Starting services',
                                          client_ip=this_client.ip)
            model_services = []
            if 'arakoon-ovsdb' in services_to_restart:
                model_services.append('arakoon-ovsdb')
                services_to_restart.remove('arakoon-ovsdb')
            if 'memcached' in services_to_restart:
                model_services.append('memcached')
                services_to_restart.remove('memcached')
            UpdateController._change_services_state(services=model_services,
                                                    ssh_clients=ssh_clients,
                                                    action='start')

            # Migrate model
            UpdateController._log_message('Started model migration',
                                          client_ip=this_client.ip)
            try:
                from ovs.dal.helpers import Migration
                with remote(ssh_clients[0].ip, [Migration]) as rem:
                    rem.Migration.migrate()
                UpdateController._log_message('Finished model migration',
                                              client_ip=this_client.ip)
            except Exception as ex:
                UpdateController._remove_lock_files(
                    [upgrade_ongoing_check_file], ssh_clients)
                UpdateController._log_message(
                    'An unexpected error occurred: {0}'.format(ex),
                    client_ip=this_client.ip,
                    severity='error')
                return

            # Post upgrade actions
            UpdateController._log_message('Executing post upgrade actions',
                                          client_ip=this_client.ip)
            for client in ssh_clients:
                with remote(client.ip, [Toolbox, SSHClient]) as rem:
                    for function in rem.Toolbox.fetch_hooks(
                            'update', 'postupgrade'):
                        UpdateController._log_message(
                            'Executing action {0}'.format(function.__name__),
                            client_ip=client.ip)
                        try:
                            function(rem.SSHClient(client.ip, username='******'))
                            UpdateController._log_message(
                                'Executing action {0} completed'.format(
                                    function.__name__),
                                client_ip=client.ip)
                        except Exception as ex:
                            UpdateController._log_message(
                                'Post upgrade action failed with error: {0}'.
                                format(ex), client.ip, 'error')

            # Start watcher and restart support-agent
            UpdateController._change_services_state(
                services=services_to_restart,
                ssh_clients=ssh_clients,
                action='start')
            UpdateController._change_services_state(services=['support-agent'],
                                                    ssh_clients=ssh_clients,
                                                    action='restart')

            UpdateController._remove_lock_files([upgrade_ongoing_check_file],
                                                ssh_clients)
            UpdateController._log_message('+++ Finished updating +++')
        except RuntimeError as rte:
            UpdateController._log_message(
                'Error during framework update: {0}'.format(rte),
                severity='error')
            UpdateController._remove_lock_files(
                [upgrade_file, upgrade_ongoing_check_file], ssh_clients)
        except NoLockAvailableException:
            UpdateController._log_message(
                'Another framework update is currently in progress!')
        except Exception as ex:
            UpdateController._log_message(
                'Error during framework update: {0}'.format(ex),
                severity='error')
            UpdateController._remove_lock_files(
                [upgrade_file, upgrade_ongoing_check_file], ssh_clients)
        finally:
            filemutex.release()
Exemple #14
0
    def update_volumedriver():
        """
        Update the volumedriver
        :return: None
        """
        filemutex = file_mutex('system_update', wait=2)
        upgrade_file = '/etc/ready_for_upgrade'
        upgrade_ongoing_check_file = '/etc/upgrade_ongoing'
        ssh_clients = []
        try:
            filemutex.acquire()
            UpdateController._log_message(
                '+++ Starting volumedriver update +++')

            from ovs.dal.lists.storagerouterlist import StorageRouterList

            UpdateController._log_message(
                'Generating SSH client connections for each storage router')
            storage_routers = StorageRouterList.get_storagerouters()
            ssh_clients = [
                SSHClient(storage_router.ip, 'root')
                for storage_router in storage_routers
            ]
            this_client = [
                client for client in ssh_clients if client.is_local is True
            ][0]

            # Commence update !!!!!!!
            # 0. Create locks
            UpdateController._log_message('Creating lock files',
                                          client_ip=this_client.ip)
            for client in ssh_clients:
                client.run(
                    'touch {0}'.format(upgrade_file)
                )  # Prevents manual install or upgrade individual packages
                client.run('touch {0}'.format(upgrade_ongoing_check_file)
                           )  # Prevents clicking x times on 'Update' btn

            # 1. Check requirements
            packages_to_update = set()
            all_services_to_restart = []
            for client in ssh_clients:
                for function in Toolbox.fetch_hooks('update', 'metadata'):
                    UpdateController._log_message(
                        'Executing function {0}'.format(function.__name__),
                        client_ip=client.ip)
                    output = function(client)
                    for key, value in output.iteritems():
                        if key != 'volumedriver':
                            continue
                        for package_info in value:
                            packages_to_update.update(package_info['packages'])
                            all_services_to_restart += package_info['services']

            services_to_restart = []
            for service in all_services_to_restart:
                if service not in services_to_restart:
                    services_to_restart.append(
                        service
                    )  # Filter out duplicates keeping the order of services (eg: watcher-framework before memcached)

            UpdateController._log_message(
                'Services which will be restarted --> {0}'.format(
                    ', '.join(services_to_restart)))
            UpdateController._log_message(
                'Packages which will be installed --> {0}'.format(
                    ', '.join(packages_to_update)))

            # 1. Stop services
            if UpdateController._change_services_state(
                    services=services_to_restart,
                    ssh_clients=ssh_clients,
                    action='stop') is False:
                UpdateController._log_message(
                    'Stopping all services on every node failed, cannot continue',
                    client_ip=this_client.ip,
                    severity='warning')
                UpdateController._remove_lock_files(
                    [upgrade_file, upgrade_ongoing_check_file], ssh_clients)

                UpdateController._log_message(
                    'Attempting to start the services again',
                    client_ip=this_client.ip)
                UpdateController._change_services_state(
                    services=services_to_restart,
                    ssh_clients=ssh_clients,
                    action='start')
                UpdateController._log_message(
                    'Failed to stop all required services, update aborted',
                    client_ip=this_client.ip,
                    severity='error')
                return

            # 2. Update packages
            failed_clients = []
            for client in ssh_clients:
                PackageManager.update(client=client)
                try:
                    for package_name in packages_to_update:
                        UpdateController._log_message(
                            'Installing {0}'.format(package_name), client.ip)
                        PackageManager.install(package_name=package_name,
                                               client=client,
                                               force=True)
                        UpdateController._log_message(
                            'Installed {0}'.format(package_name), client.ip)
                    client.file_delete(upgrade_file)
                except subprocess.CalledProcessError as cpe:
                    UpdateController._log_message(
                        'Upgrade failed with error: {0}'.format(cpe.output),
                        client.ip, 'error')
                    failed_clients.append(client)
                    break

            if failed_clients:
                UpdateController._remove_lock_files(
                    [upgrade_file, upgrade_ongoing_check_file], ssh_clients)
                UpdateController._log_message(
                    'Error occurred. Attempting to start all services again',
                    client_ip=this_client.ip,
                    severity='error')
                UpdateController._change_services_state(
                    services=services_to_restart,
                    ssh_clients=ssh_clients,
                    action='start')
                UpdateController._log_message(
                    'Failed to upgrade following nodes:\n - {0}\nPlease check /var/log/ovs/lib.log on {1} for more information'
                    .format(
                        '\n - '.join([client.ip for client in failed_clients]),
                        this_client.ip), this_client.ip, 'error')
                return

            # 3. Post upgrade actions
            UpdateController._log_message('Executing post upgrade actions',
                                          client_ip=this_client.ip)
            for client in ssh_clients:
                for function in Toolbox.fetch_hooks('update', 'postupgrade'):
                    UpdateController._log_message(
                        'Executing action: {0}'.format(function.__name__),
                        client_ip=client.ip)
                    try:
                        function(client)
                    except Exception as ex:
                        UpdateController._log_message(
                            'Post upgrade action failed with error: {0}'.
                            format(ex), client.ip, 'error')

            # 4. Start services
            UpdateController._log_message('Starting services',
                                          client_ip=this_client.ip)
            UpdateController._change_services_state(
                services=services_to_restart,
                ssh_clients=ssh_clients,
                action='start')

            UpdateController._remove_lock_files([upgrade_ongoing_check_file],
                                                ssh_clients)
            UpdateController._log_message('+++ Finished updating +++')
        except RuntimeError as rte:
            UpdateController._log_message(
                'Error during volumedriver update: {0}'.format(rte),
                severity='error')
            UpdateController._remove_lock_files(
                [upgrade_file, upgrade_ongoing_check_file], ssh_clients)
        except NoLockAvailableException:
            UpdateController._log_message(
                'Another volumedriver update is currently in progress!')
        except Exception as ex:
            UpdateController._log_message(
                'Error during volumedriver update: {0}'.format(ex),
                severity='error')
            UpdateController._remove_lock_files(
                [upgrade_file, upgrade_ongoing_check_file], ssh_clients)
        finally:
            filemutex.release()
Exemple #15
0
    def execute_scrub_work(queue, vpool, scrub_info, error_messages):
        """
        Executes scrub work for a given vDisk queue and vPool, based on scrub_info
        :param queue: a Queue with vDisk guids that need to be scrubbed (they should only be member of a single vPool)
        :type queue: Queue
        :param vpool: the vPool object of the vDisks
        :type vpool: VPool
        :param scrub_info: A dict containing scrub information: `scrub_path` with the path where to scrub and `storage_router` with the StorageRouter
                           that needs to do the work
        :type scrub_info: dict
        :param error_messages: A list of error messages to be filled
        :type error_messages: list
        :return: a list of error messages
        :rtype: list
        """
        def _verify_mds_config(current_vdisk):
            current_vdisk.invalidate_dynamics('info')
            vdisk_configs = current_vdisk.info['metadata_backend_config']
            if len(vdisk_configs) == 0:
                raise RuntimeError('Could not load MDS configuration')
            return vdisk_configs

        client = None
        lock_time = 5 * 60
        storagerouter = scrub_info['storage_router']
        scrub_directory = '{0}/scrub_work_{1}_{2}'.format(
            scrub_info['scrub_path'], vpool.name, storagerouter.name)
        scrub_config_key = 'ovs/vpools/{0}/proxies/scrub/scrub_config_{1}'.format(
            vpool.guid, storagerouter.guid)
        backend_config_key = 'ovs/vpools/{0}/proxies/scrub/backend_config_{1}'.format(
            vpool.guid, storagerouter.guid)
        alba_proxy_service = 'ovs-albaproxy_{0}_{1}_scrub'.format(
            vpool.name, storagerouter.name)

        # Deploy a proxy
        try:
            with file_mutex(name='ovs_albaproxy_scrub', wait=lock_time):
                ScheduledTaskController._logger.info(
                    'Scrubber - vPool {0} - StorageRouter {1} - Deploying ALBA proxy {2}'
                    .format(vpool.name, storagerouter.name,
                            alba_proxy_service))
                client = SSHClient(storagerouter, 'root')
                client.dir_create(scrub_directory)
                client.dir_chmod(
                    scrub_directory, 0777
                )  # Celery task executed by 'ovs' user and should be able to write in it
                if ServiceManager.has_service(
                        name=alba_proxy_service, client=client
                ) is True and ServiceManager.get_service_status(
                        name=alba_proxy_service, client=client) is True:
                    ScheduledTaskController._logger.info(
                        'Scrubber - vPool {0} - StorageRouter {1} - Re-using existing proxy service {2}'
                        .format(vpool.name, storagerouter.name,
                                alba_proxy_service))
                    scrub_config = Configuration.get(scrub_config_key)
                else:
                    machine_id = System.get_my_machine_id(client)
                    port_range = Configuration.get(
                        '/ovs/framework/hosts/{0}/ports|storagedriver'.format(
                            machine_id))
                    port = System.get_free_ports(selected_range=port_range,
                                                 nr=1,
                                                 client=client)[0]
                    # Scrub config
                    # {u'albamgr_cfg_url': u'arakoon://config/ovs/vpools/71e2f717-f270-4a41-bbb0-d4c8c084d43e/proxies/64759516-3471-4321-b912-fb424568fc5b/config/abm?ini=%2Fopt%2FOpenvStorage%2Fconfig%2Farakoon_cacc.ini',
                    #  u'fragment_cache': [u'none'],
                    #  u'ips': [u'127.0.0.1'],
                    #  u'log_level': u'info',
                    #  u'manifest_cache_size': 17179869184,
                    #  u'port': 0,
                    #  u'transport': u'tcp'}

                    # Backend config
                    # {u'alba_connection_host': u'10.100.193.155',
                    #  u'alba_connection_port': 26204,
                    #  u'alba_connection_preset': u'preset',
                    #  u'alba_connection_timeout': 15,
                    #  u'alba_connection_transport': u'TCP',
                    #  u'backend_interface_retries_on_error': 5,
                    #  u'backend_interface_retry_backoff_multiplier': 2.0,
                    #  u'backend_interface_retry_interval_secs': 1,
                    #  u'backend_type': u'ALBA'}
                    scrub_config = Configuration.get(
                        'ovs/vpools/{0}/proxies/scrub/generic_scrub'.format(
                            vpool.guid))
                    scrub_config['port'] = port
                    scrub_config['transport'] = 'tcp'
                    Configuration.set(scrub_config_key,
                                      json.dumps(scrub_config, indent=4),
                                      raw=True)

                    params = {
                        'VPOOL_NAME':
                        vpool.name,
                        'LOG_SINK':
                        LogHandler.get_sink_path('alba_proxy'),
                        'CONFIG_PATH':
                        Configuration.get_configuration_path(scrub_config_key)
                    }
                    ServiceManager.add_service(name='ovs-albaproxy',
                                               params=params,
                                               client=client,
                                               target_name=alba_proxy_service)
                    ServiceManager.start_service(name=alba_proxy_service,
                                                 client=client)
                    ScheduledTaskController._logger.info(
                        'Scrubber - vPool {0} - StorageRouter {1} - Deployed ALBA proxy {2}'
                        .format(vpool.name, storagerouter.name,
                                alba_proxy_service))

                backend_config = Configuration.get(
                    'ovs/vpools/{0}/hosts/{1}/config'.format(
                        vpool.guid, vpool.storagedrivers[0].storagedriver_id
                    ))['backend_connection_manager']
                backend_config['alba_connection_host'] = '127.0.0.1'
                backend_config['alba_connection_port'] = scrub_config['port']
                Configuration.set(
                    backend_config_key,
                    json.dumps({"backend_connection_manager": backend_config},
                               indent=4),
                    raw=True)
        except Exception:
            message = 'Scrubber - vPool {0} - StorageRouter {1} - An error occurred deploying ALBA proxy {2}'.format(
                vpool.name, storagerouter.name, alba_proxy_service)
            error_messages.append(message)
            ScheduledTaskController._logger.exception(message)
            if client is not None and ServiceManager.has_service(
                    name=alba_proxy_service, client=client) is True:
                if ServiceManager.get_service_status(name=alba_proxy_service,
                                                     client=client) is True:
                    ServiceManager.stop_service(name=alba_proxy_service,
                                                client=client)
                ServiceManager.remove_service(name=alba_proxy_service,
                                              client=client)
            if Configuration.exists(scrub_config_key):
                Configuration.delete(scrub_config_key)

        try:
            # Empty the queue with vDisks to scrub
            with remote(storagerouter.ip, [VDisk]) as rem:
                while True:
                    vdisk = None
                    vdisk_guid = queue.get(False)
                    try:
                        # Check MDS master is local. Trigger MDS handover if necessary
                        vdisk = rem.VDisk(vdisk_guid)
                        ScheduledTaskController._logger.info(
                            'Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - Started scrubbing at location {3}'
                            .format(vpool.name, storagerouter.name, vdisk.name,
                                    scrub_directory))
                        configs = _verify_mds_config(current_vdisk=vdisk)
                        storagedriver = StorageDriverList.get_by_storagedriver_id(
                            vdisk.storagedriver_id)
                        if configs[0].get(
                                'ip') != storagedriver.storagerouter.ip:
                            ScheduledTaskController._logger.info(
                                'Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - MDS master is not local, trigger handover'
                                .format(vpool.name, storagerouter.name,
                                        vdisk.name))
                            MDSServiceController.ensure_safety(
                                VDisk(vdisk_guid)
                            )  # Do not use a remote VDisk instance here
                            configs = _verify_mds_config(current_vdisk=vdisk)
                            if configs[0].get(
                                    'ip') != storagedriver.storagerouter.ip:
                                ScheduledTaskController._logger.warning(
                                    'Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - Skipping because master MDS still not local'
                                    .format(vpool.name, storagerouter.name,
                                            vdisk.name))
                                continue

                        # Do the actual scrubbing
                        with vdisk.storagedriver_client.make_locked_client(
                                str(vdisk.volume_id)) as locked_client:
                            ScheduledTaskController._logger.info(
                                'Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - Retrieve and apply scrub work'
                                .format(vpool.name, storagerouter.name,
                                        vdisk.name))
                            work_units = locked_client.get_scrubbing_workunits(
                            )
                            for work_unit in work_units:
                                res = locked_client.scrub(
                                    work_unit=work_unit,
                                    scratch_dir=scrub_directory,
                                    log_sinks=[
                                        LogHandler.get_sink_path(
                                            'scrubber', allow_override=True)
                                    ],
                                    backend_config=Configuration.
                                    get_configuration_path(backend_config_key))
                                locked_client.apply_scrubbing_result(
                                    scrubbing_work_result=res)
                            if work_units:
                                ScheduledTaskController._logger.info(
                                    'Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - {3} work units successfully applied'
                                    .format(vpool.name, storagerouter.name,
                                            vdisk.name, len(work_units)))
                            else:
                                ScheduledTaskController._logger.info(
                                    'Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - No scrubbing required'
                                    .format(vpool.name, storagerouter.name,
                                            vdisk.name))
                    except Exception:
                        if vdisk is None:
                            message = 'Scrubber - vPool {0} - StorageRouter {1} - vDisk with guid {2} could not be found'.format(
                                vpool.name, storagerouter.name, vdisk_guid)
                        else:
                            message = 'Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - Scrubbing failed'.format(
                                vpool.name, storagerouter.name, vdisk.name)
                        error_messages.append(message)
                        ScheduledTaskController._logger.exception(message)

        except Empty:  # Raised when all items have been fetched from the queue
            ScheduledTaskController._logger.info(
                'Scrubber - vPool {0} - StorageRouter {1} - Queue completely processed'
                .format(vpool.name, storagerouter.name))
        except Exception:
            message = 'Scrubber - vPool {0} - StorageRouter {1} - Scrubbing failed'.format(
                vpool.name, storagerouter.name)
            error_messages.append(message)
            ScheduledTaskController._logger.exception(message)

        # Delete the proxy again
        try:
            with file_mutex(name='ovs_albaproxy_scrub', wait=lock_time):
                ScheduledTaskController._logger.info(
                    'Scrubber - vPool {0} - StorageRouter {1} - Removing service {2}'
                    .format(vpool.name, storagerouter.name,
                            alba_proxy_service))
                client = SSHClient(storagerouter, 'root')
                client.dir_delete(scrub_directory)
                if ServiceManager.has_service(alba_proxy_service,
                                              client=client):
                    ServiceManager.stop_service(alba_proxy_service,
                                                client=client)
                    ServiceManager.remove_service(alba_proxy_service,
                                                  client=client)
                if Configuration.exists(scrub_config_key):
                    Configuration.delete(scrub_config_key)
                ScheduledTaskController._logger.info(
                    'Scrubber - vPool {0} - StorageRouter {1} - Removed service {2}'
                    .format(vpool.name, storagerouter.name,
                            alba_proxy_service))
        except Exception:
            message = 'Scrubber - vPool {0} - StorageRouter {1} - Removing service {2} failed'.format(
                vpool.name, storagerouter.name, alba_proxy_service)
            error_messages.append(message)
            ScheduledTaskController._logger.exception(message)