Code Example #1
 def cache_flush(self, pool):
     backing_pool = pool['pool_name']
     cache_pool = backing_pool + '-cache'
     try:
         # set target_max_objects to a small value to force evacuation of
         # objects from cache before we use rados cache-flush-evict-all
         # WARNING: assuming cache_pool will be deleted after flush so
         # we don't have to save/restore the value of target_max_objects
         #
         self.cache_pool_set_param(pool, 'target_max_objects', 1)
         prev_object_count = None
         wait_interval = MIN_WAIT
         while True:
             response, body = self.service.ceph_api.df(body='json')
             if not response.ok:
                 LOG.warn(
                     _LW("Failed to retrieve cluster free space stats: "
                         "status_code=%d, reason=%s") %
                     (response.status_code, response.reason))
                 break
             stats = None
             for s in body['output']['pools']:
                 if s['name'] == cache_pool:
                     stats = s['stats']
                     break
             if not stats:
                 LOG.warn(
                     _LW("Missing pool free space stats: "
                         "cache_pool=%s") % cache_pool)
                 break
             object_count = stats['objects']
             if object_count < constants.CACHE_FLUSH_OBJECTS_THRESHOLD:
                 break
             if prev_object_count is not None:
                 delta_objects = object_count - prev_object_count
                 if delta_objects > 0:
                     LOG.warn(
                         _LW("Unexpected increase in number "
                             "of objects in cache pool: "
                             "cache_pool=%s, prev_object_count=%d, "
                             "object_count=%d") %
                         (cache_pool, prev_object_count, object_count))
                     break
                 if delta_objects == 0:
                     wait_interval *= 2
                     if wait_interval > MAX_WAIT:
                         LOG.warn(
                             _LW("Cache pool number of objects did not "
                                 "decrease: cache_pool=%s, object_count=%d, "
                                 "wait_interval=%d") %
                             (cache_pool, object_count, wait_interval))
                         break
                 else:
                     wait_interval = MIN_WAIT
             time.sleep(wait_interval)
             prev_object_count = object_count
     except exception.CephPoolSetParamFailure as e:
         LOG.warn(e)
     finally:
         self.rados_cache_flush_evict_all(pool)
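
The loop above is an exponential-backoff poll: the wait doubles only while the object count stalls, and snaps back to the minimum as soon as eviction makes progress. A minimal standalone sketch of that pattern, where poll_fn and the two constants are hypothetical stand-ins for the Ceph df query and the module's MIN_WAIT/MAX_WAIT:

    import time

    MIN_WAIT, MAX_WAIT = 1, 32  # seconds; stand-in values

    def wait_until_drained(poll_fn, threshold):
        """Poll poll_fn() until its value drops below threshold."""
        prev = None
        wait = MIN_WAIT
        while True:
            count = poll_fn()
            if count < threshold:
                return True
            if prev is not None:
                if count > prev:
                    return False       # unexpected growth: give up
                elif count == prev:
                    wait *= 2          # stalled: back off
                    if wait > MAX_WAIT:
                        return False   # stalled for too long
                else:
                    wait = MIN_WAIT    # progress: reset the backoff
            time.sleep(wait)
            prev = count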
Code Example #2
    def _get_health(self):
        try:
            # we use text since it has all info
            response, body = self.service.ceph_api.health(body='text',
                                                          timeout=30)
        except IOError as e:
            LOG.warning(_LW("ceph_api.health failed: %s") % str(e.message))
            self.cluster_is_up = False
            return {
                'health': constants.CEPH_HEALTH_DOWN,
                'detail': 'Ceph cluster is down.'
            }

        if not response.ok:
            LOG.warning(_LW("CEPH health check failed: %s") % response.reason)
            health_info = [constants.CEPH_HEALTH_DOWN, response.reason]
            self.cluster_is_up = False
        else:
            health_info = body.split(' ', 1)
            self.cluster_is_up = True

        health = health_info[0]

        if len(health_info) > 1:
            detail = health_info[1]
        else:
            detail = health_info[0]

        return {'health': health.strip(), 'detail': detail.strip()}
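
The split(' ', 1) call relies on the plain-text health output beginning with a status keyword, optionally followed by detail text. Illustrative inputs (not verbatim Ceph output):

    >>> 'HEALTH_WARN 1 pgs degraded'.split(' ', 1)
    ['HEALTH_WARN', '1 pgs degraded']
    >>> 'HEALTH_OK'.split(' ', 1)
    ['HEALTH_OK']   # single element, so detail falls back to the status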
Code Example #3
    def _get_fsid(self):
        try:
            response, fsid = self.service.ceph_api.fsid(body='text',
                                                        timeout=30)
        except IOError as e:
            LOG.warning(_LW("ceph_api.fsid failed: %s") % str(e.message))
            self.cluster_is_up = False
            return None

        if not response.ok:
            LOG.warning(_LW("Get fsid failed: %s") % response.reason)
            self.cluster_is_up = False
            return None

        self.cluster_is_up = True
        return fsid.strip()
Code Example #4
File: monitor.py  Project: bavery22/stx-upstream
 def auto_heal(self, health):
     if (health['health'] == constants.CEPH_HEALTH_WARN
             and (constants.CEPH_HEALTH_WARN_REQUIRE_JEWEL_OSDS_NOT_SET
                  in health['detail'])):
         try:
             upgrade = self.service.get_software_upgrade_status()
         except Exception as ex:
             LOG.warn(
                 _LW("Getting software upgrade status failed "
                     "with: %s. Skip auto-heal attempt "
                     "(will retry on next ceph status poll).") % str(ex))
             return
         state = upgrade.get('state')
         # suppress require_jewel_osds while an upgrade is
         # in progress but not completed or aborting
         if (not self.surpress_require_jewel_osds_warning
                 and (upgrade.get('from_version')
                      == constants.TITANIUM_SERVER_VERSION_16_10)
                 and state not in [
                     None, constants.UPGRADE_COMPLETED,
                     constants.UPGRADE_ABORTING,
                     constants.UPGRADE_ABORT_COMPLETING,
                     constants.UPGRADE_ABORTING_ROLLBACK
                 ]):
             LOG.info(_LI("Surpress require_jewel_osds health warning"))
             self.surpress_require_jewel_osds_warning = True
         # set require_jewel_osds when no upgrade is in progress
         # or the upgrade has completed
         if (state in [None, constants.UPGRADE_COMPLETED]):
             LOG.warn(
                 _LW("No upgrade in progress or update completed "
                     "and require_jewel_osds health warning raised. "
                     "Set require_jewel_osds flag."))
             self.set_flag_require_jewel_osds()
             health = self._remove_require_jewel_osds_warning(health)
             LOG.info(_LI("Unsurpress require_jewel_osds health warning"))
             self.surpress_require_jewel_osds_warning = False
         # stop suppressing require_jewel_osds when the upgrade
         # is aborting
         if (self.surpress_require_jewel_osds_warning and state in [
                 constants.UPGRADE_ABORTING,
                 constants.UPGRADE_ABORT_COMPLETING,
                 constants.UPGRADE_ABORTING_ROLLBACK
         ]):
             LOG.info(_LI("Unsurpress require_jewel_osds health warning"))
             self.surpress_require_jewel_osds_warning = False
     return health
Code Example #5
    def _select_ds_for_volume(self,
                              dc_moid,
                              cluster_moid,
                              ds_moid,
                              host_moid=None,
                              folders=None):
        """Select datastore that can accommodate the given volume's backing.

        Returns the selected datastore summary along with a compute host and
        its resource pool and folder where the volume can be created
        :return: (host, resource_pool, folder, summary)
        """
        dc_ref = utils.get_datacenter_moref(self._content, moid=dc_moid)
        if not dc_ref:
            LOG.error(_LE("No valid datacenter is available."))
            raise exception.NoValidDatacenter()

        resource_pool = None
        host_ref = None
        cluster_ref = utils.get_child_ref_by_moid(dc_ref.hostFolder,
                                                  cluster_moid)
        if not cluster_ref:
            LOG.warn(_LW("No valid cluster is available."))
            host_ref = utils.get_child_ref_by_moid(dc_ref.hostFolder,
                                                   host_moid)
            if not host_ref:
                LOG.error(_LE("No valid host is available."))
                raise exception.NoValidHost()
            # without a cluster, search the host's own datastores
            ds_candidates = host_ref.datastore
        else:
            resource_pool = cluster_ref.resourcePool
            host_ref = utils.get_ref_from_array_by_moid(cluster_ref.host,
                                                        host_moid)
            if not host_ref:
                LOG.warn(_LW("No valid host is specified."))
            ds_candidates = cluster_ref.datastore

        ds_ref = utils.get_ref_from_array_by_moid(ds_candidates, ds_moid)
        if not ds_ref:
            LOG.error(_LE("No valid datastore is available."))
            raise exception.NoValidDatastore()

        folder_ref = self._get_volume_group_folder(dc_ref, folders)

        return (resource_pool, host_ref, ds_ref, folder_ref)
Code Example #6
File: exception.py  Project: ShuichengLin/stx-integ
 def __init__(self, message=None, **kwargs):
     self.kwargs = kwargs
     if not message:
         try:
             message = self.message % kwargs
         except TypeError:
             LOG.warn(_LW('Exception in string format operation'))
             for name, value in kwargs.iteritems():
                 LOG.error("%s: %s" % (name, value))
             # at least get the core message out if something happened
             message = self.message
     super(CephManagerException, self).__init__(message)
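
Concrete subclasses are expected to carry a message class attribute whose named placeholders __init__ fills from kwargs. A sketch of what such a subclass could look like; the message text here is an assumption, not copied from the project:

    class CephPoolSetParamFailure(CephManagerException):
        # placeholders are filled from the kwargs passed to __init__
        message = ("Cannot set Ceph OSD pool parameter: "
                   "pool_name=%(name)s, param=%(param)s, value=%(value)s")

    # CephPoolSetParamFailure(name='rbd', param='target_max_objects', value=1)
    # produces a fully formatted message via the base __init__ above.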
Code Example #7
    def volumedriver_remove(self, name):
        """
        Remove a Docker volume.

        :param unicode name: The name of the volume.

        :return: Result indicating success.
        """
        contents = json.loads(name.content.getvalue())
        volname = contents['Name']

        # Only 1 node in a multinode cluster can try to remove the volume.
        # Grab lock for volume name. If lock is inuse, just return with no
        # error.
        self._lock_volume(volname, 'Remove')

        vol = self._etcd.get_vol_byname(volname)
        if vol is None:
            # Just log an error, but don't fail the docker rm command
            msg = _LE('Volume remove name not found: %s') % volname
            LOG.error(msg)
            self._unlock_volume(volname)
            return json.dumps({u"Err": ''})

        try:
            self.hpeplugin_driver.delete_volume(vol)
            LOG.info(_LI('volume: %(name)s was successfully deleted'),
                     {'name': volname})
        except Exception as ex:
            msg = (_LE('Err: Failed to remove volume %s, error is %s') %
                   (volname, six.text_type(ex)))
            LOG.error(msg)
            self._unlock_volume(volname)
            raise exception.HPEPluginRemoveException(reason=msg)

        try:
            self._etcd.delete_vol(vol)
        except KeyError:
            msg = (_LW('Warning: Failed to delete volume key: %s from '
                       'etcd due to KeyError') % volname)
            LOG.warning(msg)

        self._unlock_volume(volname)
        return json.dumps({u"Err": ''})
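
Docker's volume plugin protocol expects every response to be a JSON object with an Err key, where an empty string means success; that is why even the not-found path answers with an empty Err instead of raising. A quick illustration:

    import json

    ok = json.dumps({u"Err": ''})                  # '{"Err": ""}' -> success
    fail = json.dumps({u"Err": 'volume in use'})   # docker surfaces this text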
Code Example #8
class SysinvConductorUpgradeApi(object):
    def __init__(self):
        self.sysinv_conductor = None
        super(SysinvConductorUpgradeApi, self).__init__()

    def get_software_upgrade_status(self):
        LOG.info(_LI("Getting software upgrade status from sysinv"))
        cctxt = self.sysinv_conductor.prepare(timeout=2)
        upgrade = cctxt.call({}, 'get_software_upgrade_status')
        LOG.info(_LI("Software upgrade status: %s") % str(upgrade))
        return upgrade

    @retry(wait_fixed=1000,
           retry_on_exception=lambda exception: LOG.warn(
               _LW("Getting software upgrade status failed "
                   "with: %s. Retrying... ") % str(exception)) or True)
    def retry_get_software_upgrade_status(self):
        return self.get_software_upgrade_status()
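
The retry_on_exception callback is used for its side effect: LOG.warn() returns None, so the trailing "or True" makes the lambda log every failure and still tell the retry machinery to keep retrying indefinitely. The same trick written as a named function, assuming the retrying library whose signature the decorator matches:

    import logging
    from retrying import retry

    LOG = logging.getLogger(__name__)

    def _log_and_retry(exc):
        # logging returns None, so an explicit return avoids the "or True" trick
        LOG.warning("Getting software upgrade status failed with: %s. "
                    "Retrying...", exc)
        return True  # always retry

    @retry(wait_fixed=1000, retry_on_exception=_log_and_retry)
    def fetch_status():
        ...  # stand-in for get_software_upgrade_status()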
Code Example #9
    def update_cache_target_max_bytes(self):
        "Dynamically compute target_max_bytes of caching pools"

        # Only compute if cache tiering is enabled
        if self.config_applied and self.config_desired:
            if (not self.config_desired.cache_enabled
                    or not self.config_applied.cache_enabled):
                LOG.debug("Cache tiering disabled, no need to update "
                          "target_max_bytes.")
                return
        LOG.debug("Updating target_max_bytes")

        # Get available space
        response, body = self.service.ceph_api.osd_df(body='json',
                                                      output_method='tree')
        if not response.ok:
            LOG.warn(
                _LW("Failed to retrieve cluster free space stats: "
                    "status_code=%d, reason=%s") %
                (response.status_code, response.reason))
            return

        storage_tier_size = 0
        cache_tier_size = 0

        replication = constants.CEPH_REPLICATION_FACTOR
        for node in body['output']['nodes']:
            if node['name'] == 'storage-tier':
                storage_tier_size = node['kb'] * 1024 / replication
            elif node['name'] == 'cache-tier':
                cache_tier_size = node['kb'] * 1024 / replication

        if storage_tier_size == 0 or cache_tier_size == 0:
            LOG.info("Failed to get cluster size "
                     "(storage_tier_size=%s, cache_tier_size=%s),"
                     "retrying on next cycle" %
                     (storage_tier_size, cache_tier_size))
            return

        # Get available pools
        response, body = self.service.ceph_api.osd_lspools(body='json')
        if not response.ok:
            LOG.warn(
                _LW("Failed to retrieve available pools: "
                    "status_code=%d, reason=%s") %
                (response.status_code, response.reason))
            return
        pools = [p['poolname'] for p in body['output']]

        # Separate backing from caching for easy iteration
        backing_pools = []
        caching_pools = []
        for p in pools:
            if p.endswith('-cache'):
                caching_pools.append(p)
            else:
                backing_pools.append(p)
        LOG.debug("Pools: caching: %s, backing: %s" %
                  (caching_pools, backing_pools))

        if not caching_pools:
            # We do not have caching pools created yet
            return

        # Get quota from backing pools that are cached
        stats = {}
        for p in caching_pools:
            backing_name = p.replace('-cache', '')
            stats[backing_name] = {}
            try:
                quota = ceph.osd_pool_get_quota(self.service.ceph_api,
                                                backing_name)
            except exception.CephPoolGetQuotaFailure as e:
                LOG.warn(
                    _LW("Failed to retrieve quota: "
                        "exception: %s") % str(e))
                return
            stats[backing_name]['quota'] = quota['max_bytes']
            stats[backing_name]['quota_pt'] = (quota['max_bytes'] * 100.0 /
                                               storage_tier_size)
            LOG.debug("Quota for pool: %s "
                      "is: %s B representing %s pt" %
                      (backing_name, quota['max_bytes'],
                       stats[backing_name]['quota_pt']))

        # target_max_bytes logic:
        # - for computing target_max_bytes, cache_tier_size must equal
        #   the sum of target_max_bytes over all caching pools
        # - target_max_bytes for each caching pool is computed as the
        #   percentage of quota in the corresponding backing pool
        # - the caching tier has to work at full capacity, so if the sum of
        #   all quotas in the backing tier differs from 100% we need to
        #   normalize
        # - if the quota is zero for any pool we add CACHE_TIERING_MIN_QUOTA
        #   *after* normalization so that we have a real minimum

        # We compute the real percentage that needs to be normalized after
        # ensuring that we have CACHE_TIERING_MIN_QUOTA for each pool with
        # a quota of 0
        real_100pt = 90.0  # start from the max and decrease it for each 0 pool
        # Note: we must avoid reaching 100% at all costs;
        # cache_target_full_ratio, the Ceph parameter that is supposed to
        # protect the cluster against this, does not work in Ceph v0.94.6,
        # so a value of 90% is better suited here
        for p in caching_pools:
            backing_name = p.replace('-cache', '')
            if stats[backing_name]['quota_pt'] == 0:
                real_100pt -= constants.CACHE_TIERING_MIN_QUOTA
            LOG.debug("Quota before normalization for %s is: %s pt" %
                      (p, stats[backing_name]['quota_pt']))

        # Compute total percentage of quotas for all backing pools.
        # Should be 100% if correctly configured
        total_quota_pt = 0
        for p in caching_pools:
            backing_name = p.replace('-cache', '')
            total_quota_pt += stats[backing_name]['quota_pt']
        LOG.debug("Total quota pt is: %s" % total_quota_pt)

        # Normalize quota pt to 100% (or real_100pt)
        if total_quota_pt != 0:  # to avoid divide by zero
            for p in caching_pools:
                backing_name = p.replace('-cache', '')
                stats[backing_name]['quota_pt'] = \
                    (stats[backing_name]['quota_pt'] *
                     (real_100pt / total_quota_pt))

        # Do not allow quota to be 0 for any pool
        total = 0
        for p in caching_pools:
            backing_name = p.replace('-cache', '')
            if stats[backing_name]['quota_pt'] == 0:
                stats[backing_name]['quota_pt'] = \
                    constants.CACHE_TIERING_MIN_QUOTA
            total += stats[backing_name]['quota_pt']
            LOG.debug("Quota after normalization for %s is: %s:" %
                      (p, stats[backing_name]['quota_pt']))

        if total > 100:
            # Supplementary protection, we really have to avoid going above
            # 100%. Note that real_100pt is less than 100% but we still got
            # more than 100!
            LOG.warn("Total sum of quotas should not go above 100% "
                     "but is: %s, recalculating in next cycle" % total)
            return
        LOG.debug("Total sum of quotas is %s pt" % total)

        # Get current target_max_bytes. We cache it to reduce requests
        # to ceph-rest-api. We are the ones changing it, so not an issue.
        for p in caching_pools:
            if p not in self.target_max_bytes:
                try:
                    value = ceph.osd_get_pool_param(self.service.ceph_api, p,
                                                    constants.TARGET_MAX_BYTES)
                except exception.CephPoolGetParamFailure as e:
                    LOG.warn(e)
                    return
                self.target_max_bytes[p] = value
        LOG.debug("Existing target_max_bytes got from "
                  "Ceph: %s" % self.target_max_bytes)

        # Set TARGET_MAX_BYTES
        LOG.debug("storage_tier_size: %s "
                  "cache_tier_size: %s" % (storage_tier_size, cache_tier_size))
        for p in caching_pools:
            backing_name = p.replace('-cache', '')
            s = stats[backing_name]
            target_max_bytes = math.floor(s['quota_pt'] * cache_tier_size /
                                          100.0)
            target_max_bytes = int(target_max_bytes)
            LOG.debug("New Target max bytes of pool: %s is: %s B" %
                      (p, target_max_bytes))

            # Set the new target_max_bytes only if it changed
            if self.target_max_bytes.get(p) == target_max_bytes:
                LOG.debug("Target max bytes of pool: %s "
                          "is already updated" % p)
                continue
            try:
                ceph.osd_set_pool_param(self.service.ceph_api, p,
                                        constants.TARGET_MAX_BYTES,
                                        target_max_bytes)
                self.target_max_bytes[p] = target_max_bytes
            except exception.CephPoolSetParamFailure as e:
                LOG.warn(e)
                continue
        return
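
The normalization above is easier to follow with concrete numbers. A self-contained sketch of the same arithmetic, with the constant as a stand-in value rather than the module's:

    import math

    CACHE_TIERING_MIN_QUOTA = 5  # pt; stand-in value

    def normalized_target_max_bytes(quota_pt, cache_tier_size):
        """quota_pt: {backing_pool: quota as a percent of the storage tier}."""
        real_100pt = 90.0  # stay clear of 100% of the cache tier
        for pt in quota_pt.values():
            if pt == 0:
                real_100pt -= CACHE_TIERING_MIN_QUOTA
        total = sum(quota_pt.values())
        out = {}
        for pool, pt in quota_pt.items():
            scaled = pt * real_100pt / total if total else 0
            scaled = scaled or CACHE_TIERING_MIN_QUOTA  # floor zero quotas
            out[pool] = int(math.floor(scaled * cache_tier_size / 100.0))
        return out

    # With quotas of 60/30/0 pt: real_100pt drops to 85, the pools scale to
    # ~56.7 pt, ~28.3 pt and 5 pt, and the sum stays at 90 pt of the cache tier.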
Code Example #10
    def do_enable_cache(self, new_config, applied_config, lock_ownership):
        LOG.info(
            _LI("cache_tiering_enable_cache: "
                "new_config={}, applied_config={}").format(
                    new_config.to_dict(), applied_config.to_dict()))
        _unwind_actions = []
        with lock_ownership():
            success = False
            _exception = None
            try:
                self.config_desired.cache_enabled = True
                self.update_pools_info()
                for pool in CEPH_POOLS:
                    if (pool['pool_name']
                            == constants.CEPH_POOL_OBJECT_GATEWAY_NAME_JEWEL
                            or pool['pool_name']
                            == constants.CEPH_POOL_OBJECT_GATEWAY_NAME_HAMMER):
                        object_pool_name = \
                            self.service.monitor._get_object_pool_name()
                        pool['pool_name'] = object_pool_name

                    self.cache_pool_create(pool)
                    _unwind_actions.append(
                        functools.partial(self.cache_pool_delete, pool))
                for pool in CEPH_POOLS:
                    if (pool['pool_name']
                            == constants.CEPH_POOL_OBJECT_GATEWAY_NAME_JEWEL
                            or pool['pool_name']
                            == constants.CEPH_POOL_OBJECT_GATEWAY_NAME_HAMMER):
                        object_pool_name = \
                            self.service.monitor._get_object_pool_name()
                        pool['pool_name'] = object_pool_name

                    self.cache_tier_add(pool)
                    _unwind_actions.append(
                        functools.partial(self.cache_tier_remove, pool))
                for pool in CEPH_POOLS:
                    if (pool['pool_name']
                            == constants.CEPH_POOL_OBJECT_GATEWAY_NAME_JEWEL
                            or pool['pool_name']
                            == constants.CEPH_POOL_OBJECT_GATEWAY_NAME_HAMMER):
                        object_pool_name = \
                            self.service.monitor._get_object_pool_name()
                        pool['pool_name'] = object_pool_name

                    self.cache_mode_set(pool, 'writeback')
                    self.cache_pool_set_config(pool, new_config)
                    self.cache_overlay_create(pool)
                success = True
            except Exception as e:
                LOG.error(
                    _LE('Failed to enable cache: reason=%s') %
                    traceback.format_exc())
                for action in reversed(_unwind_actions):
                    try:
                        action()
                    except Exception:
                        LOG.warn(
                            _LW('Failed cache enable '
                                'unwind action: reason=%s') %
                            traceback.format_exc())
                success = False
                _exception = str(e)
            finally:
                self.service.monitor.monitor_check_cache_tier(success)
                if success:
                    self.config_applied.cache_enabled = True
                self.service.sysinv_conductor.call(
                    {},
                    'cache_tiering_enable_cache_complete',
                    success=success,
                    exception=_exception,
                    new_config=new_config.to_dict(),
                    applied_config=applied_config.to_dict())
                # Run first update of periodic target_max_bytes
                self.update_cache_target_max_bytes()
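
_unwind_actions is a rollback stack: every completed step pushes its inverse, bound to the pool via functools.partial, and on failure the inverses run newest-first. The bare pattern, stripped of the Ceph specifics:

    def run_transaction(steps):
        """steps: iterable of (do, undo) pairs; undo completed work on failure."""
        unwind = []
        try:
            for do, undo in steps:
                do()
                unwind.append(undo)  # the method above binds args with functools.partial
            return True
        except Exception:
            for undo in reversed(unwind):
                try:
                    undo()
                except Exception:
                    pass  # keep unwinding; the method above logs and continues
            return False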
Code Example #11
File: vmopt.py  Project: lilingxing20/pyvmomi-samples
    def _consolidate_vmdk_volume(self,
                                 instance,
                                 vm_ref,
                                 device,
                                 volume_ref,
                                 adapter_type=None,
                                 disk_type=None):
        """Consolidate volume backing VMDK files if needed.

        The volume's VMDK file attached to an instance can be moved by SDRS
        if enabled on the cluster.
        By this the VMDK files can get copied onto another datastore and the
        copy on this new location will be the latest version of the VMDK file.
        So at the time of detach, we need to consolidate the current backing
        VMDK file with the VMDK file in the new location.

        We need to ensure that the VMDK chain (snapshots) remains intact during
        the consolidation. SDRS retains the chain when it copies VMDK files
        over, so for consolidation we relocate the backing with move option
        as moveAllDiskBackingsAndAllowSharing and then delete the older version
        of the VMDK file attaching the new version VMDK file.

        In the case of a volume boot the we need to ensure that the volume
        is on the datastore of the instance.
        """

        original_device = self._get_vmdk_base_volume_device(volume_ref)

        original_device_path = original_device.backing.fileName
        current_device_path = device.backing.fileName

        if original_device_path == current_device_path:
            # The volume is not moved from its original location.
            # No consolidation is required.
            LOG.debug(
                "The volume has not been displaced from "
                "its original location: %s. No consolidation "
                "needed.", current_device_path)
            return

        # The volume has been moved from its original location.
        # Need to consolidate the VMDK files.
        LOG.info(
            _LI("The volume's backing has been relocated to %s. Need to "
                "consolidate backing disk file."), current_device_path)

        # Pick the host and resource pool on which the instance resides.
        # Move the volume to the datastore where the new VMDK file is present.
        host = vm_ref.runtime.host
        res_pool = host.parent.resourcePool
        datastore = device.backing.datastore
        detached = False
        LOG.debug(
            "Relocating volume's backing: %(backing)s to resource "
            "pool: %(rp)s, datastore: %(ds)s, host: %(host)s.", {
                'backing': volume_ref,
                'rp': res_pool,
                'ds': datastore,
                'host': host
            })
        try:
            self._relocate_vmdk_volume(volume_ref, res_pool, datastore, host)
        except exception.FileNotFound:
            # Volume's vmdk was moved; remove the device so that we can
            # relocate the volume.
            LOG.warn(_LW("Virtual disk: %s of volume's backing not found."),
                     original_device_path,
                     exc_info=True)
            LOG.debug("Removing disk device of volume's backing and "
                      "reattempting relocate.")
            self.detach_disk_from_vm(volume_ref, instance, original_device)
            detached = True
            self._relocate_vmdk_volume(volume_ref, res_pool, datastore, host)

        # Volume's backing is relocated now; detach the old vmdk if not done
        # already.
        if not detached:
            self.detach_disk_from_vm(volume_ref,
                                     instance,
                                     original_device,
                                     destroy_disk=True)

        # Attach the current volume to the volume_ref
        self._attach_disk_to_vm(volume_ref,
                                instance,
                                adapter_type,
                                disk_type,
                                vmdk_path=current_device_path)
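
_relocate_vmdk_volume itself is not shown in this excerpt; given the docstring, it presumably wraps a pyVmomi RelocateVM_Task call with the moveAllDiskBackingsAndAllowSharing disk move type, along these lines (a sketch under that assumption):

    from pyVmomi import vim

    def relocate_backing(vm, resource_pool, datastore, host):
        # keep the full disk chain intact and allow sharing, as described above
        spec = vim.vm.RelocateSpec(
            pool=resource_pool,
            datastore=datastore,
            host=host,
            diskMoveType='moveAllDiskBackingsAndAllowSharing')
        return vm.RelocateVM_Task(spec)  # a vim.Task the caller can wait on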