def get_overrides(self, namespace=None):
        replicas = 2
        if utils.is_aio_system(self.dbapi):
            if utils.is_aio_simplex_system(self.dbapi):
                replicas = 1

        if (utils.is_aio_system(self.dbapi)
                and not self._is_distributed_cloud_role_system_controller()):
            esJavaOpts = \
                "-Djava.net.preferIPv6Addresses=true -Xmx512m -Xms512m"
        else:
            esJavaOpts = \
                "-Djava.net.preferIPv6Addresses=true -Xmx1024m -Xms1024m"

        overrides = {
            common.HELM_NS_MONITOR: {
                'replicas': replicas,
                'esJavaOpts': esJavaOpts,
                'nodeSelector': {
                    common.LABEL_MONITOR_CLIENT: "enabled"
                },
                'resources': self._get_client_resources_overrides(),
            }
        }

        if namespace in self.SUPPORTED_NAMESPACES:
            return overrides[namespace]
        elif namespace:
            raise exception.InvalidHelmNamespace(chart=self.CHART,
                                                 namespace=namespace)
        else:
            return overrides
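A minimal usage sketch of the namespace dispatch above. The `chart` instance is a hypothetical example (any chart class using this pattern, with SUPPORTED_NAMESPACES containing common.HELM_NS_MONITOR):

    # Returns only the monitor-namespace overrides dict.
    monitor_overrides = chart.get_overrides(namespace=common.HELM_NS_MONITOR)

    # Returns the full {namespace: overrides} mapping.
    all_overrides = chart.get_overrides()

    # Any other non-empty namespace raises exception.InvalidHelmNamespace.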
Example #2
    def get_monitors_status(self, db_api):
        num_inv_monitors = 0
        if cutils.is_aio_system(db_api):
            required_monitors = constants.MIN_STOR_MONITORS_AIO
        else:
            required_monitors = constants.MIN_STOR_MONITORS_MULTINODE
        quorum_names = []
        inventory_monitor_names = []

        # first check that the monitors are available in sysinv
        monitor_list = db_api.ceph_mon_get_list()
        for mon in monitor_list:
            ihost = db_api.ihost_get(mon['forihostid'])
            host_action = ihost['ihost_action'] or ""
            locking = (host_action.startswith(constants.LOCK_ACTION)
                       or host_action.startswith(constants.FORCE_LOCK_ACTION))
            if (ihost['administrative'] == constants.ADMIN_UNLOCKED
                    and ihost['operational'] == constants.OPERATIONAL_ENABLED
                    and not locking):
                num_inv_monitors += 1
                inventory_monitor_names.append(ihost['hostname'])

        LOG.info("Active ceph monitors in inventory = %s" %
                 str(inventory_monitor_names))

        # check that the cluster is actually operational.
        # if we can get the monitor quorum from ceph, then
        # the cluster is truly operational
        if num_inv_monitors >= required_monitors:
            try:
                quorum_names = self._osd_quorum_names()
            except Exception:
                # if the cluster is not responding to requests
                # we set quorum_names to an empty list, indicating a problem
                quorum_names = []
                LOG.error("Ceph cluster not responding to requests.")

        LOG.info("Active ceph monitors in ceph cluster = %s" %
                 str(quorum_names))

        # There may be cases where a host is in an unlocked-available state,
        # but the monitor is down due to crashes or manual removal.
        # For such cases, we determine the list of active ceph monitors to be
        # the intersection of the sysinv reported unlocked-available monitor
        # hosts and the monitors reported in the quorum via the ceph API.
        active_monitors = list(
            set(inventory_monitor_names) & set(quorum_names))
        num_active_monitors = len(active_monitors)
        if (num_inv_monitors and num_active_monitors == 0
                and cutils.is_initial_config_complete()
                and not cutils.is_aio_system(db_api)):
            # The active controller always has a monitor.
            # We are on standard or storage, initial configuration
            # was completed and Ceph is down so we can't check if
            # it is working. Assume it is.
            num_active_monitors = 1
        LOG.info("Active ceph monitors = %s" % str(active_monitors))

        return num_active_monitors, required_monitors, active_monitors
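Callers typically unpack the returned three-tuple to gate Ceph operations on monitor availability, as the `_check_host` example later in this listing shows. A minimal hedged sketch (the `ceph_helper` and `db_api` names are assumptions for illustration):

    num_active, required, active_names = ceph_helper.get_monitors_status(db_api)
    if num_active < required:
        # Not enough unlocked/enabled monitors in quorum; refuse the operation.
        LOG.warning("Only %d of %d required ceph monitors active: %s" %
                    (num_active, required, str(active_names)))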
    def get_overrides(self, namespace=None):

        combined_data_and_master = False
        replicas = 2
        if utils.is_aio_system(self.dbapi):
            if (utils.is_aio_duplex_system(self.dbapi)
                    and self._count_hosts_by_label(
                        common.LABEL_MONITOR_MASTER) < 3):
                # For AIO-DX without master labelled worker nodes,
                # configure elasticsearch data pods as master capable,
                # so they will form a cluster of 3 masters with the single
                # elasticsearch master pod.
                combined_data_and_master = True

            if utils.is_aio_simplex_system(self.dbapi):
                replicas = 1

        if (utils.is_aio_system(self.dbapi)
                and not self._is_distributed_cloud_role_system_controller()):
            esJavaOpts = \
                "-Djava.net.preferIPv6Addresses=true -Xmx1536m -Xms1536m"
        else:
            esJavaOpts = \
                "-Djava.net.preferIPv6Addresses=true -Xmx4096m -Xms4096m"

        overrides = {
            common.HELM_NS_MONITOR: {
                'nodeGroup': 'data',
                'replicas': replicas,
                'esJavaOpts': esJavaOpts,
                'resources': self._get_data_resources_overrides(),
                'volumeClaimTemplate': {
                    'accessModes': ["ReadWriteOnce"],
                    'resources': {
                        'requests': {
                            'storage': str(self.DATA_VOLUME_SIZE_GB) + 'Gi'
                        }
                    },
                    'storageClass': 'general'
                },
                'nodeSelector': {
                    common.LABEL_MONITOR_DATA: "enabled"
                },
                'antiAffinity': "hard",
            }
        }

        if combined_data_and_master:
            overrides[common.HELM_NS_MONITOR]['roles'] = {'master': 'true'}
            overrides[common.HELM_NS_MONITOR]['minimumMasterNodes'] = 1

        if namespace in self.SUPPORTED_NAMESPACES:
            return overrides[namespace]
        elif namespace:
            raise exception.InvalidHelmNamespace(chart=self.CHART,
                                                 namespace=namespace)
        else:
            return overrides
    def _get_data_overrides(self):
        # Note memory values are to be system engineered.

        if utils.is_aio_system(self.dbapi):
            heap_size = "512m"
            memory_size = "512Mi"
        else:
            heap_size = "1536m"
            memory_size = "1536Mi"

        conf = {
            'replicas': self._count_hosts_by_label(common.LABEL_MONITOR_DATA),
            'heapSize': heap_size,
            'resources': {
                'limits': {
                    'cpu': "1"
                },
                'requests': {
                    'cpu': "25m",
                    'memory': memory_size,
                },
            },
            'persistence': {
                'storageClass': 'general',
                'size': "100Gi"
            },
            'nodeSelector': {
                common.LABEL_MONITOR_DATA: "enabled"
            },
        }
        return conf
    def get_overrides(self, namespace=None):

        minimumMasterNodes = 1

        replicas = 3
        if utils.is_aio_system(self.dbapi):
            if self._count_hosts_by_label(common.LABEL_MONITOR_MASTER) < 3:
                # For AIO-SX, we will get here by definition, as there will
                # only be 1 master labelled host.
                # For AIO-DX without master labelled worker, we only
                # need 1 elasticsearch master pod, as the 2 data
                # pods will be master capable to form a cluster of 3 masters.
                replicas = 1

        if (utils.is_aio_system(self.dbapi) and not
                self._is_distributed_cloud_role_system_controller()):
            esJavaOpts = "-Djava.net.preferIPv6Addresses=true -Xmx256m -Xms256m"
        else:
            esJavaOpts = "-Djava.net.preferIPv6Addresses=true -Xmx512m -Xms512m"

        overrides = {
            common.HELM_NS_MONITOR: {
                'nodeGroup': 'master',
                'replicas': replicas,
                'esJavaOpts': esJavaOpts,
                'minimumMasterNodes': minimumMasterNodes,
                'nodeSelector': {common.LABEL_MONITOR_MASTER: "enabled"},
                'resources': self._get_master_resource_overrides(),
                'volumeClaimTemplate': {
                    'accessModes': ["ReadWriteOnce"],
                    'resources': {
                        'requests': {'storage': '4Gi'}
                    },
                    'storageClass': 'general'
                },
            }
        }

        if namespace in self.SUPPORTED_NAMESPACES:
            return overrides[namespace]
        elif namespace:
            raise exception.InvalidHelmNamespace(chart=self.CHART,
                                                 namespace=namespace)
        else:
            return overrides
Example #6
def _check_host(stor):
    ihost_id = stor['forihostid']
    ihost = pecan.request.dbapi.ihost_get(ihost_id)
    stor_model = ceph.get_ceph_storage_model()

    # semantic check: whether OSD can be added to this host.
    if stor_model == constants.CEPH_STORAGE_MODEL:
        if ihost.personality != constants.STORAGE:
            msg = ("Storage model is '%s'. Storage devices can only be added "
                   "to storage nodes." % stor_model)
            raise wsme.exc.ClientSideError(_(msg))
    elif stor_model == constants.CEPH_CONTROLLER_MODEL:
        if ihost.personality != constants.CONTROLLER:
            msg = ("Storage model is '%s'. Storage devices can only be added "
                   "to controller nodes." % stor_model)
            raise wsme.exc.ClientSideError(_(msg))
    elif stor_model == constants.CEPH_UNDEFINED_MODEL:
        msg = ("Please install storage-0 or configure a Ceph monitor "
               "on a worker node before adding storage devices.")
        raise wsme.exc.ClientSideError(_(msg))

    # semantic check: whether host is operationally acceptable
    if (stor_model == constants.CEPH_CONTROLLER_MODEL or
            stor_model == constants.CEPH_AIO_SX_MODEL):
        if (ihost['administrative'] == constants.ADMIN_UNLOCKED and
                ihost['operational'] != constants.OPERATIONAL_ENABLED):
            msg = _("Host %s must be unlocked and operational state "
                    "enabled." % ihost['hostname'])
            raise wsme.exc.ClientSideError(msg)
    else:
        if ihost['administrative'] != constants.ADMIN_LOCKED:
            raise wsme.exc.ClientSideError(_("Host %s must be locked." %
                                             ihost['hostname']))

    # semantic check: whether system has a ceph backend
    if not StorageBackendConfig.has_backend_configured(
            pecan.request.dbapi,
            constants.SB_TYPE_CEPH
    ):
        raise wsme.exc.ClientSideError(_(
            "System must have a %s backend" % constants.SB_TYPE_CEPH))

    # semantic check: whether at least 2 unlocked hosts are monitors
    if not cutils.is_aio_system(pecan.request.dbapi):
        ceph_helper = ceph.CephApiOperator()
        num_monitors, required_monitors, __ = \
            ceph_helper.get_monitors_status(pecan.request.dbapi)
        # CGTS 503: for now, update the monitors requirement until
        # controller-0 is inventoried
        # CGTS 1448
        if num_monitors < required_monitors:
            raise wsme.exc.ClientSideError(_(
                "Only %d storage monitor available. "
                "At least %s unlocked and enabled hosts with monitors are "
                "required. Please ensure hosts with monitors are unlocked "
                "and enabled.") % (num_monitors, required_monitors))
    def _get_master_overrides(self):
        if utils.is_aio_system(self.dbapi):
            heap_size = "256m"
        else:
            heap_size = "512m"

        conf = {
            'replicas':
            self._count_hosts_by_label(common.LABEL_MONITOR_CONTROLLER),
            'heapSize': heap_size,
            'nodeSelector': {
                common.LABEL_MONITOR_CONTROLLER: "enabled"
            },
        }
        return conf
Example #8
    def _get_resources_overrides(self):

        if (utils.is_aio_system(self.dbapi)
                and not self._is_distributed_cloud_role_system_controller()):
            cpu_limits = "500m"
            memory_limits = "1024Mi"
        else:
            cpu_limits = "500m"
            memory_limits = "2048Mi"

        return {
            'limits': {
                'cpu': cpu_limits,
                'memory': memory_limits
            },
        }
    def _get_master_resource_overrides(self):
        if (utils.is_aio_system(self.dbapi) and not
                self._is_distributed_cloud_role_system_controller()):
            cpu_requests = "200m"
            memory_size = "256Mi"
        else:
            cpu_requests = "500m"
            memory_size = "512Mi"

        resources = {
            'requests': {
                'cpu': cpu_requests,
                'memory': memory_size
            },
            'limits': {
                'cpu': "1",
                'memory': "1024Mi"
            },
        }
        return resources
    def _get_client_resources_overrides(self):
        if (utils.is_aio_system(self.dbapi)
                and not self._is_distributed_cloud_role_system_controller()):
            cpu_requests = "50m"
            cpu_limits = "1"  # high watermark
            memory_size = "1024Mi"
        else:
            cpu_requests = "100m"
            cpu_limits = "1"  # high watermark
            memory_size = "2048Mi"

        resources = {
            'requests': {
                'cpu': cpu_requests,
                'memory': memory_size
            },
            'limits': {
                'cpu': cpu_limits,
                'memory': memory_size
            }
        }
        return resources
    def _get_data_resources_overrides(self):
        # Default values based upon AIO+4 and Standard+20 system test

        if (utils.is_aio_system(self.dbapi)
                and not self._is_distributed_cloud_role_system_controller()):
            cpu_requests = "200m"
            cpu_limits = "1"
            memory_size = "4096Mi"
        else:
            cpu_requests = "500m"
            cpu_limits = "2"
            memory_size = "6144Mi"

        resources = {
            'requests': {
                'cpu': cpu_requests,
                'memory': memory_size
            },
            'limits': {
                'cpu': cpu_limits,
                'memory': memory_size
            }
        }
        return resources
Example #12
def fix_crushmap(dbapi=None):
    """ Set Ceph's CRUSH Map based on storage model """
    def _create_crushmap_flag_file():
        try:
            open(crushmap_flag_file, "w").close()
        except IOError as e:
            LOG.warn(('Failed to create flag file: {}. '
                      'Reason: {}').format(crushmap_flag_file, e))

    if not dbapi:
        dbapi = pecan.request.dbapi
    crushmap_flag_file = os.path.join(constants.SYSINV_CONFIG_PATH,
                                      constants.CEPH_CRUSH_MAP_APPLIED)

    if not os.path.isfile(crushmap_flag_file):
        _operator = CephApiOperator()
        if not cutils.is_aio_system(dbapi):
            # At least two monitors have to be running on a standard deployment,
            # otherwise don't even try to load the crushmap.
            active_mons, required_mons, __ = _operator.get_monitors_status(
                dbapi)
            if required_mons > active_mons:
                LOG.info("Not enough monitors yet available to fix crushmap.")
                return False

        # For AIO system, crushmap should be already loaded through puppet.
        # If it was loaded, set the crushmap flag to avoid loading it twice.
        default_ceph_tier_name = constants.SB_TIER_DEFAULT_NAMES[
            constants.SB_TIER_TYPE_CEPH] + constants.CEPH_CRUSH_TIER_SUFFIX
        rule_is_present, __, __ = _operator._crush_rule_status(
            default_ceph_tier_name)
        if rule_is_present:
            _create_crushmap_flag_file()
            return False

        try:
            # For AIO systems, the crushmap should already be loaded through
            # puppet. If for any reason it is not, as a precaution we set
            # the crushmap here.

            # Check if a backup crushmap exists. If it does, we are in the
            # middle of a restore and need to restore the backup crushmap
            # instead of generating a new one. On non-AIO systems the backup
            # is stored in /opt/platform/sysinv, which is a DRBD filesystem.
            # On AIO systems the crushmap is set through ceph puppet when
            # unlocking controller-0 for the first time, before /opt/platform
            # is mounted, so we store the crushmap in /etc/sysinv.

            if cutils.is_aio_system(dbapi):
                backup = os.path.join(
                    constants.CEPH_CRUSH_MAP_BACKUP_DIR_FOR_AIO,
                    constants.CEPH_CRUSH_MAP_BACKUP)
            else:
                backup = os.path.join(constants.SYSINV_CONFIG_PATH,
                                      constants.CEPH_CRUSH_MAP_BACKUP)
            crushmap_bin = "/etc/sysinv/crushmap.bin"
            if os.path.exists(backup):
                shutil.copyfile(backup, crushmap_bin)
            else:
                stor_model = get_ceph_storage_model(dbapi)
                if stor_model == constants.CEPH_AIO_SX_MODEL:
                    crushmap_txt = "/etc/sysinv/crushmap-aio-sx.txt"
                elif stor_model == constants.CEPH_CONTROLLER_MODEL:
                    crushmap_txt = "/etc/sysinv/crushmap-controller-model.txt"
                elif stor_model == constants.CEPH_STORAGE_MODEL:
                    crushmap_txt = "/etc/sysinv/crushmap-storage-model.txt"
                else:
                    reason = "Error: Undefined ceph storage model %s" % stor_model
                    raise exception.CephCrushMapNotApplied(reason=reason)
                LOG.info("Updating crushmap with: %s" % crushmap_txt)

                # Compile crushmap
                subprocess.check_output("crushtool -c %s "
                                        "-o %s" % (crushmap_txt, crushmap_bin),
                                        stderr=subprocess.STDOUT,
                                        shell=True)
            # Set crushmap
            subprocess.check_output("ceph osd setcrushmap -i %s" %
                                    crushmap_bin,
                                    stderr=subprocess.STDOUT,
                                    shell=True)

            if os.path.exists(backup):
                os.remove(backup)
        except (IOError, subprocess.CalledProcessError) as e:
            # May not be critical, depends on where this is called.
            reason = "Error: %s Output: %s" % (str(e), e.output)
            raise exception.CephCrushMapNotApplied(reason=reason)

        _create_crushmap_flag_file()

        return True
    return False
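A hedged sketch of how a caller might invoke fix_crushmap and treat its outcomes (the surrounding sysinv conductor context is assumed; depending on the call site, a failure to apply the crushmap may not be critical, so it is only logged here):

    try:
        applied = fix_crushmap(dbapi)
        if applied:
            LOG.info("Ceph crushmap applied.")
        else:
            LOG.info("Ceph crushmap already applied or not yet applicable.")
    except exception.CephCrushMapNotApplied as e:
        # May be retried later once monitors are up and Ceph is responsive.
        LOG.warning("Failed to apply ceph crushmap: %s" % e)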
Example #13
def _check(self, op, tier):
    # Semantic checks
    LOG.debug("storage_tier: Semantic check for %s operation" % op)

    # Check storage tier parameters
    _check_parameters(tier)

    if op == "add":
        # See if this storage tier already exists
        tiers = pecan.request.dbapi.storage_tier_get_all(name=tier['name'])
        if len(tiers) != 0:
            raise wsme.exc.ClientSideError(
                _("Storage tier (%s) "
                  "already present." % tier['name']))

        # Deny adding secondary tier if initial configuration is not done.
        if not cutils.is_initial_config_complete():
            msg = _(
                "Operation denied. Adding secondary tiers to a cluster requires "
                "initial configuration to be complete and controller node unlocked."
            )
            raise wsme.exc.ClientSideError(msg)

        if cutils.is_aio_system(pecan.request.dbapi):
            # Deny adding secondary tiers if primary tier backend is not configured
            # for cluster.
            clusterId = tier.get('forclusterid') or tier.get('cluster_uuid')
            cluster_tiers = pecan.request.dbapi.storage_tier_get_by_cluster(
                clusterId)
            configured = False if cluster_tiers else True
            for t in cluster_tiers:
                if t.forbackendid:
                    bk = pecan.request.dbapi.storage_backend_get(
                        t.forbackendid)
                    if bk.state != constants.SB_STATE_CONFIGURED:
                        msg = _("Operation denied. Storage backend '%s' "
                                "of tier '%s' must be in '%s' state." %
                                (bk.name, t['name'],
                                 constants.SB_STATE_CONFIGURED))
                        raise wsme.exc.ClientSideError(msg)
                    configured = True
            if not configured:
                msg = _(
                    "Operation denied. Adding secondary tiers to a cluster requires "
                    "primary tier storage backend of this cluster to be configured."
                )
                raise wsme.exc.ClientSideError(msg)
        else:
            # Deny adding secondary tier if ceph is down on standard
            num_monitors, required_monitors, __ = \
                self._ceph.get_monitors_status(pecan.request.dbapi)
            if num_monitors < required_monitors:
                raise wsme.exc.ClientSideError(
                    _("Operation denied. Ceph is not operational. "
                      "Only %d storage monitor available. "
                      "At least %s unlocked and enabled hosts with "
                      "monitors are required. Please ensure hosts "
                      "with monitors are unlocked and enabled.") %
                    (num_monitors, required_monitors))

    elif op == "delete":
        if tier['name'] == constants.SB_TIER_DEFAULT_NAMES[
                constants.SB_TIER_TYPE_CEPH]:
            raise wsme.exc.ClientSideError(
                _("Storage Tier %s cannot be "
                  "deleted.") % tier['name'])

        if tier['status'] != constants.SB_TIER_STATUS_DEFINED:
            raise wsme.exc.ClientSideError(
                _("Storage Tier %s cannot be "
                  "deleted. It is %s") % (tier['name'], tier['status']))
    elif op == "modify":
        pass
    else:
        raise wsme.exc.ClientSideError(
            _("Internal Error: Invalid storage tier operation: %s" % op))

    return tier