Example #1
0
    def _remove_daemon(self, name: str, host: str) -> str:
        """
        Remove a daemon from a host.

        :param name: daemon name in '<type>.<id>' form (e.g. 'mon.a')
        :param host: hostname the daemon runs on
        :return: human-readable status message
        """
        # name is '<daemon_type>.<daemon_id>'; split only on the first dot
        # so daemon ids that themselves contain dots stay intact.
        (daemon_type, daemon_id) = name.split('.', 1)
        daemon = orchestrator.DaemonDescription(daemon_type=daemon_type,
                                                daemon_id=daemon_id,
                                                hostname=host)

        # scope any exception raised below to this service for event
        # reporting
        with set_exception_subject('service',
                                   daemon.service_id(),
                                   overwrite=True):

            # give the service type a chance to clean up before removal
            self.mgr.cephadm_services[daemon_type_to_service(
                daemon_type)].pre_remove(daemon)

            args = ['--name', name, '--force']
            self.log.info('Removing daemon %s from %s' % (name, host))
            out, err, code = self._run_cephadm(host, name, 'rm-daemon', args)
            if not code:
                # remove item from cache
                self.mgr.cache.rm_daemon(host, name)
            # force a re-scrape of this host's daemons on the next refresh
            self.mgr.cache.invalidate_host_daemons(host)

            # service-type specific cleanup after the daemon is gone
            self.mgr.cephadm_services[daemon_type_to_service(
                daemon_type)].post_remove(daemon)

            return "Removed {} from host '{}'".format(name, host)
Example #2
0
 def _refresh_host_daemons(self, host: str) -> Optional[str]:
     """Scrape the daemon inventory of one host via 'cephadm ls'.

     Rebuilds and persists the daemon cache for *host*.

     :return: an error message string on failure, None on success.
     """
     try:
         out, err, code = self._run_cephadm(host,
                                            'mon',
                                            'ls', [],
                                            no_fsid=True)
         if code:
             return 'host %s cephadm ls returned %d: %s' % (host, code, err)
         ls = json.loads(''.join(out))
     except ValueError:
         # json.loads failed -- log the raw output to aid debugging
         msg = 'host %s scrape failed: Cannot decode JSON' % host
         self.log.exception('%s: \'%s\'' % (msg, ''.join(out)))
         return msg
     except Exception as e:
         return 'host %s scrape failed: %s' % (host, e)
     # Build a fresh name -> DaemonDescription map from the 'ls' output.
     dm = {}
     for d in ls:
         # only daemons deployed by cephadm, for this cluster, with a
         # well-formed '<type>.<id>' name
         if not d['style'].startswith('cephadm'):
             continue
         if d['fsid'] != self.mgr._cluster_fsid:
             continue
         if '.' not in d['name']:
             continue
         sd = orchestrator.DaemonDescription()
         sd.last_refresh = datetime_now()
         # copy over any timestamps cephadm reported
         for k in [
                 'created', 'started', 'last_configured', 'last_deployed'
         ]:
             v = d.get(k, None)
             if v:
                 setattr(sd, k, str_to_datetime(d[k]))
         sd.daemon_type = d['name'].split('.')[0]
         sd.daemon_id = '.'.join(d['name'].split('.')[1:])
         sd.hostname = host
         sd.container_id = d.get('container_id')
         if sd.container_id:
             # shorten the hash
             sd.container_id = sd.container_id[0:12]
         sd.container_image_name = d.get('container_image_name')
         sd.container_image_id = d.get('container_image_id')
         sd.version = d.get('version')
         if sd.daemon_type == 'osd':
             sd.osdspec_affinity = self.mgr.osd_service.get_osdspec_affinity(
                 sd.daemon_id)
         if 'state' in d:
             # map cephadm's state string onto the numeric status code
             sd.status_desc = d['state']
             sd.status = {
                 'running': 1,
                 'stopped': 0,
                 'error': -1,
                 'unknown': -1,
             }[d['state']]
         else:
             sd.status_desc = 'unknown'
             sd.status = None
         dm[sd.name()] = sd
     self.log.debug('Refreshed host %s daemons (%d)' % (host, len(dm)))
     self.mgr.cache.update_host_daemons(host, dm)
     self.mgr.cache.save_host(host)
     return None
Example #3
0
    def list_daemons(self, daemon_type=None, daemon_id=None, host=None, refresh=False):
        """
        There is no guarantee which daemons are returned by describe_service, except that
        it returns the mgr we're running in.

        :param daemon_type: optional filter; must be one of the supported types
        :param host: optional hostname filter (only applied to pre-seeded daemons)
        :return: list of orchestrator.DaemonDescription
        """
        if daemon_type:
            daemon_types = ("mds", "osd", "mon", "rgw", "mgr", "iscsi")
            assert daemon_type in daemon_types, daemon_type + " unsupported"

        # a pre-seeded daemon list (if set) short-circuits the process scan
        if self._daemons:
            if host:
                return list(filter(lambda svc: svc.hostname == host, self._daemons))
            return self._daemons

        # Decode the ps output properly: mapping str() over the raw bytes
        # lines would yield "b'...'" reprs, and the trailing quote could be
        # swallowed by the 'ceph-[^ ]+' match below and corrupt daemon_id.
        out = check_output(['ps', 'aux']).decode('utf-8', 'replace').splitlines()
        types = (daemon_type, ) if daemon_type else ("mds", "osd", "mon", "rgw", "mgr")
        assert isinstance(types, tuple)
        processes = [p for p in out if any(('ceph-' + t in p) for t in types)]

        result = []
        for p in processes:
            sd = orchestrator.DaemonDescription()
            sd.hostname = 'localhost'
            # the "id" recorded here is the matched process token,
            # e.g. 'ceph-osd'
            res = re.search(r'ceph-[^ ]+', p)
            assert res
            sd.daemon_id = res.group()
            result.append(sd)

        return result
Example #4
0
    def _get_ceph_daemons(self):
        # type: () -> List[orchestrator.DaemonDescription]
        """ Return ceph daemons on the running host.

        Scans `ps aux` for ceph-* processes and extracts the daemon type
        and ID from the command line.

        :raises AssertionError: if a matched process line has no
            recognizable daemon type or ID.
        """
        types = ("mds", "osd", "mon", "rgw", "mgr", "nfs", "iscsi")
        # Decode the ps output properly: mapping str() over the raw bytes
        # lines would yield "b'...'" reprs and corrupt the parsed ids.
        out = check_output(['ps', 'aux']).decode('utf-8', 'replace').splitlines()
        processes = [p for p in out if any(
            ('ceph-{} '.format(t) in p) for t in types)]

        daemons = []
        for p in processes:
            # parse daemon type
            m = re.search(r'ceph-([^ ]+)', p)
            if m:
                _daemon_type = m.group(1)
            else:
                raise AssertionError('Fail to determine daemon type from {}'.format(p))

            # parse daemon ID. Possible options: `-i <id>`, `--id=<id>`, `--id <id>`
            patterns = [r'-i\s(\w+)', r'--id[\s=](\w+)']
            for pattern in patterns:
                m = re.search(pattern, p)
                if m:
                    daemon_id = m.group(1)
                    break
            else:
                raise AssertionError('Fail to determine daemon ID from {}'.format(p))
            daemon = orchestrator.DaemonDescription(
                daemon_type=_daemon_type, daemon_id=daemon_id, hostname='localhost')
            daemons.append(daemon)
        return daemons
Example #5
0
 def _refresh_host_daemons(self, host: str) -> Optional[str]:
     """Scrape the daemon inventory of one host via 'cephadm ls'.

     Rebuilds and persists the daemon cache for *host*.

     :return: an error message string on failure, None on success.
     """
     try:
         ls = self._run_cephadm_json(host, 'mon', 'ls', [], no_fsid=True)
     except OrchestratorError as e:
         return str(e)
     # Build a fresh name -> DaemonDescription map from the 'ls' output.
     dm = {}
     for d in ls:
         # only daemons deployed by cephadm, for this cluster, with a
         # well-formed '<type>.<id>' name
         if not d['style'].startswith('cephadm'):
             continue
         if d['fsid'] != self.mgr._cluster_fsid:
             continue
         if '.' not in d['name']:
             continue
         sd = orchestrator.DaemonDescription()
         sd.last_refresh = datetime_now()
         # copy over any timestamps cephadm reported
         for k in ['created', 'started', 'last_configured', 'last_deployed']:
             v = d.get(k, None)
             if v:
                 setattr(sd, k, str_to_datetime(d[k]))
         sd.daemon_type = d['name'].split('.')[0]
         sd.daemon_id = '.'.join(d['name'].split('.')[1:])
         sd.hostname = host
         sd.container_id = d.get('container_id')
         if sd.container_id:
             # shorten the hash
             sd.container_id = sd.container_id[0:12]
         sd.container_image_name = d.get('container_image_name')
         sd.container_image_id = d.get('container_image_id')
         sd.container_image_digests = d.get('container_image_digests')
         sd.memory_usage = d.get('memory_usage')
         sd.memory_request = d.get('memory_request')
         sd.memory_limit = d.get('memory_limit')
         sd._service_name = d.get('service_name')
         sd.version = d.get('version')
         sd.ports = d.get('ports')
         sd.ip = d.get('ip')
         if sd.daemon_type == 'osd':
             sd.osdspec_affinity = self.mgr.osd_service.get_osdspec_affinity(sd.daemon_id)
         if 'state' in d:
             # map cephadm's state string onto DaemonDescriptionStatus
             sd.status_desc = d['state']
             sd.status = {
                 'running': DaemonDescriptionStatus.running,
                 'stopped': DaemonDescriptionStatus.stopped,
                 'error': DaemonDescriptionStatus.error,
                 'unknown': DaemonDescriptionStatus.error,
             }[d['state']]
         else:
             sd.status_desc = 'unknown'
             sd.status = None
         dm[sd.name()] = sd
     self.log.debug('Refreshed host %s daemons (%d)' % (host, len(dm)))
     self.mgr.cache.update_host_daemons(host, dm)
     self.mgr.cache.save_host(host)
     return None
Example #6
0
    def generate_config(
            self, daemon_spec: CephadmDaemonSpec
    ) -> Tuple[Dict[str, Any], List[str]]:
        """Build the cephadm deploy config for an NFS (ganesha) daemon.

        Creates/updates the daemon's keyring and its rados config object
        as a side effect.

        :return: (cephadm config dict, list of config dependencies)
        :raises OrchestratorError: if zero or more than one service spec
            matches this daemon's service name
        """
        assert self.TYPE == daemon_spec.daemon_type

        daemon_type = daemon_spec.daemon_type
        daemon_id = daemon_spec.daemon_id
        host = daemon_spec.host

        deps = []  # type: List[str]

        # find the matching NFSServiceSpec
        # TODO: find the spec and pass via _create_daemon instead ??
        dd = orchestrator.DaemonDescription()
        dd.daemon_type = daemon_type
        dd.daemon_id = daemon_id
        dd.hostname = host

        service_name = dd.service_name()
        specs = self.mgr.spec_store.find(service_name)

        if not specs:
            raise OrchestratorError('Cannot find service spec %s' %
                                    (service_name))
        elif len(specs) > 1:
            raise OrchestratorError('Found multiple service specs for %s' %
                                    (service_name))
        else:
            # cast to keep mypy happy
            spec = cast(NFSServiceSpec, specs[0])

        nfs = NFSGanesha(self.mgr, daemon_id, spec)

        # create the keyring
        entity = nfs.get_keyring_entity()
        keyring = nfs.get_or_create_keyring(entity=entity)

        # update the caps after get-or-create, the keyring might already exist!
        nfs.update_keyring_caps(entity=entity)

        # create the rados config object
        nfs.create_rados_config_obj()

        # generate the cephadm config
        cephadm_config = nfs.get_cephadm_config()
        cephadm_config.update(
            self.mgr._get_config_and_keyring(daemon_type,
                                             daemon_id,
                                             keyring=keyring,
                                             host=host))

        return cephadm_config, deps
Example #7
0
    def list_daemons(self,
                     daemon_type=None,
                     daemon_id=None,
                     host_name=None,
                     refresh=False):
        """Describe the Rook-managed pods as DaemonDescriptions.

        Pods whose app label does not map to a known daemon type are
        silently skipped.
        """
        # Map a pod phase string onto the numeric status codes.
        phase_to_status = {
            'Pending': -1,
            'Running': 1,
            'Succeeded': 0,
            'Failed': -1,
            'Unknown': -1,
        }
        # Label holding the daemon id, for the straightforward types.
        id_label_by_type = {
            "mon": "mon",
            "mgr": "mgr",
            "nfs": "instance",
            "rgw": "ceph_daemon_id",
        }

        pods = self.rook_cluster.describe_pods(daemon_type, daemon_id,
                                               host_name)

        descriptions = []
        for pod in pods:
            labels = pod['labels']
            desc = orchestrator.DaemonDescription()
            desc.hostname = pod['hostname']
            desc.container_id = pod['name']
            desc.daemon_type = labels['app'].replace('rook-ceph-', '')
            desc.status = phase_to_status[pod['phase']]
            desc.status_desc = pod['phase']

            if desc.daemon_type == "osd":
                desc.daemon_id = "%s" % labels["ceph-osd-id"]
            elif desc.daemon_type == "mds":
                # strip the '<filesystem>-' prefix off the mds daemon id
                pfx = "{0}-".format(labels['rook_file_system'])
                desc.daemon_id = labels['ceph_daemon_id'].replace(pfx, '', 1)
            elif desc.daemon_type in id_label_by_type:
                desc.daemon_id = labels[id_label_by_type[desc.daemon_type]]
            else:
                # Unknown type -- skip it
                continue

            descriptions.append(desc)

        return descriptions
Example #8
0
    def _list_daemons(
            self,
            service_name: Optional[str] = None,
            daemon_type: Optional[str] = None,
            daemon_id: Optional[str] = None,
            host: Optional[str] = None,
            refresh: bool = False) -> List[orchestrator.DaemonDescription]:
        """Describe the Rook-managed pods as DaemonDescriptions.

        Pods without a recognizable daemon-id label are skipped, as are
        pods not matching *service_name* when one is given.
        """
        # Map a pod phase string onto DaemonDescriptionStatus.
        phase_to_status = {
            'Pending': orchestrator.DaemonDescriptionStatus.error,
            'Running': orchestrator.DaemonDescriptionStatus.running,
            'Succeeded': orchestrator.DaemonDescriptionStatus.stopped,
            'Failed': orchestrator.DaemonDescriptionStatus.error,
            'Unknown': orchestrator.DaemonDescriptionStatus.error,
        }

        pods = self.rook_cluster.describe_pods(daemon_type, daemon_id, host)
        self.log.debug('pods %s' % pods)

        descriptions = []
        for pod in pods:
            labels = pod['labels']
            desc = orchestrator.DaemonDescription()
            desc.hostname = pod['hostname']
            desc.daemon_type = labels['app'].replace('rook-ceph-', '')
            desc.status = phase_to_status[pod['phase']]
            desc.status_desc = pod['phase']

            # the daemon id lives in one of two labels, depending on type
            if 'ceph_daemon_id' in labels:
                desc.daemon_id = labels['ceph_daemon_id']
            elif 'ceph-osd-id' in labels:
                desc.daemon_id = labels['ceph-osd-id']
            else:
                # Unknown type -- skip it
                continue

            if service_name is not None and service_name != desc.service_name():
                continue

            desc.container_image_name = pod['container_image_name']
            desc.container_image_id = pod['container_image_id']
            created = pod['created']
            desc.created = created
            desc.last_configured = created
            desc.last_deployed = created
            desc.started = pod['started']
            desc.last_refresh = pod['refreshed']
            descriptions.append(desc)

        return descriptions
Example #9
0
    def _list_daemons(self,
                      daemon_type=None,
                      daemon_id=None,
                      host=None,
                      refresh=False):
        """Describe the Rook-managed pods as DaemonDescriptions.

        Pods without a recognizable daemon-id label are skipped.
        """
        pods = self.rook_cluster.describe_pods(daemon_type, daemon_id, host)
        self.log.debug('pods %s' % pods)
        result = []
        for p in pods:
            sd = orchestrator.DaemonDescription()
            sd.hostname = p['hostname']
            sd.container_id = p['name']
            # the app label is 'rook-ceph-<daemon_type>'
            sd.daemon_type = p['labels']['app'].replace('rook-ceph-', '')
            # map the pod phase string onto the numeric status code
            status = {
                'Pending': -1,
                'Running': 1,
                'Succeeded': 0,
                'Failed': -1,
                'Unknown': -1,
            }[p['phase']]
            sd.status = status
            sd.status_desc = p['phase']

            # the daemon id lives in one of two labels, depending on type
            if 'ceph_daemon_id' in p['labels']:
                sd.daemon_id = p['labels']['ceph_daemon_id']
            elif 'ceph-osd-id' in p['labels']:
                sd.daemon_id = p['labels']['ceph-osd-id']
            else:
                # Unknown type -- skip it
                continue

            sd.container_image_name = p['container_image_name']

            # timestamps reported for the pod
            sd.created = p['created']
            sd.last_configured = p['created']
            sd.last_deployed = p['created']
            sd.started = p['started']
            sd.last_refresh = p['refreshed']
            result.append(sd)

        return result
Example #10
0
    def _apply_service(self, spec: ServiceSpec) -> bool:
        """
        Schedule a service.  Deploy new daemons or remove old ones, depending
        on the target label and count specified in the placement.

        :return: True if any daemon was added or removed, False otherwise.
        """
        self.mgr.migration.verify_no_migration()

        daemon_type = spec.service_type
        service_name = spec.service_name()
        if spec.unmanaged:
            self.log.debug('Skipping unmanaged service %s' % service_name)
            return False
        if spec.preview_only:
            self.log.debug('Skipping preview_only service %s' % service_name)
            return False
        self.log.debug('Applying service %s spec' % service_name)

        config_func = self._config_fn(daemon_type)

        # osds have their own creation path driven by the drive group spec
        if daemon_type == 'osd':
            self.mgr.osd_service.create_from_spec(cast(DriveGroupSpec, spec))
            # TODO: return True would result in a busy loop
            # can't know if daemon count changed; create_from_spec doesn't
            # return a solid indication
            return False

        daemons = self.mgr.cache.get_daemons_by_service(service_name)

        # mons are constrained to hosts with an IP in the configured
        # public_network (when one is set); look it up once here
        public_network = None
        if daemon_type == 'mon':
            ret, out, err = self.mgr.check_mon_command({
                'prefix': 'config get',
                'who': 'mon',
                'key': 'public_network',
            })
            if '/' in out:
                public_network = out.strip()
                self.log.debug('mon public_network is %s' % public_network)

        def matches_network(host):
            # type: (str) -> bool
            if not public_network:
                return False
            # make sure we have 1 or more IPs for that network on that
            # host
            return len(self.mgr.cache.networks[host].get(public_network,
                                                         [])) > 0

        ha = HostAssignment(
            spec=spec,
            hosts=self.mgr._hosts_with_daemon_inventory(),
            get_daemons_func=self.mgr.cache.get_daemons_by_service,
            filter_new_host=matches_network if daemon_type == 'mon' else None,
        )

        hosts: List[HostPlacementSpec] = ha.place()
        self.log.debug('Usable hosts: %s' % hosts)

        # r is tri-state: None = nothing attempted yet, True = at least one
        # change succeeded, False = change(s) attempted but none succeeded
        r = None

        # sanity check
        if daemon_type in ['mon', 'mgr'] and len(hosts) < 1:
            self.log.debug('cannot scale mon|mgr below 1 (hosts=%s)' % hosts)
            return False

        # add any?
        did_config = False

        add_daemon_hosts: Set[HostPlacementSpec] = ha.add_daemon_hosts(hosts)
        self.log.debug('Hosts that will receive new daemons: %s' %
                       add_daemon_hosts)

        remove_daemon_hosts: Set[
            orchestrator.DaemonDescription] = ha.remove_daemon_hosts(hosts)
        self.log.debug('Hosts that will loose daemons: %s' %
                       remove_daemon_hosts)

        for host, network, name in add_daemon_hosts:
            daemon_id = self.mgr.get_unique_name(daemon_type,
                                                 host,
                                                 daemons,
                                                 prefix=spec.service_id,
                                                 forcename=name)

            # run the per-service config hook once, before the first
            # daemon of this service is created
            if not did_config and config_func:
                if daemon_type == 'rgw':
                    rgw_config_func = cast(Callable[[RGWSpec, str], None],
                                           config_func)
                    rgw_config_func(cast(RGWSpec, spec), daemon_id)
                else:
                    config_func(spec)
                did_config = True

            daemon_spec = self.mgr.cephadm_services[
                daemon_type].make_daemon_spec(host, daemon_id, network, spec)
            self.log.debug('Placing %s.%s on host %s' %
                           (daemon_type, daemon_id, host))

            try:
                daemon_spec = self.mgr.cephadm_services[
                    daemon_type].prepare_create(daemon_spec)
                self.mgr._create_daemon(daemon_spec)
                r = True
            except (RuntimeError, OrchestratorError) as e:
                self.mgr.events.for_service(
                    spec, 'ERROR',
                    f"Failed while placing {daemon_type}.{daemon_id}"
                    f"on {host}: {e}")
                # only return "no change" if no one else has already succeeded.
                # later successes will also change to True
                if r is None:
                    r = False
                continue

            # add to daemon list so next name(s) will also be unique
            sd = orchestrator.DaemonDescription(
                hostname=host,
                daemon_type=daemon_type,
                daemon_id=daemon_id,
            )
            daemons.append(sd)

        # remove any?
        def _ok_to_stop(
                remove_daemon_hosts: Set[orchestrator.DaemonDescription]
        ) -> bool:
            # check with the service whether the whole candidate set can
            # be stopped at once
            daemon_ids = [d.daemon_id for d in remove_daemon_hosts]
            r = self.mgr.cephadm_services[daemon_type].ok_to_stop(daemon_ids)
            return not r.retval

        # shrink the removal set until the remainder is safe to stop
        while remove_daemon_hosts and not _ok_to_stop(remove_daemon_hosts):
            # let's find a subset that is ok-to-stop
            remove_daemon_hosts.pop()
        for d in remove_daemon_hosts:
            r = True
            # NOTE: we are passing the 'force' flag here, which means
            # we can delete a mon instances data.
            self.mgr._remove_daemon(d.name(), d.hostname)

        if r is None:
            r = False
        return r
Example #11
0
    def _create_daemon(
        self,
        daemon_spec: CephadmDaemonSpec,
        reconfig: bool = False,
        osd_uuid_map: Optional[Dict[str, Any]] = None,
    ) -> str:
        """Deploy (or reconfigure) one daemon on its host via cephadm.

        :param daemon_spec: everything needed to place the daemon
        :param reconfig: pass --reconfig to cephadm instead of doing a
            fresh deployment
        :param osd_uuid_map: optional osd id -> uuid map; fetched from the
            mgr when needed for an osd deployment
        :return: human-readable status message
        :raises OrchestratorError: if an osd being deployed is not in the
            osdmap
        """
        # scope any exception raised below to this service for event
        # reporting
        with set_exception_subject('service',
                                   orchestrator.DaemonDescription(
                                       daemon_type=daemon_spec.daemon_type,
                                       daemon_id=daemon_spec.daemon_id,
                                       hostname=daemon_spec.host,
                                   ).service_id(),
                                   overwrite=True):

            image = ''
            start_time = datetime_now()
            ports: List[int] = daemon_spec.ports if daemon_spec.ports else []

            if daemon_spec.daemon_type == 'container':
                spec: Optional[CustomContainerSpec] = daemon_spec.spec
                if spec is None:
                    # Exit here immediately because the required service
                    # spec to create a daemon is not provided. This is only
                    # provided when a service is applied via 'orch apply'
                    # command.
                    msg = "Failed to {} daemon {} on {}: Required " \
                          "service specification not provided".format(
                              'reconfigure' if reconfig else 'deploy',
                              daemon_spec.name(), daemon_spec.host)
                    self.log.info(msg)
                    return msg
                image = spec.image
                if spec.ports:
                    ports.extend(spec.ports)

            if daemon_spec.daemon_type == 'cephadm-exporter':
                if not reconfig:
                    # the exporter needs the cephadm binary on the host
                    assert daemon_spec.host
                    deploy_ok = self._deploy_cephadm_binary(daemon_spec.host)
                    if not deploy_ok:
                        msg = f"Unable to deploy the cephadm binary to {daemon_spec.host}"
                        self.log.warning(msg)
                        return msg

            # ha-rgw daemon types may override the container image
            if daemon_spec.daemon_type == 'haproxy':
                haspec = cast(HA_RGWSpec, daemon_spec.spec)
                if haspec.haproxy_container_image:
                    image = haspec.haproxy_container_image

            if daemon_spec.daemon_type == 'keepalived':
                haspec = cast(HA_RGWSpec, daemon_spec.spec)
                if haspec.keepalived_container_image:
                    image = haspec.keepalived_container_image

            cephadm_config, deps = self.mgr.cephadm_services[
                daemon_type_to_service(
                    daemon_spec.daemon_type)].generate_config(daemon_spec)

            # TCP port to open in the host firewall
            if len(ports) > 0:
                daemon_spec.extra_args.extend(
                    ['--tcp-ports', ' '.join(map(str, ports))])

            # osd deployments needs an --osd-uuid arg
            if daemon_spec.daemon_type == 'osd':
                if not osd_uuid_map:
                    osd_uuid_map = self.mgr.get_osd_uuid_map()
                osd_uuid = osd_uuid_map.get(daemon_spec.daemon_id)
                if not osd_uuid:
                    raise OrchestratorError('osd.%s not in osdmap' %
                                            daemon_spec.daemon_id)
                daemon_spec.extra_args.extend(['--osd-fsid', osd_uuid])

            if reconfig:
                daemon_spec.extra_args.append('--reconfig')
            if self.mgr.allow_ptrace:
                daemon_spec.extra_args.append('--allow-ptrace')

            # log in to the container registry first if this host needs it
            if self.mgr.cache.host_needs_registry_login(
                    daemon_spec.host) and self.mgr.registry_url:
                self._registry_login(daemon_spec.host, self.mgr.registry_url,
                                     self.mgr.registry_username,
                                     self.mgr.registry_password)

            # the generated config is passed to cephadm on stdin as JSON
            daemon_spec.extra_args.extend(['--config-json', '-'])

            self.log.info('%s daemon %s on %s' %
                          ('Reconfiguring' if reconfig else 'Deploying',
                           daemon_spec.name(), daemon_spec.host))

            out, err, code = self._run_cephadm(
                daemon_spec.host,
                daemon_spec.name(),
                'deploy', [
                    '--name',
                    daemon_spec.name(),
                ] + daemon_spec.extra_args,
                stdin=json.dumps(cephadm_config),
                image=image)
            if not code and daemon_spec.host in self.mgr.cache.daemons:
                # prime cached service state with what we (should have)
                # just created
                sd = orchestrator.DaemonDescription()
                sd.daemon_type = daemon_spec.daemon_type
                sd.daemon_id = daemon_spec.daemon_id
                sd.hostname = daemon_spec.host
                sd.status = 1
                sd.status_desc = 'starting'
                self.mgr.cache.add_daemon(daemon_spec.host, sd)
                if daemon_spec.daemon_type in [
                        'grafana', 'iscsi', 'prometheus', 'alertmanager'
                ]:
                    self.mgr.requires_post_actions.add(daemon_spec.daemon_type)
            # force a re-scrape on the next refresh and persist the cache
            self.mgr.cache.invalidate_host_daemons(daemon_spec.host)
            self.mgr.cache.update_daemon_config_deps(daemon_spec.host,
                                                     daemon_spec.name(), deps,
                                                     start_time)
            self.mgr.cache.save_host(daemon_spec.host)
            msg = "{} {} on host '{}'".format(
                'Reconfigured' if reconfig else 'Deployed', daemon_spec.name(),
                daemon_spec.host)
            if not code:
                self.mgr.events.for_daemon(daemon_spec.name(),
                                           OrchestratorEvent.INFO, msg)
            else:
                what = 'reconfigure' if reconfig else 'deploy'
                self.mgr.events.for_daemon(daemon_spec.name(),
                                           OrchestratorEvent.ERROR,
                                           f'Failed to {what}: {err}')
            return msg
Example #12
0
    def _apply_service(self, spec: ServiceSpec) -> bool:
        """
        Schedule a service.  Deploy new daemons or remove old ones, depending
        on the target label and count specified in the placement.
        """
        self.mgr.migration.verify_no_migration()

        service_type = spec.service_type
        service_name = spec.service_name()
        if spec.unmanaged:
            self.log.debug('Skipping unmanaged service %s' % service_name)
            return False
        if spec.preview_only:
            self.log.debug('Skipping preview_only service %s' % service_name)
            return False
        self.log.debug('Applying service %s spec' % service_name)

        if service_type == 'osd':
            self.mgr.osd_service.create_from_spec(cast(DriveGroupSpec, spec))
            # TODO: return True would result in a busy loop
            # can't know if daemon count changed; create_from_spec doesn't
            # return a solid indication
            return False

        daemons = self.mgr.cache.get_daemons_by_service(service_name)

        public_network = None
        if service_type == 'mon':
            out = str(self.mgr.get_foreign_ceph_option('mon',
                                                       'public_network'))
            if '/' in out:
                public_network = out.strip()
                self.log.debug('mon public_network is %s' % public_network)

        def matches_network(host):
            # type: (str) -> bool
            if not public_network:
                return False
            # make sure we have 1 or more IPs for that network on that
            # host
            return len(self.mgr.cache.networks[host].get(public_network,
                                                         [])) > 0

        def virtual_ip_allowed(host):
            # type: (str) -> bool
            # Verify that it is possible to use Virtual IPs in the host
            try:
                if self.mgr.cache.facts[host]['kernel_parameters'][
                        'net.ipv4.ip_nonlocal_bind'] == '0':
                    return False
            except KeyError:
                return False

            return True

        ha = HostAssignment(
            spec=spec,
            hosts=self.mgr._hosts_with_daemon_inventory(),
            get_daemons_func=self.mgr.cache.get_daemons_by_service,
            filter_new_host=matches_network if service_type == 'mon' else
            virtual_ip_allowed if service_type == 'ha-rgw' else None,
        )

        try:
            hosts: List[HostPlacementSpec] = ha.place()
            self.log.debug('Usable hosts: %s' % hosts)
        except OrchestratorError as e:
            self.log.error('Failed to apply %s spec %s: %s' %
                           (spec.service_name(), spec, e))
            self.mgr.events.for_service(spec, 'ERROR',
                                        'Failed to apply: ' + str(e))
            return False

        r = None

        # sanity check
        if service_type in ['mon', 'mgr'] and len(hosts) < 1:
            self.log.debug('cannot scale mon|mgr below 1 (hosts=%s)' % hosts)
            return False

        # add any?
        did_config = False

        add_daemon_hosts: Set[HostPlacementSpec] = ha.add_daemon_hosts(hosts)
        self.log.debug('Hosts that will receive new daemons: %s' %
                       add_daemon_hosts)

        remove_daemon_hosts: Set[
            orchestrator.DaemonDescription] = ha.remove_daemon_hosts(hosts)
        self.log.debug('Hosts that will loose daemons: %s' %
                       remove_daemon_hosts)

        if service_type == 'ha-rgw':
            spec = self.update_ha_rgw_definitive_hosts(spec, hosts,
                                                       add_daemon_hosts)

        for host, network, name in add_daemon_hosts:
            for daemon_type in service_to_daemon_types(service_type):
                daemon_id = self.mgr.get_unique_name(daemon_type,
                                                     host,
                                                     daemons,
                                                     prefix=spec.service_id,
                                                     forcename=name)

                if not did_config:
                    self.mgr.cephadm_services[service_type].config(
                        spec, daemon_id)
                    did_config = True

                daemon_spec = self.mgr.cephadm_services[
                    service_type].make_daemon_spec(host,
                                                   daemon_id,
                                                   network,
                                                   spec,
                                                   daemon_type=daemon_type)
                self.log.debug('Placing %s.%s on host %s' %
                               (daemon_type, daemon_id, host))

                try:
                    daemon_spec = self.mgr.cephadm_services[
                        service_type].prepare_create(daemon_spec)
                    self._create_daemon(daemon_spec)
                    r = True
                except (RuntimeError, OrchestratorError) as e:
                    self.mgr.events.for_service(
                        spec, 'ERROR',
                        f"Failed while placing {daemon_type}.{daemon_id}"
                        f"on {host}: {e}")
                    # only return "no change" if no one else has already succeeded.
                    # later successes will also change to True
                    if r is None:
                        r = False
                    continue

                # add to daemon list so next name(s) will also be unique
                sd = orchestrator.DaemonDescription(
                    hostname=host,
                    daemon_type=daemon_type,
                    daemon_id=daemon_id,
                )
                daemons.append(sd)

        # remove any?
        def _ok_to_stop(
                remove_daemon_hosts: Set[orchestrator.DaemonDescription]
        ) -> bool:
            """Return True when every daemon in the given set is safe to stop."""
            ids = [dd.daemon_id for dd in remove_daemon_hosts]
            assert None not in ids
            svc = self.mgr.cephadm_services[service_type]
            # setting force flag retains previous behavior, should revisit later.
            result = svc.ok_to_stop(cast(List[str], ids), force=True)
            return not result.retval

        while remove_daemon_hosts and not _ok_to_stop(remove_daemon_hosts):
            # let's find a subset that is ok-to-stop
            remove_daemon_hosts.pop()
        for d in remove_daemon_hosts:
            r = True
            # NOTE: we are passing the 'force' flag here, which means
            # we can delete a mon instance's data.
            assert d.hostname is not None
            self._remove_daemon(d.name(), d.hostname)

        if r is None:
            r = False
        return r
Exemple #13
0
    def _create_daemon(self,
                       daemon_spec: CephadmDaemonDeploySpec,
                       reconfig: bool = False,
                       osd_uuid_map: Optional[Dict[str, Any]] = None,
                       ) -> str:
        """
        Deploy (or reconfigure) a single daemon on its target host.

        Invokes ``cephadm deploy`` on ``daemon_spec.host``, primes the daemon
        cache with the expected resulting state, and records an orchestrator
        event describing the outcome.

        :param daemon_spec: fully prepared spec describing the daemon to
            deploy (host, name, config, extra args, ...).
        :param reconfig: if True, pass ``--reconfig`` so the daemon's config
            is regenerated in place instead of redeploying it.
        :param osd_uuid_map: optional pre-fetched osd-id -> uuid map; fetched
            from the mgr on demand when deploying an osd and not provided.
        :return: human-readable status message.
        :raises OrchestratorError: on deployment failure.  For fresh deploys
            (not reconfig) the owning service's ``post_remove`` hook is run
            first to clean up partial state (e.g. keyrings).
        """

        with set_exception_subject('service', orchestrator.DaemonDescription(
                daemon_type=daemon_spec.daemon_type,
                daemon_id=daemon_spec.daemon_id,
                hostname=daemon_spec.host,
        ).service_id(), overwrite=True):

            try:
                image = ''
                start_time = datetime_now()
                ports: List[int] = daemon_spec.ports if daemon_spec.ports else []

                if daemon_spec.daemon_type == 'container':
                    # custom containers select their own image and may add ports
                    spec = cast(CustomContainerSpec,
                                self.mgr.spec_store[daemon_spec.service_name].spec)
                    image = spec.image
                    if spec.ports:
                        ports.extend(spec.ports)

                if daemon_spec.daemon_type == 'cephadm-exporter':
                    if not reconfig:
                        assert daemon_spec.host
                        self._deploy_cephadm_binary(daemon_spec.host)

                # ha-rgw daemons may override the default container image
                if daemon_spec.daemon_type == 'haproxy':
                    haspec = cast(HA_RGWSpec, self.mgr.spec_store[daemon_spec.service_name].spec)
                    if haspec.haproxy_container_image:
                        image = haspec.haproxy_container_image

                if daemon_spec.daemon_type == 'keepalived':
                    haspec = cast(HA_RGWSpec, self.mgr.spec_store[daemon_spec.service_name].spec)
                    if haspec.keepalived_container_image:
                        image = haspec.keepalived_container_image

                # TCP port(s) to open in the host firewall
                if ports:
                    daemon_spec.extra_args.extend([
                        '--tcp-ports', ' '.join(map(str, ports))
                    ])

                # osd deployments need an --osd-fsid arg
                if daemon_spec.daemon_type == 'osd':
                    if not osd_uuid_map:
                        osd_uuid_map = self.mgr.get_osd_uuid_map()
                    osd_uuid = osd_uuid_map.get(daemon_spec.daemon_id)
                    if not osd_uuid:
                        raise OrchestratorError('osd.%s not in osdmap' % daemon_spec.daemon_id)
                    daemon_spec.extra_args.extend(['--osd-fsid', osd_uuid])

                if reconfig:
                    daemon_spec.extra_args.append('--reconfig')
                if self.mgr.allow_ptrace:
                    daemon_spec.extra_args.append('--allow-ptrace')

                if self.mgr.cache.host_needs_registry_login(daemon_spec.host) and self.mgr.registry_url:
                    self._registry_login(daemon_spec.host, self.mgr.registry_url,
                                         self.mgr.registry_username, self.mgr.registry_password)

                self.log.info('%s daemon %s on %s' % (
                    'Reconfiguring' if reconfig else 'Deploying',
                    daemon_spec.name(), daemon_spec.host))

                out, err, code = self._run_cephadm(
                    daemon_spec.host, daemon_spec.name(), 'deploy',
                    [
                        '--name', daemon_spec.name(),
                        '--meta-json', json.dumps({
                            'service_name': daemon_spec.service_name,
                            'ports': daemon_spec.ports,
                            'ip': daemon_spec.ip,
                        }),
                        '--config-json', '-',
                    ] + daemon_spec.extra_args,
                    stdin=json.dumps(daemon_spec.final_config),
                    image=image)

                # refresh daemon state?  (ceph daemon reconfig does not need it)
                if not reconfig or daemon_spec.daemon_type not in CEPH_TYPES:
                    if not code and daemon_spec.host in self.mgr.cache.daemons:
                        # prime cached service state with what we (should have)
                        # just created
                        sd = daemon_spec.to_daemon_description(
                            DaemonDescriptionStatus.running, 'starting')
                        self.mgr.cache.add_daemon(daemon_spec.host, sd)
                        if daemon_spec.daemon_type in [
                            'grafana', 'iscsi', 'prometheus', 'alertmanager'
                        ]:
                            # these daemon types need a follow-up (e.g. dashboard
                            # config) after deployment
                            self.mgr.requires_post_actions.add(daemon_spec.daemon_type)
                    self.mgr.cache.invalidate_host_daemons(daemon_spec.host)

                self.mgr.cache.update_daemon_config_deps(
                    daemon_spec.host, daemon_spec.name(), daemon_spec.deps, start_time)
                self.mgr.cache.save_host(daemon_spec.host)
                msg = "{} {} on host '{}'".format(
                    'Reconfigured' if reconfig else 'Deployed', daemon_spec.name(), daemon_spec.host)
                if not code:
                    self.mgr.events.for_daemon(daemon_spec.name(), OrchestratorEvent.INFO, msg)
                else:
                    what = 'reconfigure' if reconfig else 'deploy'
                    self.mgr.events.for_daemon(
                        daemon_spec.name(), OrchestratorEvent.ERROR, f'Failed to {what}: {err}')
                return msg
            except OrchestratorError:
                if not reconfig:
                    # we have to clean up the daemon. E.g. keyrings.
                    service_type = daemon_type_to_service(daemon_spec.daemon_type)
                    dd = daemon_spec.to_daemon_description(DaemonDescriptionStatus.error, 'failed')
                    self.mgr.cephadm_services[service_type].post_remove(dd)
                raise