def update_keyring_caps(self, entity: Optional[str] = None) -> None:
    if not entity:
        entity = self.get_keyring_entity()

    osd_caps = 'allow rw pool=%s' % (self.spec.pool)
    if self.spec.namespace:
        osd_caps = '%s namespace=%s' % (osd_caps, self.spec.namespace)

    logger.info('Updating keyring caps: %s' % entity)
    ret, out, err = self.mgr.mon_command({
        'prefix': 'auth caps',
        'entity': entity,
        'caps': ['mon', 'allow r', 'osd', osd_caps],
    })
    if ret != 0:
        raise OrchestratorError(
            'Unable to update keyring caps %s: %s %s'
            % (entity, ret, err))
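# A minimal standalone sketch (not part of the module) of how the OSD cap
# string above is composed; the pool and namespace values are illustrative.
def _osd_caps(pool: str, namespace: str = '') -> str:
    caps = 'allow rw pool=%s' % pool
    if namespace:
        caps = '%s namespace=%s' % (caps, namespace)
    return caps

assert _osd_caps('nfs-ganesha') == 'allow rw pool=nfs-ganesha'
assert _osd_caps('nfs-ganesha', 'ns1') == 'allow rw pool=nfs-ganesha namespace=ns1'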
def generate_config(
        self, daemon_spec: CephadmDaemonSpec
) -> Tuple[Dict[str, Any], List[str]]:
    assert self.TYPE == daemon_spec.daemon_type
    assert daemon_spec.spec
    deps: List[str] = []

    cfg = CephadmExporterConfig(self.mgr)
    cfg.load_from_store()

    if cfg.ready:
        rc, reason = cfg.validate_config()
        if rc:
            raise OrchestratorError(reason)
    else:
        logger.info("Using default configuration for cephadm-exporter")
        self.mgr._set_exporter_defaults()
        cfg.load_from_store()

    config = {"crt": cfg.crt, "key": cfg.key, "token": cfg.token}
    return config, deps
def _get_container_image_info(self, image_name: str) -> ContainerInspectInfo:
    # pick a random host...
    host = None
    for host_name in self.mgr.inventory.keys():
        host = host_name
        break
    if not host:
        raise OrchestratorError('no hosts defined')
    if self.mgr.cache.host_needs_registry_login(host) and self.mgr.registry_url:
        self._registry_login(host, self.mgr.registry_url,
                             self.mgr.registry_username,
                             self.mgr.registry_password)

    j = self._run_cephadm_json(host, '', 'pull', [], image=image_name, no_fsid=True)

    r = ContainerInspectInfo(
        j['image_id'],
        j.get('ceph_version'),
        j.get('repo_digests')
    )
    self.log.debug(f'image {image_name} -> {r}')
    return r
async def _execute_command(self,
                           host: str,
                           cmd: List[str],
                           stdin: Optional[str] = None,
                           addr: Optional[str] = None,
                           ) -> Tuple[str, str, int]:
    conn = await self._remote_connection(host, addr)
    cmd = "sudo " + " ".join(quote(x) for x in cmd)
    logger.debug(f'Running command: {cmd}')
    try:
        r = await conn.run(cmd, input=stdin)
    # handle these Exceptions otherwise you might get a weird error like
    # TypeError: __init__() missing 1 required positional argument: 'reason'
    # (due to the asyncssh error interacting with raise_if_exception)
    except (asyncssh.ChannelOpenError, Exception) as e:
        # SSH connection closed or broken, will create new connection next call
        logger.debug(f'Connection to {host} failed. {str(e)}')
        await self._reset_con(host)
        self.mgr.offline_hosts.add(host)
        raise OrchestratorError(f'Unable to reach remote host {host}. {str(e)}')
    out = r.stdout.rstrip('\n')
    err = r.stderr.rstrip('\n')
    return out, err, r.returncode
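# Standalone illustration of the shlex quoting used above when the argv
# list is flattened into a single remote command string (argv values are
# illustrative):
from shlex import quote

argv = ['cephadm', 'ls', '--format', 'json pretty']
print('sudo ' + ' '.join(quote(x) for x in argv))
# -> sudo cephadm ls --format 'json pretty'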
def prepare_create(
        self, daemon_spec: CephadmDaemonDeploySpec) -> CephadmDaemonDeploySpec:
    assert self.TYPE == daemon_spec.daemon_type

    cfg = CephadmExporterConfig(self.mgr)
    cfg.load_from_store()

    if cfg.ready:
        rc, reason = cfg.validate_config()
        if rc:
            raise OrchestratorError(reason)
    else:
        logger.info("Incomplete/Missing configuration, applying defaults")
        self.mgr._set_exporter_defaults()
        cfg.load_from_store()

    if not daemon_spec.ports:
        daemon_spec.ports = [int(cfg.port)]

    return daemon_spec
async def _execute_command(self,
                           host: str,
                           cmd: List[str],
                           stdin: Optional[str] = None,
                           addr: Optional[str] = None,
                           ) -> Tuple[str, str, int]:
    conn = await self._remote_connection(host, addr)
    sudo_prefix = "sudo " if self.mgr.ssh_user != 'root' else ""
    cmd = sudo_prefix + " ".join(quote(x) for x in cmd)
    logger.debug(f'Running command: {cmd}')
    try:
        r = await conn.run('sudo true', check=True, timeout=5)
        r = await conn.run(cmd, input=stdin)
    # handle these Exceptions otherwise you might get a weird error like
    # TypeError: __init__() missing 1 required positional argument: 'reason'
    # (due to the asyncssh error interacting with raise_if_exception)
    except (asyncssh.ChannelOpenError, asyncssh.ProcessError, Exception) as e:
        # SSH connection closed or broken, will create new connection next call
        logger.debug(f'Connection to {host} failed. {str(e)}')
        await self._reset_con(host)
        self.mgr.offline_hosts.add(host)
        raise OrchestratorError(
            f'Unable to reach remote host {host}. {str(e)}')

    def _rstrip(v: Union[bytes, str, None]) -> str:
        if not v:
            return ''
        if isinstance(v, str):
            return v.rstrip('\n')
        if isinstance(v, bytes):
            return v.decode().rstrip('\n')
        raise OrchestratorError(
            f'Unable to parse ssh output with type {type(v)} from remote host {host}')

    out = _rstrip(r.stdout)
    err = _rstrip(r.stderr)
    rc = r.returncode if r.returncode else 0

    return out, err, rc
def config(self, spec: RGWSpec, rgw_id: str) -> None:  # type: ignore
    assert self.TYPE == spec.service_type

    # set rgw_realm and rgw_zone, if present
    if spec.rgw_realm:
        ret, out, err = self.mgr.check_mon_command({
            'prefix': 'config set',
            'who': f"{utils.name_to_config_section('rgw')}.{spec.service_id}",
            'name': 'rgw_realm',
            'value': spec.rgw_realm,
        })
    if spec.rgw_zone:
        ret, out, err = self.mgr.check_mon_command({
            'prefix': 'config set',
            'who': f"{utils.name_to_config_section('rgw')}.{spec.service_id}",
            'name': 'rgw_zone',
            'value': spec.rgw_zone,
        })

    if spec.rgw_frontend_ssl_certificate:
        if isinstance(spec.rgw_frontend_ssl_certificate, list):
            cert_data = '\n'.join(spec.rgw_frontend_ssl_certificate)
        elif isinstance(spec.rgw_frontend_ssl_certificate, str):
            cert_data = spec.rgw_frontend_ssl_certificate
        else:
            raise OrchestratorError(
                'Invalid rgw_frontend_ssl_certificate: %s'
                % spec.rgw_frontend_ssl_certificate)
        ret, out, err = self.mgr.check_mon_command({
            'prefix': 'config-key set',
            'key': f'rgw/cert/{spec.service_name()}',
            'val': cert_data,
        })

    # TODO: fail, if we don't have a spec
    logger.info('Saving service %s spec with placement %s' % (
        spec.service_name(), spec.placement.pretty_str()))
    self.mgr.spec_store.save(spec)
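# Minimal sketch of the certificate normalization above: a list of PEM
# lines is joined with newlines, a plain string is used as-is (the PEM
# content here is a placeholder):
cert = ['-----BEGIN CERTIFICATE-----', 'MIIB...', '-----END CERTIFICATE-----']
cert_data = '\n'.join(cert) if isinstance(cert, list) else cert
print(cert_data)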
def generate_config(
        self, daemon_spec: CephadmDaemonDeploySpec
) -> Tuple[Dict[str, Any], List[str]]:
    try:
        assert self.mgr.cherrypy_thread
        assert self.mgr.cherrypy_thread.ssl_certs.get_root_cert()
        assert self.mgr.cherrypy_thread.server_port
    except Exception:
        raise OrchestratorError(
            'Cannot deploy agent daemons until cephadm endpoint has finished generating certs')

    cfg = {'target_ip': self.mgr.get_mgr_ip(),
           'target_port': self.mgr.cherrypy_thread.server_port,
           'refresh_period': self.mgr.agent_refresh_rate,
           'listener_port': self.mgr.agent_starting_port,
           'host': daemon_spec.host,
           'device_enhanced_scan': str(self.mgr.device_enhanced_scan)}

    listener_cert, listener_key = self.mgr.cherrypy_thread.ssl_certs.generate_cert(
        self.mgr.inventory.get_addr(daemon_spec.host))
    config = {
        'agent.json': json.dumps(cfg),
        'keyring': daemon_spec.keyring,
        'root_cert.pem': self.mgr.cherrypy_thread.ssl_certs.get_root_cert(),
        'listener.crt': listener_cert,
        'listener.key': listener_key,
    }

    return config, sorted([str(self.mgr.get_mgr_ip()),
                           str(self.mgr.cherrypy_thread.server_port),
                           self.mgr.cherrypy_thread.ssl_certs.get_root_cert(),
                           str(self.mgr.get_module_option('device_enhanced_scan'))])
async def _write_remote_file(self,
                             host: str,
                             path: str,
                             content: bytes,
                             mode: Optional[int] = None,
                             uid: Optional[int] = None,
                             gid: Optional[int] = None,
                             addr: Optional[str] = None,
                             ) -> None:
    try:
        dirname = os.path.dirname(path)
        await self._check_execute_command(host, ['mkdir', '-p', dirname], addr=addr)
        tmp_path = path + '.new'
        await self._check_execute_command(host, ['touch', tmp_path], addr=addr)
        if uid is not None and gid is not None and mode is not None:
            # shlex quote takes str or byte object, not int
            await self._check_execute_command(
                host, ['chown', '-R', str(uid) + ':' + str(gid), tmp_path], addr=addr)
            await self._check_execute_command(
                host, ['chmod', oct(mode)[2:], tmp_path], addr=addr)
        with NamedTemporaryFile(prefix='cephadm-write-remote-file-') as f:
            os.fchmod(f.fileno(), 0o600)
            f.write(content)
            f.flush()
            conn = await self._remote_connection(host, addr)
            await asyncssh.scp(f.name, (conn, tmp_path))
        await self._check_execute_command(host, ['mv', tmp_path, path], addr=addr)
    except Exception as e:
        msg = f"Unable to write {host}:{path}: {e}"
        logger.exception(msg)
        raise OrchestratorError(msg)
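# Local sketch of the staging pattern above: write the payload to a
# mode-0600 temporary file, then rename it into place so readers never see
# a partially written file. The destination path is illustrative, and
# os.replace assumes source and destination are on the same filesystem.
import os
from tempfile import NamedTemporaryFile

with NamedTemporaryFile(prefix='cephadm-write-remote-file-', delete=False) as f:
    os.fchmod(f.fileno(), 0o600)
    f.write(b'example payload')
    f.flush()
    tmp_path = f.name
os.replace(tmp_path, '/tmp/example.conf')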
def find_destroyed_osds(self) -> Dict[str, List[str]]:
    osd_host_map: Dict[str, List[str]] = dict()
    try:
        ret, out, err = self.mgr.check_mon_command({
            'prefix': 'osd tree',
            'states': ['destroyed'],
            'format': 'json'
        })
    except MonCommandFailed as e:
        logger.exception('osd tree failed')
        raise OrchestratorError(str(e))
    try:
        tree = json.loads(out)
    except json.decoder.JSONDecodeError:
        logger.exception(f"Could not decode json -> {out}")
        return osd_host_map

    nodes = tree.get('nodes', {})
    for node in nodes:
        if node.get('type') == 'host':
            osd_host_map.update(
                {node.get('name'): [str(_id) for _id in node.get('children', list())]}
            )
    return osd_host_map
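# Standalone sketch of the host -> destroyed-OSD-id mapping built above,
# fed with a hand-written (illustrative) 'osd tree' payload:
import json

out = '{"nodes": [{"type": "host", "name": "node1", "children": [0, 2]}]}'
tree = json.loads(out)
osd_host_map = {
    n['name']: [str(i) for i in n.get('children', [])]
    for n in tree.get('nodes', [])
    if n.get('type') == 'host'
}
print(osd_host_map)  # {'node1': ['0', '2']}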
def _run_cephadm(self,
                 host: str,
                 entity: Union[CephadmNoImage, str],
                 command: str,
                 args: List[str],
                 addr: Optional[str] = "",
                 stdin: Optional[str] = "",
                 no_fsid: Optional[bool] = False,
                 error_ok: Optional[bool] = False,
                 image: Optional[str] = "",
                 env_vars: Optional[List[str]] = None,
                 ) -> Tuple[List[str], List[str], int]:
    """
    Run cephadm on the remote host with the given command + args

    Important: You probably don't want to run _run_cephadm from CLI handlers

    :env_vars: in format -> [KEY=VALUE, ..]
    """
    self.log.debug(f"_run_cephadm : command = {command}")
    self.log.debug(f"_run_cephadm : args = {args}")

    bypass_image = ('cephadm-exporter',)

    with self._remote_connection(host, addr) as tpl:
        conn, connr = tpl
        assert image or entity
        # Skip the image check for daemons deployed that are not ceph containers
        if not str(entity).startswith(bypass_image):
            if not image and entity is not cephadmNoImage:
                image = self.mgr._get_container_image(entity)

        final_args = []

        if env_vars:
            for env_var_pair in env_vars:
                final_args.extend(['--env', env_var_pair])

        if image:
            final_args.extend(['--image', image])
        final_args.append(command)

        if not no_fsid:
            final_args += ['--fsid', self.mgr._cluster_fsid]

        if self.mgr.container_init:
            final_args += ['--container-init']

        final_args += args

        self.log.debug('args: %s' % (' '.join(final_args)))
        if self.mgr.mode == 'root':
            if stdin:
                self.log.debug('stdin: %s' % stdin)
            script = 'injected_argv = ' + json.dumps(final_args) + '\n'
            if stdin:
                script += 'injected_stdin = ' + json.dumps(stdin) + '\n'
            script += self.mgr._cephadm
            python = connr.choose_python()
            if not python:
                raise RuntimeError(
                    'unable to find python on %s (tried %s in %s)' % (
                        host, remotes.PYTHONS, remotes.PATH))
            try:
                out, err, code = remoto.process.check(
                    conn, [python, '-u'],
                    stdin=script.encode('utf-8'))
            except RuntimeError as e:
                self.mgr._reset_con(host)
                if error_ok:
                    return [], [str(e)], 1
                raise
        elif self.mgr.mode == 'cephadm-package':
            try:
                out, err, code = remoto.process.check(
                    conn,
                    ['sudo', '/usr/bin/cephadm'] + final_args,
                    stdin=stdin)
            except RuntimeError as e:
                self.mgr._reset_con(host)
                if error_ok:
                    return [], [str(e)], 1
                raise
        else:
            assert False, 'unsupported mode'

        self.log.debug('code: %d' % code)
        if out:
            self.log.debug('out: %s' % '\n'.join(out))
        if err:
            self.log.debug('err: %s' % '\n'.join(err))
        if code and not error_ok:
            raise OrchestratorError(
                'cephadm exited with an error code: %d, stderr:%s' % (
                    code, '\n'.join(err)))
        return out, err, code
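# Standalone sketch of the argument assembly above: each KEY=VALUE pair
# becomes an --env flag, the image flag precedes the cephadm subcommand
# (all values here are illustrative):
env_vars = ['HTTP_PROXY=http://proxy:3128', 'NO_PROXY=localhost']
final_args = []
for env_var_pair in env_vars:
    final_args.extend(['--env', env_var_pair])
final_args.extend(['--image', 'quay.io/ceph/ceph:v16'])
final_args.append('deploy')
print(final_args)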
def _create_daemon(self,
                   daemon_spec: CephadmDaemonSpec,
                   reconfig: bool = False,
                   osd_uuid_map: Optional[Dict[str, Any]] = None,
                   ) -> str:
    with set_exception_subject('service', orchestrator.DaemonDescription(
            daemon_type=daemon_spec.daemon_type,
            daemon_id=daemon_spec.daemon_id,
            hostname=daemon_spec.host,
    ).service_id(), overwrite=True):

        image = ''
        start_time = datetime_now()
        ports: List[int] = daemon_spec.ports if daemon_spec.ports else []

        if daemon_spec.daemon_type == 'container':
            spec: Optional[CustomContainerSpec] = daemon_spec.spec
            if spec is None:
                # Exit here immediately because the required service
                # spec to create a daemon is not provided. This is only
                # provided when a service is applied via 'orch apply'
                # command.
                msg = "Failed to {} daemon {} on {}: Required " \
                      "service specification not provided".format(
                          'reconfigure' if reconfig else 'deploy',
                          daemon_spec.name(), daemon_spec.host)
                self.log.info(msg)
                return msg
            image = spec.image
            if spec.ports:
                ports.extend(spec.ports)

        if daemon_spec.daemon_type == 'cephadm-exporter':
            if not reconfig:
                assert daemon_spec.host
                deploy_ok = self._deploy_cephadm_binary(daemon_spec.host)
                if not deploy_ok:
                    msg = f"Unable to deploy the cephadm binary to {daemon_spec.host}"
                    self.log.warning(msg)
                    return msg

        if daemon_spec.daemon_type == 'haproxy':
            haspec = cast(HA_RGWSpec, daemon_spec.spec)
            if haspec.haproxy_container_image:
                image = haspec.haproxy_container_image

        if daemon_spec.daemon_type == 'keepalived':
            haspec = cast(HA_RGWSpec, daemon_spec.spec)
            if haspec.keepalived_container_image:
                image = haspec.keepalived_container_image

        cephadm_config, deps = self.mgr.cephadm_services[daemon_type_to_service(
            daemon_spec.daemon_type)].generate_config(daemon_spec)

        # TCP port to open in the host firewall
        if len(ports) > 0:
            daemon_spec.extra_args.extend([
                '--tcp-ports', ' '.join(map(str, ports))])

        # osd deployments needs an --osd-uuid arg
        if daemon_spec.daemon_type == 'osd':
            if not osd_uuid_map:
                osd_uuid_map = self.mgr.get_osd_uuid_map()
            osd_uuid = osd_uuid_map.get(daemon_spec.daemon_id)
            if not osd_uuid:
                raise OrchestratorError('osd.%s not in osdmap' % daemon_spec.daemon_id)
            daemon_spec.extra_args.extend(['--osd-fsid', osd_uuid])

        if reconfig:
            daemon_spec.extra_args.append('--reconfig')
        if self.mgr.allow_ptrace:
            daemon_spec.extra_args.append('--allow-ptrace')

        if self.mgr.cache.host_needs_registry_login(daemon_spec.host) and self.mgr.registry_url:
            self._registry_login(daemon_spec.host, self.mgr.registry_url,
                                 self.mgr.registry_username, self.mgr.registry_password)

        daemon_spec.extra_args.extend(['--config-json', '-'])

        self.log.info('%s daemon %s on %s' % (
            'Reconfiguring' if reconfig else 'Deploying',
            daemon_spec.name(), daemon_spec.host))

        out, err, code = self._run_cephadm(
            daemon_spec.host, daemon_spec.name(), 'deploy',
            [
                '--name', daemon_spec.name(),
            ] + daemon_spec.extra_args,
            stdin=json.dumps(cephadm_config),
            image=image)
        if not code and daemon_spec.host in self.mgr.cache.daemons:
            # prime cached service state with what we (should have)
            # just created
            sd = orchestrator.DaemonDescription()
            sd.daemon_type = daemon_spec.daemon_type
            sd.daemon_id = daemon_spec.daemon_id
            sd.hostname = daemon_spec.host
            sd.status = 1
            sd.status_desc = 'starting'
            self.mgr.cache.add_daemon(daemon_spec.host, sd)
            if daemon_spec.daemon_type in ['grafana', 'iscsi', 'prometheus', 'alertmanager']:
                self.mgr.requires_post_actions.add(daemon_spec.daemon_type)
        self.mgr.cache.invalidate_host_daemons(daemon_spec.host)
        self.mgr.cache.update_daemon_config_deps(
            daemon_spec.host, daemon_spec.name(), deps, start_time)
        self.mgr.cache.save_host(daemon_spec.host)
        msg = "{} {} on host '{}'".format(
            'Reconfigured' if reconfig else 'Deployed', daemon_spec.name(), daemon_spec.host)
        if not code:
            self.mgr.events.for_daemon(daemon_spec.name(), OrchestratorEvent.INFO, msg)
        else:
            what = 'reconfigure' if reconfig else 'deploy'
            self.mgr.events.for_daemon(
                daemon_spec.name(), OrchestratorEvent.ERROR, f'Failed to {what}: {err}')
        return msg
def verify_no_migration(self) -> None:
    if self.is_migration_ongoing():
        # this is raised in module.serve()
        raise OrchestratorError(
            "cephadm migration still ongoing. Please wait until the migration is complete.")
def _find_inv_for_host(hostname: str, inventory_dict: dict) -> List[Device]:
    # This is stupid and needs to be loaded with the host
    for _host, _inventory in inventory_dict.items():
        if _host == hostname:
            return _inventory
    raise OrchestratorError("No inventory found for host: {}".format(hostname))
def assert_host(self, host: str) -> None:
    if host not in self._inventory:
        raise OrchestratorError('host %s does not exist' % host)
def resolve_ip(hostname: str) -> str:
    try:
        return socket.getaddrinfo(hostname, None,
                                  flags=socket.AI_CANONNAME,
                                  type=socket.SOCK_STREAM)[0][4][0]
    except socket.gaierror as e:
        raise OrchestratorError(f"Cannot resolve ip for host {hostname}: {e}")
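# Standalone demonstration of the getaddrinfo call used above; 'localhost'
# resolves on any machine with a standard hosts file:
import socket

info = socket.getaddrinfo('localhost', None,
                          flags=socket.AI_CANONNAME,
                          type=socket.SOCK_STREAM)
print(info[0][4][0])  # first resolved address, e.g. '127.0.0.1' or '::1'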
def create_realm_zonegroup_zone(self, spec: RGWSpec, rgw_id: str):
    if utils.get_cluster_health(self.mgr) != 'HEALTH_OK':
        raise OrchestratorError('Health not ok, will try again when health ok')

    # get keyring needed to run rados commands and strip out just the keyring
    keyring = self.get_keyring(rgw_id).split('key = ', 1)[1].rstrip()

    # We can call radosgw-admin within the container, cause cephadm gives the MGR the required keyring permissions

    # get realms
    cmd = ['radosgw-admin',
           '--key=%s' % keyring,
           '--user', 'rgw.%s' % rgw_id,
           'realm', 'list',
           '--format=json']
    result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    # create realm if needed
    cmd = ['radosgw-admin',
           '--key=%s' % keyring,
           '--user', 'rgw.%s' % rgw_id,
           'realm', 'create',
           '--rgw-realm=%s' % spec.rgw_realm,
           '--default']
    if not result.stdout:
        result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        self.mgr.log.info('created realm: %s' % spec.rgw_realm)
    else:
        try:
            j = json.loads(result.stdout)
            if 'realms' not in j or spec.rgw_realm not in j['realms']:
                result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                self.mgr.log.info('created realm: %s' % spec.rgw_realm)
        except Exception:
            raise OrchestratorError('failed to parse realm info')

    # get zonegroups
    cmd = ['radosgw-admin',
           '--key=%s' % keyring,
           '--user', 'rgw.%s' % rgw_id,
           'zonegroup', 'list',
           '--format=json']
    result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    # create zonegroup if needed
    cmd = ['radosgw-admin',
           '--key=%s' % keyring,
           '--user', 'rgw.%s' % rgw_id,
           'zonegroup', 'create',
           '--rgw-zonegroup=default',
           '--master', '--default']
    if not result.stdout:
        result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        self.mgr.log.info('created zonegroup: default')
    else:
        try:
            j = json.loads(result.stdout)
            if 'zonegroups' not in j or 'default' not in j['zonegroups']:
                result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                self.mgr.log.info('created zonegroup: default')
        except Exception:
            raise OrchestratorError('failed to parse zonegroup info')

    # get zones
    cmd = ['radosgw-admin',
           '--key=%s' % keyring,
           '--user', 'rgw.%s' % rgw_id,
           'zone', 'list',
           '--format=json']
    result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    # create zone if needed
    cmd = ['radosgw-admin',
           '--key=%s' % keyring,
           '--user', 'rgw.%s' % rgw_id,
           'zone', 'create',
           '--rgw-zonegroup=default',
           '--rgw-zone=%s' % spec.rgw_zone,
           '--master', '--default']
    if not result.stdout:
        result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        self.mgr.log.info('created zone: %s' % spec.rgw_zone)
    else:
        try:
            j = json.loads(result.stdout)
            if 'zones' not in j or spec.rgw_zone not in j['zones']:
                result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                self.mgr.log.info('created zone: %s' % spec.rgw_zone)
        except Exception:
            raise OrchestratorError('failed to parse zone info')
def _search(daemons):
    args = [('osd.%s' % d.service_instance, d.nodename) for d in daemons]
    if not args:
        raise OrchestratorError('Unable to find osd.%s' % name)
    return self._remove_daemon(args)
def keepalived_generate_config(
        self, daemon_spec: CephadmDaemonDeploySpec,
) -> Tuple[Dict[str, Any], List[str]]:
    spec = cast(IngressSpec, self.mgr.spec_store[daemon_spec.service_name].spec)
    assert spec.backend_service

    # generate password?
    pw_key = f'{spec.service_name()}/keepalived_password'
    password = self.mgr.get_store(pw_key)
    if password is None:
        if not spec.keepalived_password:
            password = ''.join(random.choice(string.ascii_lowercase) for _ in range(20))
            self.mgr.set_store(pw_key, password)
    else:
        if spec.keepalived_password:
            self.mgr.set_store(pw_key, None)
    if spec.keepalived_password:
        password = spec.keepalived_password

    daemons = self.mgr.cache.get_daemons_by_service(spec.service_name())
    if not daemons:
        raise OrchestratorError(
            f'Failed to generate keepalived.conf: No daemons deployed for {spec.service_name()}')

    deps = sorted([d.name() for d in daemons if d.daemon_type == 'haproxy'])

    host = daemon_spec.host
    hosts = sorted(list(set([host] + [str(d.hostname) for d in daemons])))

    # interface
    bare_ip = str(spec.virtual_ip).split('/')[0]
    interface = None
    for subnet, ifaces in self.mgr.cache.networks.get(host, {}).items():
        if ifaces and ipaddress.ip_address(bare_ip) in ipaddress.ip_network(subnet):
            interface = list(ifaces.keys())[0]
            logger.info(
                f'{bare_ip} is in {subnet} on {host} interface {interface}')
            break
    # try to find interface by matching spec.virtual_interface_networks
    if not interface and spec.virtual_interface_networks:
        for subnet, ifaces in self.mgr.cache.networks.get(host, {}).items():
            if subnet in spec.virtual_interface_networks:
                interface = list(ifaces.keys())[0]
                logger.info(
                    f'{spec.virtual_ip} will be configured on {host} interface '
                    f'{interface} (which has guiding subnet {subnet})')
                break
    if not interface:
        raise OrchestratorError(
            f"Unable to identify interface for {spec.virtual_ip} on {host}")

    # script to monitor health
    script = '/usr/bin/false'
    for d in daemons:
        if d.hostname == host:
            if d.daemon_type == 'haproxy':
                assert d.ports
                port = d.ports[1]   # monitoring port
                script = f'/usr/bin/curl {build_url(scheme="http", host=d.ip or "localhost", port=port)}/health'
    assert script

    # set state. first host in placement is master all others backups
    state = 'BACKUP'
    if hosts[0] == host:
        state = 'MASTER'

    # remove the host the daemon is being deployed on from the hosts list
    # for other_ips in the conf file, and convert the rest to ips
    if host in hosts:
        hosts.remove(host)
    other_ips = [resolve_ip(self.mgr.inventory.get_addr(h)) for h in hosts]

    keepalived_conf = self.mgr.template.render(
        'services/ingress/keepalived.conf.j2',
        {
            'spec': spec,
            'script': script,
            'password': password,
            'interface': interface,
            'state': state,
            'other_ips': other_ips,
            'host_ip': resolve_ip(self.mgr.inventory.get_addr(host)),
        }
    )

    config_file = {
        'files': {
            "keepalived.conf": keepalived_conf,
        }
    }

    return config_file, deps
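# Standalone sketch of the interface-selection test above: an interface
# qualifies when the bare virtual IP falls inside one of the host's known
# subnets. The networks mapping is an illustrative stand-in for the
# per-host entry in self.mgr.cache.networks.
import ipaddress

networks = {'10.0.0.0/24': {'eth0': ['10.0.0.5']}}
bare_ip = '10.0.0.100'
interface = None
for subnet, ifaces in networks.items():
    if ifaces and ipaddress.ip_address(bare_ip) in ipaddress.ip_network(subnet):
        interface = list(ifaces.keys())[0]
        break
print(interface)  # eth0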
def remove_osds(self, name):
    daemons = self._get_services('osd', service_id=name)
    args = [('osd.%s' % d.service_instance, d.nodename) for d in daemons]
    if not args:
        raise OrchestratorError('Unable to find osd.%s' % name)
    return self._remove_daemon(args)
def _create_daemon(self,
                   daemon_spec: CephadmDaemonDeploySpec,
                   reconfig: bool = False,
                   osd_uuid_map: Optional[Dict[str, Any]] = None,
                   ) -> str:
    with set_exception_subject('service', orchestrator.DaemonDescription(
            daemon_type=daemon_spec.daemon_type,
            daemon_id=daemon_spec.daemon_id,
            hostname=daemon_spec.host,
    ).service_id(), overwrite=True):
        try:
            image = ''
            start_time = datetime_now()
            ports: List[int] = daemon_spec.ports if daemon_spec.ports else []

            if daemon_spec.daemon_type == 'container':
                spec = cast(CustomContainerSpec,
                            self.mgr.spec_store[daemon_spec.service_name].spec)
                image = spec.image
                if spec.ports:
                    ports.extend(spec.ports)

            if daemon_spec.daemon_type == 'cephadm-exporter':
                if not reconfig:
                    assert daemon_spec.host
                    self._deploy_cephadm_binary(daemon_spec.host)

            if daemon_spec.daemon_type == 'haproxy':
                haspec = cast(HA_RGWSpec, self.mgr.spec_store[daemon_spec.service_name].spec)
                if haspec.haproxy_container_image:
                    image = haspec.haproxy_container_image

            if daemon_spec.daemon_type == 'keepalived':
                haspec = cast(HA_RGWSpec, self.mgr.spec_store[daemon_spec.service_name].spec)
                if haspec.keepalived_container_image:
                    image = haspec.keepalived_container_image

            # TCP port to open in the host firewall
            if len(ports) > 0:
                daemon_spec.extra_args.extend([
                    '--tcp-ports', ' '.join(map(str, ports))])

            # osd deployments needs an --osd-uuid arg
            if daemon_spec.daemon_type == 'osd':
                if not osd_uuid_map:
                    osd_uuid_map = self.mgr.get_osd_uuid_map()
                osd_uuid = osd_uuid_map.get(daemon_spec.daemon_id)
                if not osd_uuid:
                    raise OrchestratorError('osd.%s not in osdmap' % daemon_spec.daemon_id)
                daemon_spec.extra_args.extend(['--osd-fsid', osd_uuid])

            if reconfig:
                daemon_spec.extra_args.append('--reconfig')
            if self.mgr.allow_ptrace:
                daemon_spec.extra_args.append('--allow-ptrace')

            if self.mgr.cache.host_needs_registry_login(daemon_spec.host) and self.mgr.registry_url:
                self._registry_login(daemon_spec.host, self.mgr.registry_url,
                                     self.mgr.registry_username, self.mgr.registry_password)

            self.log.info('%s daemon %s on %s' % (
                'Reconfiguring' if reconfig else 'Deploying',
                daemon_spec.name(), daemon_spec.host))

            out, err, code = self._run_cephadm(
                daemon_spec.host, daemon_spec.name(), 'deploy',
                [
                    '--name', daemon_spec.name(),
                    '--meta-json', json.dumps({
                        'service_name': daemon_spec.service_name,
                        'ports': daemon_spec.ports,
                        'ip': daemon_spec.ip,
                    }),
                    '--config-json', '-',
                ] + daemon_spec.extra_args,
                stdin=json.dumps(daemon_spec.final_config),
                image=image)

            # refresh daemon state?  (ceph daemon reconfig does not need it)
            if not reconfig or daemon_spec.daemon_type not in CEPH_TYPES:
                if not code and daemon_spec.host in self.mgr.cache.daemons:
                    # prime cached service state with what we (should have)
                    # just created
                    sd = daemon_spec.to_daemon_description(
                        DaemonDescriptionStatus.running, 'starting')
                    self.mgr.cache.add_daemon(daemon_spec.host, sd)
                    if daemon_spec.daemon_type in ['grafana', 'iscsi', 'prometheus', 'alertmanager']:
                        self.mgr.requires_post_actions.add(daemon_spec.daemon_type)
                self.mgr.cache.invalidate_host_daemons(daemon_spec.host)

            self.mgr.cache.update_daemon_config_deps(
                daemon_spec.host, daemon_spec.name(), daemon_spec.deps, start_time)
            self.mgr.cache.save_host(daemon_spec.host)

            msg = "{} {} on host '{}'".format(
                'Reconfigured' if reconfig else 'Deployed', daemon_spec.name(), daemon_spec.host)
            if not code:
                self.mgr.events.for_daemon(daemon_spec.name(), OrchestratorEvent.INFO, msg)
            else:
                what = 'reconfigure' if reconfig else 'deploy'
                self.mgr.events.for_daemon(
                    daemon_spec.name(), OrchestratorEvent.ERROR, f'Failed to {what}: {err}')
            return msg
        except OrchestratorError:
            if not reconfig:
                # we have to clean up the daemon. E.g. keyrings.
                service_type = daemon_type_to_service(daemon_spec.daemon_type)
                dd = daemon_spec.to_daemon_description(DaemonDescriptionStatus.error, 'failed')
                self.mgr.cephadm_services[service_type].post_remove(dd)
            raise
def run(self) -> None:
    try:
        try:
            old_creds = self.mgr.get_store('cephadm_endpoint_credentials')
            if not old_creds:
                raise OrchestratorError('No old credentials for cephadm endpoint found')
            old_creds_dict = json.loads(old_creds)
            old_key = old_creds_dict['key']
            old_cert = old_creds_dict['cert']
            self.ssl_certs.load_root_credentials(old_cert, old_key)
        except (OrchestratorError, json.decoder.JSONDecodeError, KeyError, ValueError):
            self.ssl_certs.generate_root_cert()

        cert, key = self.ssl_certs.generate_cert()

        self.key_tmp = tempfile.NamedTemporaryFile()
        self.key_tmp.write(key.encode('utf-8'))
        self.key_tmp.flush()  # pkey_tmp must not be gc'ed
        key_fname = self.key_tmp.name

        self.cert_tmp = tempfile.NamedTemporaryFile()
        self.cert_tmp.write(cert.encode('utf-8'))
        self.cert_tmp.flush()  # cert_tmp must not be gc'ed
        cert_fname = self.cert_tmp.name

        verify_tls_files(cert_fname, key_fname)

        cherrypy.config.update({
            'server.socket_host': self.server_addr,
            'server.socket_port': self.server_port,
            'engine.autoreload.on': False,
            'server.ssl_module': 'builtin',
            'server.ssl_certificate': cert_fname,
            'server.ssl_private_key': key_fname,
        })
        root_conf = {'/': {'request.dispatch': cherrypy.dispatch.MethodDispatcher(),
                           'tools.response_headers.on': True}}
        cherrypy.tree.mount(Root(self.mgr), '/', root_conf)
        self.mgr.log.debug('Starting cherrypy engine...')
        self.start_engine()
        self.mgr.log.debug('Cherrypy engine started.')

        cephadm_endpoint_creds = {
            'cert': self.ssl_certs.get_root_cert(),
            'key': self.ssl_certs.get_root_key()
        }
        self.mgr.set_store('cephadm_endpoint_credentials', json.dumps(cephadm_endpoint_creds))
        self.mgr._kick_serve_loop()

        # wait for the shutdown event
        self.cherrypy_shutdown_event.wait()
        self.cherrypy_shutdown_event.clear()
        cherrypy.engine.stop()
        self.mgr.log.debug('Cherrypy engine stopped.')
    except Exception as e:
        self.mgr.log.error(f'Failed to run cephadm cherrypy endpoint: {e}')
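# Sketch of the keep-a-reference pattern above: the NamedTemporaryFile
# object is held (on self) so the underlying file is not deleted while
# cherrypy still reads the key/cert from disk; the payload is a placeholder.
import tempfile

tmp = tempfile.NamedTemporaryFile()
tmp.write(b'-----BEGIN PRIVATE KEY-----\n...\n-----END PRIVATE KEY-----\n')
tmp.flush()
print(tmp.name)  # path stays valid for as long as `tmp` is referenced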
def config(self, spec: RGWSpec, rgw_id: str) -> None:  # type: ignore
    assert self.TYPE == spec.service_type

    # create realm, zonegroup, and zone if needed
    self.create_realm_zonegroup_zone(spec, rgw_id)

    # ensure rgw_realm and rgw_zone is set for these daemons
    ret, out, err = self.mgr.check_mon_command({
        'prefix': 'config set',
        'who': f"{utils.name_to_config_section('rgw')}.{spec.service_id}",
        'name': 'rgw_zone',
        'value': spec.rgw_zone,
    })
    ret, out, err = self.mgr.check_mon_command({
        'prefix': 'config set',
        'who': f"{utils.name_to_config_section('rgw')}.{spec.rgw_realm}",
        'name': 'rgw_realm',
        'value': spec.rgw_realm,
    })
    ret, out, err = self.mgr.check_mon_command({
        'prefix': 'config set',
        'who': f"{utils.name_to_config_section('rgw')}.{spec.service_id}",
        'name': 'rgw_frontends',
        'value': spec.rgw_frontends_config_value(),
    })

    if spec.rgw_frontend_ssl_certificate:
        if isinstance(spec.rgw_frontend_ssl_certificate, list):
            cert_data = '\n'.join(spec.rgw_frontend_ssl_certificate)
        elif isinstance(spec.rgw_frontend_ssl_certificate, str):
            cert_data = spec.rgw_frontend_ssl_certificate
        else:
            raise OrchestratorError(
                'Invalid rgw_frontend_ssl_certificate: %s'
                % spec.rgw_frontend_ssl_certificate)
        ret, out, err = self.mgr.check_mon_command({
            'prefix': 'config-key set',
            'key': f'rgw/cert/{spec.rgw_realm}/{spec.rgw_zone}.crt',
            'val': cert_data,
        })

    if spec.rgw_frontend_ssl_key:
        if isinstance(spec.rgw_frontend_ssl_key, list):
            key_data = '\n'.join(spec.rgw_frontend_ssl_key)
        elif isinstance(spec.rgw_frontend_ssl_key, str):
            key_data = spec.rgw_frontend_ssl_key
        else:
            raise OrchestratorError(
                'Invalid rgw_frontend_ssl_key: %s'
                % spec.rgw_frontend_ssl_key)
        ret, out, err = self.mgr.check_mon_command({
            'prefix': 'config-key set',
            'key': f'rgw/cert/{spec.rgw_realm}/{spec.rgw_zone}.key',
            'val': key_data,
        })

    # TODO: fail, if we don't have a spec
    logger.info('Saving service %s spec with placement %s' % (
        spec.service_name(), spec.placement.pretty_str()))
    self.mgr.spec_store.save(spec)
def __getitem__(self, name: str) -> SpecDescription:
    if name not in self._specs:
        raise OrchestratorError(f'Service {name} not found.')
    return SpecDescription(self._specs[name],
                           self.spec_created[name],
                           self.spec_deleted.get(name, None))
def create_realm_zonegroup_zone(self, spec: RGWSpec, rgw_id: str) -> None:
    if utils.get_cluster_health(self.mgr) != 'HEALTH_OK':
        raise OrchestratorError('Health not ok, will try again when health ok')

    # get keyring needed to run rados commands and strip out just the keyring
    keyring = self.get_keyring(rgw_id).split('key = ', 1)[1].rstrip()

    # We can call radosgw-admin within the container, cause cephadm gives the MGR the required keyring permissions

    def get_realms() -> List[str]:
        cmd = ['radosgw-admin',
               '--key=%s' % keyring,
               '--user', 'rgw.%s' % rgw_id,
               'realm', 'list',
               '--format=json']
        result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out = result.stdout
        if not out:
            return []
        try:
            j = json.loads(out)
            return j.get('realms', [])
        except Exception:
            raise OrchestratorError('failed to parse realm info')

    def create_realm() -> None:
        cmd = ['radosgw-admin',
               '--key=%s' % keyring,
               '--user', 'rgw.%s' % rgw_id,
               'realm', 'create',
               '--rgw-realm=%s' % spec.rgw_realm,
               '--default']
        result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        if result.returncode:
            err = 'failed to create RGW realm "%s": %r' % (spec.rgw_realm, result.stderr)
            raise OrchestratorError(err)
        self.mgr.log.info('created realm: %s' % spec.rgw_realm)

    def get_zonegroups() -> List[str]:
        cmd = ['radosgw-admin',
               '--key=%s' % keyring,
               '--user', 'rgw.%s' % rgw_id,
               'zonegroup', 'list',
               '--format=json']
        result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out = result.stdout
        if not out:
            return []
        try:
            j = json.loads(out)
            return j.get('zonegroups', [])
        except Exception:
            raise OrchestratorError('failed to parse zonegroup info')

    def create_zonegroup() -> None:
        cmd = ['radosgw-admin',
               '--key=%s' % keyring,
               '--user', 'rgw.%s' % rgw_id,
               'zonegroup', 'create',
               '--rgw-zonegroup=default',
               '--master', '--default']
        result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        if result.returncode:
            err = 'failed to create RGW zonegroup "%s": %r' % ('default', result.stderr)
            raise OrchestratorError(err)
        self.mgr.log.info('created zonegroup: default')

    def create_zonegroup_if_required() -> None:
        zonegroups = get_zonegroups()
        if 'default' not in zonegroups:
            create_zonegroup()

    def get_zones() -> List[str]:
        cmd = ['radosgw-admin',
               '--key=%s' % keyring,
               '--user', 'rgw.%s' % rgw_id,
               'zone', 'list',
               '--format=json']
        result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out = result.stdout
        if not out:
            return []
        try:
            j = json.loads(out)
            return j.get('zones', [])
        except Exception:
            raise OrchestratorError('failed to parse zone info')

    def create_zone() -> None:
        cmd = ['radosgw-admin',
               '--key=%s' % keyring,
               '--user', 'rgw.%s' % rgw_id,
               'zone', 'create',
               '--rgw-zonegroup=default',
               '--rgw-zone=%s' % spec.rgw_zone,
               '--master', '--default']
        result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        if result.returncode:
            err = 'failed to create RGW zone "%s": %r' % (spec.rgw_zone, result.stderr)
            raise OrchestratorError(err)
        self.mgr.log.info('created zone: %s' % spec.rgw_zone)

    changes = False
    realms = get_realms()
    if spec.rgw_realm not in realms:
        create_realm()
        changes = True

    zones = get_zones()
    if spec.rgw_zone not in zones:
        create_zonegroup_if_required()
        create_zone()
        changes = True

    # update period if changes were made
    if changes:
        cmd = ['radosgw-admin',
               '--key=%s' % keyring,
               '--user', 'rgw.%s' % rgw_id,
               'period', 'update',
               '--rgw-realm=%s' % spec.rgw_realm,
               '--commit']
        result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        if result.returncode:
            err = 'failed to update RGW period: %r' % (result.stderr)
            raise OrchestratorError(err)
        self.mgr.log.info('updated period')
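# Illustrative sketch of the list-then-parse pattern used above; it assumes
# a radosgw-admin binary and suitable credentials are available on PATH:
import json
import subprocess

result = subprocess.run(['radosgw-admin', 'realm', 'list', '--format=json'],
                        stdout=subprocess.PIPE, stderr=subprocess.PIPE)
realms = json.loads(result.stdout).get('realms', []) if result.stdout else []
print(realms)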
def prepare_drivegroup(self, drive_group: DriveGroupSpec) -> List[Tuple[str, DriveSelection]]:
    # 1) use fn_filter to determine matching_hosts
    matching_hosts = drive_group.placement.filter_matching_hostspecs(
        self.mgr.cache.get_schedulable_hosts())
    # 2) Map the inventory to the InventoryHost object
    host_ds_map = []

    # set osd_id_claims

    def _find_inv_for_host(hostname: str, inventory_dict: dict) -> List[Device]:
        # This is stupid and needs to be loaded with the host
        for _host, _inventory in inventory_dict.items():
            if _host == hostname:
                return _inventory
        raise OrchestratorError("No inventory found for host: {}".format(hostname))

    # 3) iterate over matching_host and call DriveSelection
    logger.debug(f"Checking matching hosts -> {matching_hosts}")
    for host in matching_hosts:
        inventory_for_host = _find_inv_for_host(host, self.mgr.cache.devices)
        logger.debug(f"Found inventory for host {inventory_for_host}")

        # List of Daemons on that host
        dd_for_spec = self.mgr.cache.get_daemons_by_service(drive_group.service_name())
        dd_for_spec_and_host = [dd for dd in dd_for_spec if dd.hostname == host]

        drive_selection = DriveSelection(drive_group, inventory_for_host,
                                         existing_daemons=len(dd_for_spec_and_host))
        logger.debug(f"Found drive selection {drive_selection}")
        if drive_group.method and drive_group.method == 'raw':
            # ceph-volume can currently only handle a 1:1 mapping
            # of data/db/wal devices for raw mode osds. If db/wal devices
            # are defined and the number does not match the number of data
            # devices, we need to bail out
            if drive_selection.data_devices() and drive_selection.db_devices():
                if len(drive_selection.data_devices()) != len(drive_selection.db_devices()):
                    raise OrchestratorError(
                        'Raw mode only supports a 1:1 ratio of data to db devices. Found '
                        f'{len(drive_selection.data_devices())} potential data device(s) and '
                        f'{len(drive_selection.db_devices())} potential db device(s) on host {host}')
            if drive_selection.data_devices() and drive_selection.wal_devices():
                if len(drive_selection.data_devices()) != len(drive_selection.wal_devices()):
                    raise OrchestratorError(
                        'Raw mode only supports a 1:1 ratio of data to wal devices. Found '
                        f'{len(drive_selection.data_devices())} potential data device(s) and '
                        f'{len(drive_selection.wal_devices())} potential wal device(s) on host {host}')
        host_ds_map.append((host, drive_selection))
    return host_ds_map
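# Standalone sketch of the raw-mode 1:1 ratio rule enforced above, with
# plain lists standing in for the DriveSelection results (device paths are
# illustrative):
data_devices = ['/dev/sdb', '/dev/sdc']
db_devices = ['/dev/nvme0n1']
if data_devices and db_devices and len(data_devices) != len(db_devices):
    print('raw mode requires a 1:1 ratio of data to db devices: '
          f'found {len(data_devices)} data device(s) and {len(db_devices)} db device(s)')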