def _check_agent(self, host: str) -> bool:
    down = False
    try:
        assert self.mgr.cherrypy_thread
        assert self.mgr.cherrypy_thread.ssl_certs.get_root_cert()
    except Exception:
        self.mgr.log.debug(
            f'Delaying checking agent on {host} until cephadm endpoint finished creating root cert')
        return down
    if self.mgr.agent_helpers._agent_down(host):
        down = True
    try:
        agent = self.mgr.cache.get_daemons_by_type('agent', host=host)[0]
        assert agent.daemon_id is not None
        assert agent.hostname is not None
    except Exception as e:
        self.mgr.log.debug(
            f'Could not retrieve agent on host {host} from daemon cache: {e}')
        return down
    try:
        spec = self.mgr.spec_store.active_specs.get('agent', None)
        deps = self.mgr._calc_daemon_deps(spec, 'agent', agent.daemon_id)
        last_deps, last_config = self.mgr.agent_cache.get_agent_last_config_deps(host)
        if not last_config or last_deps != deps:
            # if root cert is the dep that changed, we must use ssh to reconfig
            # so it's necessary to check this one specifically
            root_cert_match = False
            try:
                root_cert = self.mgr.cherrypy_thread.ssl_certs.get_root_cert()
                if last_deps and root_cert in last_deps:
                    root_cert_match = True
            except Exception:
                pass
            daemon_spec = CephadmDaemonDeploySpec.from_daemon_description(agent)
            # we need to know the agent port to try to reconfig w/ http
            # otherwise there is no choice but a full ssh reconfig
            if host in self.mgr.agent_cache.agent_ports and root_cert_match and not down:
                daemon_spec = self.mgr.cephadm_services[daemon_type_to_service(
                    daemon_spec.daemon_type)].prepare_create(daemon_spec)
                self.mgr.agent_helpers._request_agent_acks(
                    hosts={daemon_spec.host},
                    increment=True,
                    daemon_spec=daemon_spec,
                )
            else:
                self.mgr._daemon_action(daemon_spec, action='reconfig')
            return down
    except Exception as e:
        self.mgr.log.debug(
            f'Agent on host {host} not ready to have config and deps checked: {e}')
    action = self.mgr.cache.get_scheduled_daemon_action(agent.hostname, agent.name())
    if action:
        try:
            daemon_spec = CephadmDaemonDeploySpec.from_daemon_description(agent)
            self.mgr._daemon_action(daemon_spec, action=action)
            self.mgr.cache.rm_scheduled_daemon_action(agent.hostname, agent.name())
        except Exception as e:
            self.mgr.log.debug(
                f'Agent on host {host} not ready to {action}: {e}')
    return down
def _check_agent(self, host: str) -> bool:
    try:
        assert self.mgr.cherrypy_thread
        assert self.mgr.cherrypy_thread.ssl_certs.get_root_cert()
    except Exception:
        self.mgr.log.debug(
            f'Delaying checking agent on {host} until cephadm endpoint finished creating root cert')
        return False
    if self.mgr.agent_helpers._agent_down(host):
        return True
    else:
        try:
            agent = self.mgr.cache.get_daemons_by_type('agent', host=host)[0]
            assert agent.daemon_id is not None
            assert agent.hostname is not None
        except Exception as e:
            self.mgr.log.debug(
                f'Could not retrieve agent on host {host} from daemon cache: {e}')
            return False
        try:
            spec = self.mgr.spec_store.active_specs.get('agent', None)
            deps = self.mgr._calc_daemon_deps(spec, 'agent', agent.daemon_id)
            last_deps, last_config = self.mgr.cache.get_daemon_last_config_deps(
                host, agent.name())
            if not last_config or last_deps != deps:
                daemon_spec = CephadmDaemonDeploySpec.from_daemon_description(agent)
                self.mgr._daemon_action(daemon_spec, action='reconfig')
                return False
        except Exception as e:
            self.mgr.log.debug(
                f'Agent on host {host} not ready to have config and deps checked: {e}')
        action = self.mgr.cache.get_scheduled_daemon_action(agent.hostname, agent.name())
        if action:
            try:
                daemon_spec = CephadmDaemonDeploySpec.from_daemon_description(agent)
                self.mgr._daemon_action(daemon_spec, action=action)
                self.mgr.cache.rm_scheduled_daemon_action(agent.hostname, agent.name())
            except Exception as e:
                self.mgr.log.debug(
                    f'Agent on host {host} not ready to {action}: {e}')
        return False
def deploy_osd_daemons_for_existing_osds(self, host: str, service_name: str,
                                         replace_osd_ids: Optional[List[str]] = None) -> str:

    if replace_osd_ids is None:
        replace_osd_ids = OsdIdClaims(self.mgr).filtered_by_host(host)
        assert replace_osd_ids is not None

    # check result
    osds_elems: dict = CephadmServe(self.mgr)._run_cephadm_json(
        host, 'osd', 'ceph-volume',
        [
            '--',
            'lvm', 'list',
            '--format', 'json',
        ])
    before_osd_uuid_map = self.mgr.get_osd_uuid_map(only_up=True)
    fsid = self.mgr._cluster_fsid
    osd_uuid_map = self.mgr.get_osd_uuid_map()
    created = []
    for osd_id, osds in osds_elems.items():
        for osd in osds:
            if osd['type'] == 'db':
                continue
            if osd['tags']['ceph.cluster_fsid'] != fsid:
                logger.debug('mismatched fsid, skipping %s' % osd)
                continue
            if osd_id in before_osd_uuid_map and osd_id not in replace_osd_ids:
                # if it exists but is part of the replacement operation, don't skip
                continue
            if osd_id not in osd_uuid_map:
                logger.debug('osd id {} does not exist in cluster'.format(osd_id))
                continue
            if osd_uuid_map.get(osd_id) != osd['tags']['ceph.osd_fsid']:
                logger.debug('mismatched osd uuid (cluster has %s, osd '
                             'has %s)' % (osd_uuid_map.get(osd_id),
                                          osd['tags']['ceph.osd_fsid']))
                continue

            created.append(osd_id)
            daemon_spec: CephadmDaemonDeploySpec = CephadmDaemonDeploySpec(
                service_name=service_name,
                daemon_id=osd_id,
                host=host,
                daemon_type='osd',
            )
            daemon_spec.final_config, daemon_spec.deps = self.generate_config(daemon_spec)
            CephadmServe(self.mgr)._create_daemon(
                daemon_spec,
                osd_uuid_map=osd_uuid_map)

    if created:
        self.mgr.cache.invalidate_host_devices(host)
        self.mgr.cache.invalidate_autotune(host)
        return "Created osd(s) %s on host '%s'" % (','.join(created), host)
    else:
        return "Created no osd(s) on host %s; already created?" % host
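# --- Illustrative sketch (not part of the source above) ---
# The loop in deploy_osd_daemons_for_existing_osds relies on the shape of
# `ceph-volume lvm list --format json` output: a dict keyed by OSD id, each
# value a list of LV records with a 'type' field and a 'tags' dict carrying
# 'ceph.cluster_fsid' and 'ceph.osd_fsid'. The fsid values below are
# hypothetical placeholders used only to demonstrate the skip conditions.
sample_osds_elems = {
    '0': [
        {
            'type': 'block',
            'tags': {
                'ceph.cluster_fsid': 'aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee',
                'ceph.osd_fsid': '11111111-2222-3333-4444-555555555555',
            },
        },
    ],
}

fsid = 'aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee'
osd_uuid_map = {'0': '11111111-2222-3333-4444-555555555555'}

for osd_id, osds in sample_osds_elems.items():
    for osd in osds:
        # mirrors the skip conditions above: ignore db LVs, foreign clusters,
        # and OSDs whose uuid does not match the cluster's osd uuid map
        if osd['type'] == 'db':
            continue
        if osd['tags']['ceph.cluster_fsid'] != fsid:
            continue
        if osd_uuid_map.get(osd_id) != osd['tags']['ceph.osd_fsid']:
            continue
        print(f'would deploy osd.{osd_id}')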
def test_iscsi_client_caps(self):
    iscsi_daemon_spec = CephadmDaemonDeploySpec(
        host='host', daemon_id='a', service_name=self.iscsi_spec.service_name())

    self.iscsi_service.prepare_create(iscsi_daemon_spec)

    expected_caps = ['mon',
                     'profile rbd, allow command "osd blocklist", allow command "config-key get" with "key" prefix "iscsi/"',
                     'mgr', 'allow command "service status"',
                     'osd', 'allow rwx']

    expected_call = call({'prefix': 'auth get-or-create',
                          'entity': 'client.iscsi.a',
                          'caps': expected_caps})
    expected_call2 = call({'prefix': 'auth caps',
                           'entity': 'client.iscsi.a',
                           'caps': expected_caps})

    assert expected_call in self.mgr.mon_command.mock_calls
    assert expected_call2 in self.mgr.mon_command.mock_calls
def test_iscsi_client_caps(self):
    mgr = FakeMgr()
    iscsi_service = self._get_services(mgr)['iscsi']

    iscsi_spec = IscsiServiceSpec(service_type='iscsi', service_id="a")
    iscsi_spec.daemon_type = "iscsi"
    iscsi_spec.daemon_id = "a"
    iscsi_spec.spec = MagicMock()
    iscsi_spec.spec.daemon_type = "iscsi"
    iscsi_spec.spec.ssl_cert = ''
    mgr.spec_store = MagicMock()
    mgr.spec_store.__getitem__.return_value = iscsi_spec

    iscsi_daemon_spec = CephadmDaemonDeploySpec(
        host='host', daemon_id='a', service_name=iscsi_spec.service_name())

    iscsi_service.prepare_create(iscsi_daemon_spec)

    expected_caps = ['mon',
                     'profile rbd, allow command "osd blocklist", allow command "config-key get" with "key" prefix "iscsi/"',
                     'mgr', 'allow command "service status"',
                     'osd', 'allow rwx']

    expected_call = call({'prefix': 'auth get-or-create',
                          'entity': 'client.iscsi.a',
                          'caps': expected_caps})
    expected_call2 = call({'prefix': 'auth caps',
                           'entity': 'client.iscsi.a',
                           'caps': expected_caps})

    assert expected_call in mgr.mon_command.mock_calls
    assert expected_call2 in mgr.mon_command.mock_calls
def prepare_create(self, daemon_spec: CephadmDaemonDeploySpec) -> CephadmDaemonDeploySpec:
    assert self.TYPE == daemon_spec.daemon_type
    daemon_spec.final_config, daemon_spec.deps = self.generate_config(daemon_spec)
    return daemon_spec
def prepare_create(self, daemon_spec: CephadmDaemonDeploySpec) -> CephadmDaemonDeploySpec:
    assert self.TYPE == daemon_spec.daemon_type
    elasticsearch_nodes = get_elasticsearch_nodes(self, daemon_spec)
    daemon_spec.final_config = {'elasticsearch_nodes': ",".join(elasticsearch_nodes)}
    return daemon_spec
def agent_config_successfully_delivered(self, daemon_spec: CephadmDaemonDeploySpec) -> None:
    # agent successfully received new config. Update config/deps
    assert daemon_spec.service_name == 'agent'
    self.update_daemon_config_deps(
        daemon_spec.host, daemon_spec.name(), daemon_spec.deps, datetime_now())
    self.agent_timestamp[daemon_spec.host] = datetime_now()
    self.agent_counter[daemon_spec.host] = 1
    self.save_host(daemon_spec.host)
def test_grafana_initial_admin_pw(self, cephadm_module: CephadmOrchestrator):
    with with_host(cephadm_module, 'test'):
        with with_service(cephadm_module, ServiceSpec('mgr')) as _, \
                with_service(cephadm_module, GrafanaSpec(initial_admin_password='secure')):
            out = cephadm_module.cephadm_services['grafana'].generate_config(
                CephadmDaemonDeploySpec('test', 'daemon', 'grafana'))
            assert out == (
                {
                    'files':
                        {
                            'grafana.ini':
                                '# This file is generated by cephadm.\n'
                                '[users]\n'
                                '  default_theme = light\n'
                                '[auth.anonymous]\n'
                                '  enabled = true\n'
                                "  org_name = 'Main Org.'\n"
                                "  org_role = 'Viewer'\n"
                                '[server]\n'
                                "  domain = 'bootstrap.storage.lab'\n"
                                '  protocol = https\n'
                                '  cert_file = /etc/grafana/certs/cert_file\n'
                                '  cert_key = /etc/grafana/certs/cert_key\n'
                                '  http_port = 3000\n'
                                '  http_addr = \n'
                                '[security]\n'
                                '  admin_user = admin\n'
                                '  admin_password = secure\n'
                                '  cookie_secure = true\n'
                                '  cookie_samesite = none\n'
                                '  allow_embedding = true',
                            'provisioning/datasources/ceph-dashboard.yml':
                                "# This file is generated by cephadm.\n"
                                'deleteDatasources:\n\n'
                                "  - name: 'Loki'\n"
                                '    orgId: 2\n\n'
                                'datasources:\n\n'
                                "  - name: 'Loki'\n"
                                "    type: 'loki'\n"
                                "    access: 'proxy'\n"
                                '    orgId: 2\n'
                                "    url: 'http://[1::4]:3100'\n"
                                '    basicAuth: false\n'
                                '    isDefault: true\n'
                                '    editable: false',
                            'certs/cert_file': ANY,
                            'certs/cert_key': ANY,
                        }
                }, [])
def prepare_create(self, daemon_spec: CephadmDaemonDeploySpec) -> CephadmDaemonDeploySpec:
    assert self.TYPE == daemon_spec.daemon_type
    collectors = []
    for dd in self.mgr.cache.get_daemons_by_type(JaegerCollectorService.TYPE):
        # scrape jaeger-collector nodes
        assert dd.hostname is not None
        port = dd.ports[0] if dd.ports else JaegerCollectorService.DEFAULT_SERVICE_PORT
        url = build_url(host=dd.hostname, port=port).lstrip('/')
        collectors.append(url)
    daemon_spec.final_config = {'collector_nodes': ",".join(collectors)}
    return daemon_spec
def haproxy_prepare_create(
    self,
    daemon_spec: CephadmDaemonDeploySpec,
) -> CephadmDaemonDeploySpec:
    assert daemon_spec.daemon_type == 'haproxy'

    daemon_id = daemon_spec.daemon_id
    host = daemon_spec.host
    spec = cast(IngressSpec, self.mgr.spec_store[daemon_spec.service_name].spec)

    logger.debug('prepare_create haproxy.%s on host %s with spec %s' % (
        daemon_id, host, spec))

    daemon_spec.final_config, daemon_spec.deps = self.haproxy_generate_config(daemon_spec)
    return daemon_spec
def test_ingress_config(self, _run_cephadm, cephadm_module: CephadmOrchestrator):
    _run_cephadm.side_effect = async_side_effect(('{}', '', 0))

    with with_host(cephadm_module, 'test'):
        cephadm_module.cache.update_host_networks('test', {
            '1.2.3.0/24': {
                'if0': ['1.2.3.4/32']
            }
        })

        # the ingress backend
        s = RGWSpec(service_id="foo", placement=PlacementSpec(count=1),
                    rgw_frontend_type='beast')

        ispec = IngressSpec(service_type='ingress',
                            service_id='test',
                            backend_service='rgw.foo',
                            frontend_port=8089,
                            monitor_port=8999,
                            monitor_user='admin',
                            monitor_password='12345',
                            keepalived_password='12345',
                            virtual_interface_networks=['1.2.3.0/24'],
                            virtual_ip="1.2.3.4/32")
        with with_service(cephadm_module, s) as _, with_service(cephadm_module, ispec) as _:
            # generate the keepalived conf based on the specified spec
            keepalived_generated_conf = cephadm_module.cephadm_services['ingress'].keepalived_generate_config(
                CephadmDaemonDeploySpec(host='test', daemon_id='ingress',
                                        service_name=ispec.service_name()))

            keepalived_expected_conf = {
                'files':
                    {
                        'keepalived.conf':
                            '# This file is generated by cephadm.\n'
                            'vrrp_script check_backend {\n    '
                            'script "/usr/bin/curl http://localhost:8999/health"\n    '
                            'weight -20\n    '
                            'interval 2\n    '
                            'rise 2\n    '
                            'fall 2\n}\n\n'
                            'vrrp_instance VI_0 {\n  '
                            'state MASTER\n  '
                            'priority 100\n  '
                            'interface if0\n  '
                            'virtual_router_id 51\n  '
                            'advert_int 1\n  '
                            'authentication {\n      '
                            'auth_type PASS\n      '
                            'auth_pass 12345\n  '
                            '}\n  '
                            'unicast_src_ip 1::4\n  '
                            'unicast_peer {\n  '
                            '}\n  '
                            'virtual_ipaddress {\n    '
                            '1.2.3.4/32 dev if0\n  '
                            '}\n  '
                            'track_script {\n      '
                            'check_backend\n  }\n'
                            '}'
                    }
            }

            # check keepalived config
            assert keepalived_generated_conf[0] == keepalived_expected_conf

            # generate the haproxy conf based on the specified spec
            haproxy_generated_conf = cephadm_module.cephadm_services['ingress'].haproxy_generate_config(
                CephadmDaemonDeploySpec(host='test', daemon_id='ingress',
                                        service_name=ispec.service_name()))

            haproxy_expected_conf = {
                'files':
                    {
                        'haproxy.cfg':
                            '# This file is generated by cephadm.'
                            '\nglobal\n    log '
                            '127.0.0.1 local2\n    '
                            'chroot /var/lib/haproxy\n    '
                            'pidfile /var/lib/haproxy/haproxy.pid\n    '
                            'maxconn 8000\n    '
                            'daemon\n    '
                            'stats socket /var/lib/haproxy/stats\n'
                            '\ndefaults\n    '
                            'mode http\n    '
                            'log global\n    '
                            'option httplog\n    '
                            'option dontlognull\n    '
                            'option http-server-close\n    '
                            'option forwardfor except 127.0.0.0/8\n    '
                            'option redispatch\n    '
                            'retries 3\n    '
                            'timeout queue 20s\n    '
                            'timeout connect 5s\n    '
                            'timeout http-request 1s\n    '
                            'timeout http-keep-alive 5s\n    '
                            'timeout client 1s\n    '
                            'timeout server 1s\n    '
                            'timeout check 5s\n    '
                            'maxconn 8000\n'
                            '\nfrontend stats\n    '
                            'mode http\n    '
                            'bind 1.2.3.4:8999\n    '
                            'bind localhost:8999\n    '
                            'stats enable\n    '
                            'stats uri /stats\n    '
                            'stats refresh 10s\n    '
                            'stats auth admin:12345\n    '
                            'http-request use-service prometheus-exporter if { path /metrics }\n    '
                            'monitor-uri /health\n'
                            '\nfrontend frontend\n    '
                            'bind 1.2.3.4:8089\n    '
                            'default_backend backend\n\n'
                            'backend backend\n    '
                            'option forwardfor\n    '
                            'balance static-rr\n    '
                            'option httpchk HEAD / HTTP/1.0\n    '
                            'server '
                            + haproxy_generated_conf[1][0] + ' 1::4:80 check weight 100\n'
                    }
            }

            assert haproxy_generated_conf[0] == haproxy_expected_conf
def _check_agent(self, host: str) -> bool:
    try:
        assert self.mgr.cherrypy_thread
        assert self.mgr.cherrypy_thread.ssl_certs.get_root_cert()
    except Exception:
        self.mgr.log.debug(
            f'Delaying checking agent on {host} until cephadm endpoint finished creating root cert')
        return False
    if self.mgr.agent_helpers._agent_down(host):
        if host not in self.mgr.offline_hosts:
            self.mgr.cache.metadata_up_to_date[host] = False
            # In case host is actually offline, it's best to reset the connection to avoid
            # a long timeout trying to use an existing connection to an offline host
            self.mgr.ssh._reset_con(host)
            try:
                # try to schedule redeploy of agent in case it is individually down
                agent = self.mgr.cache.get_daemons_by_type('agent', host=host)[0]
                with self.mgr.agent_helpers.agent_lock(host):
                    daemon_spec = CephadmDaemonDeploySpec.from_daemon_description(agent)
                    self.mgr._daemon_action(daemon_spec, action='redeploy')
            except AgentLockException:
                self.mgr.log.debug(
                    f'Could not redeploy agent on host {host}. Someone else holds agent\'s lock')
            except Exception as e:
                self.mgr.log.debug(
                    f'Failed to redeploy agent on host {host}. Agent possibly never deployed: {e}')
        return True
    else:
        try:
            agent = self.mgr.cache.get_daemons_by_type('agent', host=host)[0]
            assert agent.daemon_id is not None
            assert agent.hostname is not None
        except Exception as e:
            self.mgr.log.debug(
                f'Could not retrieve agent on host {host} from daemon cache: {e}')
            return False
        try:
            spec = self.mgr.spec_store.active_specs.get('agent', None)
            deps = self.mgr._calc_daemon_deps(spec, 'agent', agent.daemon_id)
            last_deps, last_config = self.mgr.cache.get_daemon_last_config_deps(
                host, agent.name())
            if not last_config or last_deps != deps:
                daemon_spec = CephadmDaemonDeploySpec.from_daemon_description(agent)
                with self.mgr.agent_helpers.agent_lock(host):
                    self.mgr._daemon_action(daemon_spec, action='reconfig')
                return False
        except AgentLockException:
            self.mgr.log.debug(
                f'Could not reconfig agent on host {host}. Someone else holds agent\'s lock')
        except Exception as e:
            self.mgr.log.debug(
                f'Agent on host {host} not ready to have config and deps checked: {e}')
        action = self.mgr.cache.get_scheduled_daemon_action(agent.hostname, agent.name())
        if action:
            try:
                with self.mgr.agent_helpers.agent_lock(host):
                    daemon_spec = CephadmDaemonDeploySpec.from_daemon_description(agent)
                    self.mgr._daemon_action(daemon_spec, action=action)
                    self.mgr.cache.rm_scheduled_daemon_action(agent.hostname, agent.name())
            except AgentLockException:
                self.mgr.log.debug(
                    f'Could not {action} agent on host {host}. Someone else holds agent\'s lock')
            except Exception as e:
                self.mgr.log.debug(
                    f'Agent on host {host} not ready to {action}: {e}')
        return False
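# --- Illustrative sketch (not part of the source above) ---
# The lock-guarded variant of _check_agent serializes per-host agent actions
# via agent_lock() and AgentLockException. Below is a minimal sketch of one
# plausible implementation, assuming a non-blocking per-host threading.Lock;
# the actual cephadm implementation may differ.
import threading
from contextlib import contextmanager
from typing import Dict, Iterator


class AgentLockException(Exception):
    pass


class AgentHelpers:
    def __init__(self) -> None:
        self._locks: Dict[str, threading.Lock] = {}

    @contextmanager
    def agent_lock(self, host: str) -> Iterator[None]:
        # one lock per host; acquire non-blocking so a contended host raises
        # instead of stalling the caller
        lock = self._locks.setdefault(host, threading.Lock())
        if not lock.acquire(blocking=False):
            raise AgentLockException(f'agent on {host} is locked')
        try:
            yield
        finally:
            lock.release()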
def _create_daemon(self,
                   daemon_spec: CephadmDaemonDeploySpec,
                   reconfig: bool = False,
                   osd_uuid_map: Optional[Dict[str, Any]] = None,
                   ) -> str:

    with set_exception_subject('service', orchestrator.DaemonDescription(
            daemon_type=daemon_spec.daemon_type,
            daemon_id=daemon_spec.daemon_id,
            hostname=daemon_spec.host,
    ).service_id(), overwrite=True):
        try:
            image = ''
            start_time = datetime_now()
            ports: List[int] = daemon_spec.ports if daemon_spec.ports else []

            if daemon_spec.daemon_type == 'container':
                spec = cast(CustomContainerSpec,
                            self.mgr.spec_store[daemon_spec.service_name].spec)
                image = spec.image
                if spec.ports:
                    ports.extend(spec.ports)

            if daemon_spec.daemon_type == 'cephadm-exporter':
                if not reconfig:
                    assert daemon_spec.host
                    deploy_ok = self._deploy_cephadm_binary(daemon_spec.host)
                    if not deploy_ok:
                        msg = f"Unable to deploy the cephadm binary to {daemon_spec.host}"
                        self.log.warning(msg)
                        return msg

            if daemon_spec.daemon_type == 'haproxy':
                haspec = cast(HA_RGWSpec, self.mgr.spec_store[daemon_spec.service_name].spec)
                if haspec.haproxy_container_image:
                    image = haspec.haproxy_container_image

            if daemon_spec.daemon_type == 'keepalived':
                haspec = cast(HA_RGWSpec, self.mgr.spec_store[daemon_spec.service_name].spec)
                if haspec.keepalived_container_image:
                    image = haspec.keepalived_container_image

            # TCP port to open in the host firewall
            if len(ports) > 0:
                daemon_spec.extra_args.extend([
                    '--tcp-ports', ' '.join(map(str, ports))
                ])

            # osd deployments needs an --osd-uuid arg
            if daemon_spec.daemon_type == 'osd':
                if not osd_uuid_map:
                    osd_uuid_map = self.mgr.get_osd_uuid_map()
                osd_uuid = osd_uuid_map.get(daemon_spec.daemon_id)
                if not osd_uuid:
                    raise OrchestratorError('osd.%s not in osdmap' % daemon_spec.daemon_id)
                daemon_spec.extra_args.extend(['--osd-fsid', osd_uuid])

            if reconfig:
                daemon_spec.extra_args.append('--reconfig')
            if self.mgr.allow_ptrace:
                daemon_spec.extra_args.append('--allow-ptrace')

            if self.mgr.cache.host_needs_registry_login(daemon_spec.host) and self.mgr.registry_url:
                self._registry_login(daemon_spec.host, self.mgr.registry_url,
                                     self.mgr.registry_username, self.mgr.registry_password)

            daemon_spec.extra_args.extend(['--config-json', '-'])

            self.log.info('%s daemon %s on %s' % (
                'Reconfiguring' if reconfig else 'Deploying',
                daemon_spec.name(), daemon_spec.host))

            out, err, code = self._run_cephadm(
                daemon_spec.host, daemon_spec.name(), 'deploy',
                [
                    '--name', daemon_spec.name(),
                ] + daemon_spec.extra_args,
                stdin=json.dumps(daemon_spec.final_config),
                image=image)

            if not code and daemon_spec.host in self.mgr.cache.daemons:
                # prime cached service state with what we (should have)
                # just created
                sd = daemon_spec.to_daemon_description(1, 'starting')
                self.mgr.cache.add_daemon(daemon_spec.host, sd)
                if daemon_spec.daemon_type in ['grafana', 'iscsi', 'prometheus', 'alertmanager']:
                    self.mgr.requires_post_actions.add(daemon_spec.daemon_type)
            self.mgr.cache.invalidate_host_daemons(daemon_spec.host)

            self.mgr.cache.update_daemon_config_deps(
                daemon_spec.host, daemon_spec.name(), daemon_spec.deps, start_time)
            self.mgr.cache.save_host(daemon_spec.host)
            msg = "{} {} on host '{}'".format(
                'Reconfigured' if reconfig else 'Deployed', daemon_spec.name(), daemon_spec.host)
            if not code:
                self.mgr.events.for_daemon(daemon_spec.name(), OrchestratorEvent.INFO, msg)
            else:
                what = 'reconfigure' if reconfig else 'deploy'
                self.mgr.events.for_daemon(
                    daemon_spec.name(), OrchestratorEvent.ERROR, f'Failed to {what}: {err}')
            return msg
        except OrchestratorError:
            if not reconfig:
                # we have to clean up the daemon. E.g. keyrings.
                service_type = daemon_type_to_service(daemon_spec.daemon_type)
                dd = daemon_spec.to_daemon_description(-1, 'failed')
                self.mgr.cephadm_services[service_type].post_remove(dd)
            raise
def create_single_host(self,
                       drive_group: DriveGroupSpec,
                       host: str, cmd: str, replace_osd_ids: List[str],
                       env_vars: Optional[List[str]] = None) -> str:
    out, err, code = self._run_ceph_volume_command(host, cmd, env_vars=env_vars)

    if code == 1 and ', it is already prepared' in '\n'.join(err):
        # HACK: when we create against an existing LV, ceph-volume
        # returns an error and the above message. To make this
        # command idempotent, tolerate this "error" and continue.
        logger.debug('the device was already prepared; continuing')
        code = 0
    if code:
        raise RuntimeError(
            'cephadm exited with an error code: %d, stderr:%s' % (
                code, '\n'.join(err)))

    # check result
    out, err, code = CephadmServe(self.mgr)._run_cephadm(
        host, 'osd', 'ceph-volume',
        [
            '--',
            'lvm', 'list',
            '--format', 'json',
        ])
    before_osd_uuid_map = self.mgr.get_osd_uuid_map(only_up=True)
    try:
        osds_elems = json.loads('\n'.join(out))
    except ValueError:
        logger.exception('Cannot decode JSON: \'%s\'' % '\n'.join(out))
        osds_elems = {}
    fsid = self.mgr._cluster_fsid
    osd_uuid_map = self.mgr.get_osd_uuid_map()
    created = []
    for osd_id, osds in osds_elems.items():
        for osd in osds:
            if osd['tags']['ceph.cluster_fsid'] != fsid:
                logger.debug('mismatched fsid, skipping %s' % osd)
                continue
            if osd_id in before_osd_uuid_map and osd_id not in replace_osd_ids:
                # if it exists but is part of the replacement operation, don't skip
                continue
            if osd_id not in osd_uuid_map:
                logger.debug('osd id {} does not exist in cluster'.format(osd_id))
                continue
            if osd_uuid_map.get(osd_id) != osd['tags']['ceph.osd_fsid']:
                logger.debug('mismatched osd uuid (cluster has %s, osd '
                             'has %s)' % (osd_uuid_map.get(osd_id),
                                          osd['tags']['ceph.osd_fsid']))
                continue

            created.append(osd_id)
            daemon_spec: CephadmDaemonDeploySpec = CephadmDaemonDeploySpec(
                service_name=drive_group.service_name(),
                daemon_id=osd_id,
                host=host,
                daemon_type='osd',
            )
            daemon_spec.final_config, daemon_spec.deps = self.generate_config(daemon_spec)
            CephadmServe(self.mgr)._create_daemon(
                daemon_spec,
                osd_uuid_map=osd_uuid_map)

    if created:
        self.mgr.cache.invalidate_host_devices(host)
        return "Created osd(s) %s on host '%s'" % (','.join(created), host)
    else:
        return "Created no osd(s) on host %s; already created?" % host
def _do_upgrade(self):
    # type: () -> None
    if not self.upgrade_state:
        logger.debug('_do_upgrade no state, exiting')
        return

    target_image = self.target_image
    target_id = self.upgrade_state.target_id
    target_digests = self.upgrade_state.target_digests
    target_version = self.upgrade_state.target_version

    first = False
    if not target_id or not target_version or not target_digests:
        # need to learn the container hash
        logger.info('Upgrade: First pull of %s' % target_image)
        self.upgrade_info_str = 'Doing first pull of %s image' % (target_image)
        try:
            target_id, target_version, target_digests = CephadmServe(
                self.mgr)._get_container_image_info(target_image)
        except OrchestratorError as e:
            self._fail_upgrade('UPGRADE_FAILED_PULL', {
                'severity': 'warning',
                'summary': 'Upgrade: failed to pull target image',
                'count': 1,
                'detail': [str(e)],
            })
            return
        if not target_version:
            self._fail_upgrade('UPGRADE_FAILED_PULL', {
                'severity': 'warning',
                'summary': 'Upgrade: failed to pull target image',
                'count': 1,
                'detail': ['unable to extract ceph version from container'],
            })
            return
        self.upgrade_state.target_id = target_id
        # extract the version portion of 'ceph version {version} ({sha1})'
        self.upgrade_state.target_version = target_version.split(' ')[2]
        self.upgrade_state.target_digests = target_digests
        self._save_upgrade_state()
        target_image = self.target_image
        first = True

    if target_digests is None:
        target_digests = []
    if target_version.startswith('ceph version '):
        # tolerate/fix upgrade state from older version
        self.upgrade_state.target_version = target_version.split(' ')[2]
        target_version = self.upgrade_state.target_version
    target_major, target_minor, target_patch = target_version.split('.')
    target_major_name = self.mgr.lookup_release_name(int(target_major))

    if first:
        logger.info('Upgrade: Target is version %s (%s)' % (
            target_version, target_major_name))
        logger.info('Upgrade: Target container is %s, digests %s' % (
            target_image, target_digests))

    version_error = self._check_target_version(target_version)
    if version_error:
        self._fail_upgrade('UPGRADE_BAD_TARGET_VERSION', {
            'severity': 'error',
            'summary': f'Upgrade: cannot upgrade/downgrade to {target_version}',
            'count': 1,
            'detail': [version_error],
        })
        return

    image_settings = self.get_distinct_container_image_settings()

    daemons = [d for d in self.mgr.cache.get_daemons()
               if d.daemon_type in CEPH_UPGRADE_ORDER]
    done = 0
    for daemon_type in CEPH_UPGRADE_ORDER:
        logger.debug('Upgrade: Checking %s daemons' % daemon_type)

        need_upgrade_self = False
        need_upgrade: List[Tuple[DaemonDescription, bool]] = []
        need_upgrade_deployer: List[Tuple[DaemonDescription, bool]] = []
        for d in daemons:
            if d.daemon_type != daemon_type:
                continue
            assert d.daemon_type is not None
            assert d.daemon_id is not None
            correct_digest = False
            if (any(d in target_digests for d in (d.container_image_digests or []))
                    or d.daemon_type in MONITORING_STACK_TYPES):
                logger.debug('daemon %s.%s container digest correct' % (
                    daemon_type, d.daemon_id))
                correct_digest = True
                if any(d in target_digests for d in (d.deployed_by or [])):
                    logger.debug('daemon %s.%s deployed by correct version' % (
                        d.daemon_type, d.daemon_id))
                    done += 1
                    continue

            if self.mgr.daemon_is_self(d.daemon_type, d.daemon_id):
                logger.info('Upgrade: Need to upgrade myself (mgr.%s)' %
                            self.mgr.get_mgr_id())
                need_upgrade_self = True
                continue

            if correct_digest:
                logger.debug('daemon %s.%s not deployed by correct version' % (
                    d.daemon_type, d.daemon_id))
                need_upgrade_deployer.append((d, True))
            else:
                logger.debug('daemon %s.%s not correct (%s, %s, %s)' % (
                    daemon_type, d.daemon_id,
                    d.container_image_name, d.container_image_digests, d.version))
                need_upgrade.append((d, False))

        if not need_upgrade_self:
            # only after the mgr itself is upgraded can we expect daemons to have
            # deployed_by == target_digests
            need_upgrade += need_upgrade_deployer

        # prepare filesystems for daemon upgrades?
        if (daemon_type == 'mds'
                and need_upgrade
                and not self._prepare_for_mds_upgrade(
                    target_major, [d_entry[0] for d_entry in need_upgrade])):
            return

        if need_upgrade:
            self.upgrade_info_str = 'Currently upgrading %s daemons' % (daemon_type)

        to_upgrade: List[Tuple[DaemonDescription, bool]] = []
        known_ok_to_stop: List[str] = []
        for d_entry in need_upgrade:
            d = d_entry[0]
            assert d.daemon_type is not None
            assert d.daemon_id is not None
            assert d.hostname is not None

            if not d.container_image_id:
                if d.container_image_name == target_image:
                    logger.debug('daemon %s has unknown container_image_id but has correct image name' % (
                        d.name()))
                    continue

            if known_ok_to_stop:
                if d.name() in known_ok_to_stop:
                    logger.info(f'Upgrade: {d.name()} is also safe to restart')
                    to_upgrade.append(d_entry)
                continue

            if d.daemon_type in ['mon', 'osd', 'mds']:
                # NOTE: known_ok_to_stop is an output argument for
                # _wait_for_ok_to_stop
                if not self._wait_for_ok_to_stop(d, known_ok_to_stop):
                    return

            to_upgrade.append(d_entry)

            # if we don't have a list of others to consider, stop now
            if not known_ok_to_stop:
                break

        num = 1
        for d_entry in to_upgrade:
            d = d_entry[0]
            assert d.daemon_type is not None
            assert d.daemon_id is not None
            assert d.hostname is not None

            self._update_upgrade_progress(done / len(daemons))

            # make sure host has latest container image
            out, errs, code = CephadmServe(self.mgr)._run_cephadm(
                d.hostname, '', 'inspect-image', [],
                image=target_image, no_fsid=True, error_ok=True)
            if code or not any(d in target_digests
                               for d in json.loads(''.join(out)).get('repo_digests', [])):
                logger.info('Upgrade: Pulling %s on %s' % (target_image, d.hostname))
                self.upgrade_info_str = 'Pulling %s image on host %s' % (
                    target_image, d.hostname)
                out, errs, code = CephadmServe(self.mgr)._run_cephadm(
                    d.hostname, '', 'pull', [],
                    image=target_image, no_fsid=True, error_ok=True)
                if code:
                    self._fail_upgrade('UPGRADE_FAILED_PULL', {
                        'severity': 'warning',
                        'summary': 'Upgrade: failed to pull target image',
                        'count': 1,
                        'detail': [
                            'failed to pull %s on host %s' % (target_image, d.hostname)],
                    })
                    return
                r = json.loads(''.join(out))
                if not any(d in target_digests for d in r.get('repo_digests', [])):
                    logger.info('Upgrade: image %s pull on %s got new digests %s (not %s), restarting' % (
                        target_image, d.hostname, r['repo_digests'], target_digests))
                    self.upgrade_info_str = 'Image %s pull on %s got new digests %s (not %s), restarting' % (
                        target_image, d.hostname, r['repo_digests'], target_digests)
                    self.upgrade_state.target_digests = r['repo_digests']
                    self._save_upgrade_state()
                    return

            self.upgrade_info_str = 'Currently upgrading %s daemons' % (daemon_type)

            if len(to_upgrade) > 1:
                logger.info('Upgrade: Updating %s.%s (%d/%d)' % (
                    d.daemon_type, d.daemon_id, num, len(to_upgrade)))
            else:
                logger.info('Upgrade: Updating %s.%s' % (
                    d.daemon_type, d.daemon_id))
            action = 'Upgrading' if not d_entry[1] else 'Redeploying'
            try:
                daemon_spec = CephadmDaemonDeploySpec.from_daemon_description(d)
                self.mgr._daemon_action(
                    daemon_spec,
                    'redeploy',
                    image=target_image if not d_entry[1] else None)
            except Exception as e:
                self._fail_upgrade('UPGRADE_REDEPLOY_DAEMON', {
                    'severity': 'warning',
                    'summary': f'{action} daemon {d.name()} on host {d.hostname} failed.',
                    'count': 1,
                    'detail': [f'Upgrade daemon: {d.name()}: {e}'],
                })
                return
            num += 1
        if to_upgrade:
            return

        # complete mon upgrade?
        if daemon_type == 'mon':
            if not self.mgr.get("have_local_config_map"):
                logger.info('Upgrade: Restarting mgr now that mons are running pacific')
                need_upgrade_self = True

        if need_upgrade_self:
            try:
                self.mgr.mgr_service.fail_over()
            except OrchestratorError as e:
                self._fail_upgrade('UPGRADE_NO_STANDBY_MGR', {
                    'severity': 'warning',
                    'summary': f'Upgrade: {e}',
                    'count': 1,
                    'detail': [
                        'The upgrade process needs to upgrade the mgr, '
                        'but it needs at least one standby to proceed.',
                    ],
                })
                return

            return  # unreachable code, as fail_over never returns
        elif daemon_type == 'mgr':
            if 'UPGRADE_NO_STANDBY_MGR' in self.mgr.health_checks:
                del self.mgr.health_checks['UPGRADE_NO_STANDBY_MGR']
                self.mgr.set_health_checks(self.mgr.health_checks)

        # make sure 'ceph versions' agrees
        ret, out_ver, err = self.mgr.check_mon_command({
            'prefix': 'versions',
        })
        j = json.loads(out_ver)
        for version, count in j.get(daemon_type, {}).items():
            short_version = version.split(' ')[2]
            if short_version != target_version:
                logger.warning(
                    'Upgrade: %d %s daemon(s) are %s != target %s' % (
                        count, daemon_type, short_version, target_version))

        # push down configs
        daemon_type_section = name_to_config_section(daemon_type)
        if image_settings.get(daemon_type_section) != target_image:
            logger.info('Upgrade: Setting container_image for all %s' % daemon_type)
            self.mgr.set_container_image(daemon_type_section, target_image)
        to_clean = []
        for section in image_settings.keys():
            if section.startswith(name_to_config_section(daemon_type) + '.'):
                to_clean.append(section)
        if to_clean:
            logger.debug('Upgrade: Cleaning up container_image for %s' % to_clean)
            for section in to_clean:
                ret, image, err = self.mgr.check_mon_command({
                    'prefix': 'config rm',
                    'name': 'container_image',
                    'who': section,
                })

        logger.debug('Upgrade: All %s daemons are up to date.' % daemon_type)

        # complete osd upgrade?
        if daemon_type == 'osd':
            osdmap = self.mgr.get("osd_map")
            osd_min_name = osdmap.get("require_osd_release", "argonaut")
            osd_min = ceph_release_to_major(osd_min_name)
            if osd_min < int(target_major):
                logger.info(
                    f'Upgrade: Setting require_osd_release to {target_major} {target_major_name}')
                ret, _, err = self.mgr.check_mon_command({
                    'prefix': 'osd require-osd-release',
                    'release': target_major_name,
                })

        # complete mds upgrade?
        if daemon_type == 'mds' and self.upgrade_state.fs_original_max_mds:
            for i in self.mgr.get("fs_map")['filesystems']:
                fs_id = i["id"]
                fs_name = i['mdsmap']['fs_name']
                new_max = self.upgrade_state.fs_original_max_mds.get(fs_id)
                if new_max:
                    self.mgr.log.info('Upgrade: Scaling up filesystem %s max_mds to %d' % (
                        fs_name, new_max))
                    ret, _, err = self.mgr.check_mon_command({
                        'prefix': 'fs set',
                        'fs_name': fs_name,
                        'var': 'max_mds',
                        'val': str(new_max),
                    })

            self.upgrade_state.fs_original_max_mds = {}
            self._save_upgrade_state()

    # clean up
    logger.info('Upgrade: Finalizing container_image settings')
    self.mgr.set_container_image('global', target_image)

    for daemon_type in CEPH_UPGRADE_ORDER:
        ret, image, err = self.mgr.check_mon_command({
            'prefix': 'config rm',
            'name': 'container_image',
            'who': name_to_config_section(daemon_type),
        })

    logger.info('Upgrade: Complete!')
    if self.upgrade_state.progress_id:
        self.mgr.remote('progress', 'complete',
                        self.upgrade_state.progress_id)
    self.upgrade_state = None
    self._save_upgrade_state()
    return
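# --- Worked example (not part of the source above) ---
# _do_upgrade stores only the numeric portion of the reported version; per the
# comment in the code, daemons report 'ceph version {version} ({sha1})', so
# split(' ')[2] yields the bare version, which then splits into
# major/minor/patch. The version string and sha below are hypothetical.
target_version_full = 'ceph version 16.2.7 (0123456789abcdef0123456789abcdef01234567)'

short_version = target_version_full.split(' ')[2]            # '16.2.7'
target_major, target_minor, target_patch = short_version.split('.')
assert (target_major, target_minor, target_patch) == ('16', '2', '7')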
def _check_daemons(self) -> None:

    daemons = self.mgr.cache.get_daemons()
    daemons_post: Dict[str, List[orchestrator.DaemonDescription]] = defaultdict(list)
    for dd in daemons:
        # orphan?
        spec = self.mgr.spec_store.active_specs.get(dd.service_name(), None)
        assert dd.hostname is not None
        assert dd.daemon_type is not None
        assert dd.daemon_id is not None
        if not spec and dd.daemon_type not in ['mon', 'mgr', 'osd']:
            # (mon and mgr specs should always exist; osds aren't matched
            # to a service spec)
            self.log.info('Removing orphan daemon %s...' % dd.name())
            self._remove_daemon(dd.name(), dd.hostname)

        # ignore unmanaged services
        if spec and spec.unmanaged:
            continue

        # These daemon types require additional configs after creation
        if dd.daemon_type in ['grafana', 'iscsi', 'prometheus', 'alertmanager', 'nfs']:
            daemons_post[dd.daemon_type].append(dd)

        if self.mgr.cephadm_services[daemon_type_to_service(dd.daemon_type)].get_active_daemon(
                self.mgr.cache.get_daemons_by_service(dd.service_name())).daemon_id == dd.daemon_id:
            dd.is_active = True
        else:
            dd.is_active = False

        deps = self.mgr._calc_daemon_deps(dd.daemon_type, dd.daemon_id)
        last_deps, last_config = self.mgr.cache.get_daemon_last_config_deps(
            dd.hostname, dd.name())
        if last_deps is None:
            last_deps = []
        action = self.mgr.cache.get_scheduled_daemon_action(dd.hostname, dd.name())
        if not last_config:
            self.log.info('Reconfiguring %s (unknown last config time)...' % (
                dd.name()))
            action = 'reconfig'
        elif last_deps != deps:
            self.log.debug('%s deps %s -> %s' % (dd.name(), last_deps, deps))
            self.log.info('Reconfiguring %s (dependencies changed)...' % (
                dd.name()))
            action = 'reconfig'
        elif self.mgr.last_monmap and \
                self.mgr.last_monmap > last_config and \
                dd.daemon_type in CEPH_TYPES:
            self.log.info('Reconfiguring %s (monmap changed)...' % dd.name())
            action = 'reconfig'
        elif self.mgr.extra_ceph_conf_is_newer(last_config) and \
                dd.daemon_type in CEPH_TYPES:
            self.log.info('Reconfiguring %s (extra config changed)...' % dd.name())
            action = 'reconfig'
        if action:
            if self.mgr.cache.get_scheduled_daemon_action(dd.hostname, dd.name()) == 'redeploy' \
                    and action == 'reconfig':
                action = 'redeploy'
            try:
                daemon_spec = CephadmDaemonDeploySpec.from_daemon_description(dd)
                self.mgr._daemon_action(daemon_spec, action=action)
                self.mgr.cache.rm_scheduled_daemon_action(dd.hostname, dd.name())
            except OrchestratorError as e:
                self.mgr.events.from_orch_error(e)
                if dd.daemon_type in daemons_post:
                    del daemons_post[dd.daemon_type]
                # continue...
            except Exception as e:
                self.mgr.events.for_daemon_from_exception(dd.name(), e)
                if dd.daemon_type in daemons_post:
                    del daemons_post[dd.daemon_type]
                # continue...

    # do daemon post actions
    for daemon_type, daemon_descs in daemons_post.items():
        if daemon_type in self.mgr.requires_post_actions:
            self.mgr.requires_post_actions.remove(daemon_type)
            self.mgr._get_cephadm_service(daemon_type_to_service(
                daemon_type)).daemon_check_post(daemon_descs)
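# --- Illustrative sketch (not part of the source above) ---
# The reconfig/redeploy decision in _check_daemons can be read as a small
# precedence function: a missing last-config time, changed dependencies, a
# newer monmap, or newer extra ceph conf all force a reconfig, and a scheduled
# redeploy outranks a computed reconfig. Names and signature here are
# illustrative only, not the module's API.
from datetime import datetime
from typing import List, Optional


def decide_action(last_config: Optional[datetime],
                  last_deps: List[str],
                  deps: List[str],
                  monmap_changed: bool,
                  extra_conf_changed: bool,
                  scheduled: Optional[str]) -> Optional[str]:
    action = scheduled
    if not last_config:
        action = 'reconfig'      # unknown last config time
    elif last_deps != deps:
        action = 'reconfig'      # dependencies changed
    elif monmap_changed:
        action = 'reconfig'      # ceph daemons follow monmap updates
    elif extra_conf_changed:
        action = 'reconfig'      # extra ceph conf changed
    if scheduled == 'redeploy' and action == 'reconfig':
        action = 'redeploy'      # a scheduled redeploy wins over reconfig
    return action


assert decide_action(None, [], [], False, False, None) == 'reconfig'
assert decide_action(datetime.now(), ['a'], ['b'], False, False, 'redeploy') == 'redeploy'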