def test_daemon_ok_to_stop(self, ok_to_stop, cephadm_module: CephadmOrchestrator):
    spec = ServiceSpec(
        'mds',
        service_id='fsname',
        placement=PlacementSpec(hosts=['host1', 'host2'])
    )
    with with_host(cephadm_module, 'host1'), with_host(cephadm_module, 'host2'):
        c = cephadm_module.apply_mds(spec)
        out = wait(cephadm_module, c)
        match_glob(out, "Scheduled mds.fsname update...")
        CephadmServe(cephadm_module)._apply_all_services()

        [daemon] = cephadm_module.cache.daemons['host1'].keys()

        spec.placement.set_hosts(['host2'])

        ok_to_stop.side_effect = False

        c = cephadm_module.apply_mds(spec)
        out = wait(cephadm_module, c)
        match_glob(out, "Scheduled mds.fsname update...")
        CephadmServe(cephadm_module)._apply_all_services()

        ok_to_stop.assert_called_with([daemon[4:]])

        assert_rm_daemon(cephadm_module, spec.service_name(), 'host1')  # verifies ok-to-stop
        assert_rm_daemon(cephadm_module, spec.service_name(), 'host2')

def deploy_osd_daemons_for_existing_osds(self, host: str, service_name: str,
                                         replace_osd_ids: Optional[List[str]] = None) -> str:

    if replace_osd_ids is None:
        replace_osd_ids = OsdIdClaims(self.mgr).filtered_by_host(host)
        assert replace_osd_ids is not None

    # check result
    osds_elems: dict = CephadmServe(self.mgr)._run_cephadm_json(
        host, 'osd', 'ceph-volume',
        [
            '--',
            'lvm', 'list',
            '--format', 'json',
        ])
    before_osd_uuid_map = self.mgr.get_osd_uuid_map(only_up=True)
    fsid = self.mgr._cluster_fsid
    osd_uuid_map = self.mgr.get_osd_uuid_map()
    created = []
    for osd_id, osds in osds_elems.items():
        for osd in osds:
            if osd['type'] == 'db':
                continue
            if osd['tags']['ceph.cluster_fsid'] != fsid:
                logger.debug('mismatched fsid, skipping %s' % osd)
                continue
            if osd_id in before_osd_uuid_map and osd_id not in replace_osd_ids:
                # this OSD already existed before this call and is not part of
                # the replacement operation, so skip it
                continue
            if osd_id not in osd_uuid_map:
                logger.debug('osd id {} does not exist in cluster'.format(osd_id))
                continue
            if osd_uuid_map.get(osd_id) != osd['tags']['ceph.osd_fsid']:
                logger.debug('mismatched osd uuid (cluster has %s, osd '
                             'has %s)' % (osd_uuid_map.get(osd_id),
                                          osd['tags']['ceph.osd_fsid']))
                continue

            created.append(osd_id)
            daemon_spec: CephadmDaemonDeploySpec = CephadmDaemonDeploySpec(
                service_name=service_name,
                daemon_id=osd_id,
                host=host,
                daemon_type='osd',
            )
            daemon_spec.final_config, daemon_spec.deps = self.generate_config(daemon_spec)
            CephadmServe(self.mgr)._create_daemon(
                daemon_spec,
                osd_uuid_map=osd_uuid_map)

    if created:
        self.mgr.cache.invalidate_host_devices(host)
        self.mgr.cache.invalidate_autotune(host)
        return "Created osd(s) %s on host '%s'" % (','.join(created), host)
    else:
        return "Created no osd(s) on host %s; already created?" % host

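# For illustration only: the `ceph-volume lvm list --format json` output that
# deploy_osd_daemons_for_existing_osds() above iterates is keyed by OSD id,
# each mapping to a list of LV entries whose tags carry the cluster and OSD
# fsids. The concrete values below are made up; only the shape matters.
example_osds_elems = {
    '0': [
        {
            'type': 'block',
            'tags': {
                'ceph.cluster_fsid': '6547bd3e-1397-4f9f-b1c2-8d2f3e2f6a1b',  # must match mgr._cluster_fsid
                'ceph.osd_fsid': 'e5a0bb20-9fb6-4f11-b09f-5b0c5e1a5f5e',      # must match get_osd_uuid_map()['0']
            },
        },
        {'type': 'db', 'tags': {}},  # db LVs are skipped by the loop
    ],
}
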
def assert_rm_service(cephadm: CephadmOrchestrator, srv_name):
    mon_or_mgr = cephadm.spec_store[srv_name].spec.service_type in ('mon', 'mgr')
    if mon_or_mgr:
        assert 'Unable' in wait(cephadm, cephadm.remove_service(srv_name))
        return
    assert wait(cephadm, cephadm.remove_service(srv_name)) == f'Removed service {srv_name}'
    assert cephadm.spec_store[srv_name].deleted is not None
    CephadmServe(cephadm)._check_daemons()
    CephadmServe(cephadm)._apply_all_services()
    assert cephadm.spec_store[srv_name].deleted
    unmanaged = cephadm.spec_store[srv_name].spec.unmanaged
    CephadmServe(cephadm)._purge_deleted_services()
    if not unmanaged:  # for unmanaged services we don't delete the daemons
        assert srv_name not in cephadm.spec_store, f'{cephadm.spec_store[srv_name]!r}'

def test_etc_ceph(self, _check, _get_connection, cephadm_module):
    _get_connection.return_value = mock.Mock(), mock.Mock()
    _check.return_value = '{}', '', 0

    assert cephadm_module.manage_etc_ceph_ceph_conf is False

    with with_host(cephadm_module, 'test'):
        assert not cephadm_module.cache.host_needs_new_etc_ceph_ceph_conf('test')

    with with_host(cephadm_module, 'test'):
        cephadm_module.set_module_option('manage_etc_ceph_ceph_conf', True)
        cephadm_module.config_notify()
        assert cephadm_module.manage_etc_ceph_ceph_conf is True

        CephadmServe(cephadm_module)._refresh_hosts_and_daemons()
        _check.assert_called_with(ANY, ['dd', 'of=/etc/ceph/ceph.conf'], stdin=b'')
        assert not cephadm_module.cache.host_needs_new_etc_ceph_ceph_conf('test')

        # set extra config and expect that we deploy another ceph.conf
        cephadm_module._set_extra_ceph_conf('[mon]\nk=v')
        CephadmServe(cephadm_module)._refresh_hosts_and_daemons()
        _check.assert_called_with(ANY, ['dd', 'of=/etc/ceph/ceph.conf'],
                                  stdin=b'\n\n[mon]\nk=v\n')

        # reload
        cephadm_module.cache.last_etc_ceph_ceph_conf = {}
        cephadm_module.cache.load()
        assert not cephadm_module.cache.host_needs_new_etc_ceph_ceph_conf('test')

        # Make sure _check_daemons does a redeploy due to a monmap change:
        cephadm_module.mock_store_set('_ceph_get', 'mon_map', {
            'modified': datetime.datetime.utcnow().strftime(CEPH_DATEFMT),
            'fsid': 'foobar',
        })
        cephadm_module.notify('mon_map', mock.MagicMock())
        assert cephadm_module.cache.host_needs_new_etc_ceph_ceph_conf('test')

        cephadm_module.cache.last_etc_ceph_ceph_conf = {}
        cephadm_module.cache.load()
        assert cephadm_module.cache.host_needs_new_etc_ceph_ceph_conf('test')

def test_monitoring_ports(self, _run_cephadm, cephadm_module: CephadmOrchestrator):
    _run_cephadm.side_effect = async_side_effect(('{}', '', 0))

    with with_host(cephadm_module, 'test'):

        yaml_str = """service_type: alertmanager
service_name: alertmanager
placement:
    count: 1
spec:
    port: 4200
"""
        yaml_file = yaml.safe_load(yaml_str)
        spec = ServiceSpec.from_json(yaml_file)

        with patch("cephadm.services.monitoring.AlertmanagerService.generate_config",
                   return_value=({}, [])):
            with with_service(cephadm_module, spec):
                CephadmServe(cephadm_module)._check_daemons()
                _run_cephadm.assert_called_with(
                    'test', 'alertmanager.test', 'deploy', [
                        '--name', 'alertmanager.test',
                        '--meta-json',
                        '{"service_name": "alertmanager", "ports": [4200, 9094], "ip": null, "deployed_by": [], "rank": null, "rank_generation": null}',
                        '--config-json', '-',
                        '--tcp-ports', '4200 9094',
                        '--reconfig'
                    ],
                    stdin='{}',
                    image='')

def test_daemon_check_extra_config(self, _run_cephadm, cephadm_module: CephadmOrchestrator):
    _run_cephadm.return_value = ('{}', '', 0)

    with with_host(cephadm_module, 'test'):

        # Also testing deploying mons without explicit network placement
        cephadm_module.check_mon_command({
            'prefix': 'config set',
            'who': 'mon',
            'name': 'public_network',
            'value': '127.0.0.0/8'
        })

        cephadm_module.cache.update_host_devices_networks(
            'test',
            [],
            {
                "127.0.0.0/8": ["127.0.0.1"],
            }
        )

        with with_service(cephadm_module, ServiceSpec(service_type='mon'),
                          CephadmOrchestrator.apply_mon, 'test') as d_names:
            [daemon_name] = d_names

            cephadm_module._set_extra_ceph_conf('[mon]\nk=v')

            CephadmServe(cephadm_module)._check_daemons()

            _run_cephadm.assert_called_with(
                'test', 'mon.test', 'deploy', [
                    '--name', 'mon.test',
                    '--reconfig',
                    '--config-json', '-'
                ],
                stdin='{"config": "\\n\\n[mon]\\nk=v\\n", "keyring": ""}',
                image='')

@contextmanager
def with_service(cephadm_module: CephadmOrchestrator, spec: ServiceSpec, meth=None,
                 host: str = '', status_running=False) -> Iterator[List[str]]:
    if spec.placement.is_empty() and host:
        spec.placement = PlacementSpec(hosts=[host], count=1)
    if meth is not None:
        c = meth(cephadm_module, spec)
        assert wait(cephadm_module, c) == f'Scheduled {spec.service_name()} update...'
    else:
        c = cephadm_module.apply([spec])
        assert wait(cephadm_module, c) == [f'Scheduled {spec.service_name()} update...']

    specs = [d.spec for d in wait(cephadm_module, cephadm_module.describe_service())]
    assert spec in specs

    CephadmServe(cephadm_module)._apply_all_services()

    if status_running:
        make_daemons_running(cephadm_module, spec.service_name())

    dds = wait(cephadm_module, cephadm_module.list_daemons())
    own_dds = [dd for dd in dds if dd.service_name() == spec.service_name()]
    if host and spec.service_type != 'osd':
        assert own_dds

    yield [dd.name() for dd in own_dds]

    assert_rm_service(cephadm_module, spec.service_name())

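# A minimal usage sketch of with_service() above (the host name is
# hypothetical), in the same style as the tests in this collection: the
# daemon names scheduled for the spec are yielded, and clean service removal
# is asserted when the block exits.
def test_with_service_sketch(cephadm_module: CephadmOrchestrator):
    with with_host(cephadm_module, 'test'):
        with with_service(cephadm_module, ServiceSpec(service_type='grafana'),
                          CephadmOrchestrator.apply_grafana, 'test') as d_names:
            assert d_names == ['grafana.test']
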
def test_mgr_update(self, cephadm_module):
    with with_host(cephadm_module, 'test'):
        ps = PlacementSpec(hosts=['test:0.0.0.0=a'], count=1)
        r = CephadmServe(cephadm_module)._apply_service(ServiceSpec('mgr', placement=ps))
        assert r

        assert_rm_daemon(cephadm_module, 'mgr.a', 'test')

def _run_ceph_volume_command(self, host: str,
                             cmd: str, env_vars: Optional[List[str]] = None
                             ) -> Tuple[List[str], List[str], int]:
    self.mgr.inventory.assert_host(host)

    # get bootstrap key
    ret, keyring, err = self.mgr.check_mon_command({
        'prefix': 'auth get',
        'entity': 'client.bootstrap-osd',
    })

    j = json.dumps({
        'config': self.mgr.get_minimal_ceph_conf(),
        'keyring': keyring,
    })

    split_cmd = cmd.split(' ')
    _cmd = ['--config-json', '-', '--']
    _cmd.extend(split_cmd)
    out, err, code = CephadmServe(self.mgr)._run_cephadm(
        host, 'osd', 'ceph-volume',
        _cmd,
        env_vars=env_vars,
        stdin=j,
        error_ok=True)
    return out, err, code

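# Illustrative caller for _run_ceph_volume_command() above (the function
# name, host, and device are hypothetical): the ceph-volume arguments are
# passed as one space-separated string, which the helper splits and forwards
# after '--config-json - --' together with the bootstrap keyring on stdin.
def prepare_device_sketch(osd_service, host: str) -> None:
    out, err, code = osd_service._run_ceph_volume_command(
        host, 'lvm batch --no-auto /dev/sdb --yes --no-systemd',
        env_vars=['CEPH_VOLUME_OSDSPEC_AFFINITY=foo'])
    if code:
        raise RuntimeError('ceph-volume failed: %s' % '\n'.join(err))
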
def test_ceph_volume_no_filter_for_batch(self, _run_cephadm, cephadm_module: CephadmOrchestrator):
    _run_cephadm.return_value = ('{}', '', 0)

    error_message = """cephadm exited with an error code: 1, stderr:/usr/bin/podman:stderr usage: ceph-volume inventory [-h] [--format {plain,json,json-pretty}] [path]/usr/bin/podman:stderr ceph-volume inventory: error: unrecognized arguments: --filter-for-batch
Traceback (most recent call last):
  File "<stdin>", line 6112, in <module>
  File "<stdin>", line 1299, in _infer_fsid
  File "<stdin>", line 1382, in _infer_image
  File "<stdin>", line 3612, in command_ceph_volume
  File "<stdin>", line 1061, in call_throws"""

    with with_host(cephadm_module, 'test'):
        _run_cephadm.reset_mock()
        _run_cephadm.side_effect = OrchestratorError(error_message)

        s = CephadmServe(cephadm_module)._refresh_host_devices('test')
        assert s == 'host test `cephadm ceph-volume` failed: ' + error_message

        assert _run_cephadm.mock_calls == [
            mock.call('test', 'osd', 'ceph-volume',
                      ['--', 'inventory', '--format=json', '--filter-for-batch'],
                      image='', no_fsid=False),
            mock.call('test', 'osd', 'ceph-volume',
                      ['--', 'inventory', '--format=json'],
                      image='', no_fsid=False),
        ]

def test_remove_osds(self, cephadm_module):
    with with_host(cephadm_module, 'test'):
        CephadmServe(cephadm_module)._refresh_host_daemons('test')
        c = cephadm_module.list_daemons()
        wait(cephadm_module, c)

        c = cephadm_module.remove_daemons(['osd.0'])
        out = wait(cephadm_module, c)
        assert out == ["Removed osd.0 from host 'test'"]

        cephadm_module.to_remove_osds.enqueue(OSD(osd_id=0,
                                                  replace=False,
                                                  force=False,
                                                  hostname='test',
                                                  fullname='osd.0',
                                                  process_started_at=datetime_now(),
                                                  remove_util=cephadm_module.to_remove_osds.rm_util))
        cephadm_module.to_remove_osds.process_removal_queue()
        assert cephadm_module.to_remove_osds == OSDRemovalQueue(cephadm_module)

        c = cephadm_module.remove_osds_status()
        out = wait(cephadm_module, c)
        assert out == []

def test_offline(self, check_execute_command, execute_command, cephadm_module):
    check_execute_command.return_value = ''
    execute_command.return_value = '', '', 0

    if not AsyncMock:
        # can't run this test if we could not import AsyncMock
        return
    mock_connect = AsyncMock(return_value='')
    with mock.patch("asyncssh.connect", new=mock_connect) as asyncssh_connect:
        with with_host(cephadm_module, 'test'):
            asyncssh_connect.side_effect = ConnectionLost('reason')
            code, out, err = cephadm_module.check_host('test')
            assert out == ''
            assert "Host 'test' not found" in err

            out = wait(cephadm_module, cephadm_module.get_hosts())[0].to_json()
            assert out == HostSpec('test', '1::4', status='Offline').to_json()

            asyncssh_connect.return_value = mock.MagicMock()
            asyncssh_connect.side_effect = None
            assert CephadmServe(cephadm_module)._check_host('test') is None
            out = wait(cephadm_module, cephadm_module.get_hosts())[0].to_json()
            assert out == HostSpec('test', '1::4').to_json()

def test_daemon_action_fail(self, cephadm_module: CephadmOrchestrator):
    cephadm_module.service_cache_timeout = 10
    with with_host(cephadm_module, 'test'):
        with with_daemon(cephadm_module, RGWSpec(service_id='myrgw.foobar'),
                         CephadmOrchestrator.add_rgw, 'test') as daemon_id:
            with mock.patch('ceph_module.BaseMgrModule._ceph_send_command') as _ceph_send_command:

                _ceph_send_command.side_effect = Exception("myerror")

                # Make sure _check_daemons does a redeploy due to a monmap change:
                cephadm_module.mock_store_set('_ceph_get', 'mon_map', {
                    'modified': datetime.datetime.utcnow().strftime(CEPH_DATEFMT),
                    'fsid': 'foobar',
                })
                cephadm_module.notify('mon_map', None)

                CephadmServe(cephadm_module)._check_daemons()

                evs = [e.message for e in cephadm_module.events.get_for_daemon(
                    f'rgw.{daemon_id}')]

                assert 'myerror' in ''.join(evs)

def test_daemon_action(self, cephadm_module: CephadmOrchestrator):
    cephadm_module.service_cache_timeout = 10
    with with_host(cephadm_module, 'test'):
        with with_daemon(cephadm_module, RGWSpec(service_id='myrgw.foobar'),
                         CephadmOrchestrator.add_rgw, 'test') as daemon_id:

            c = cephadm_module.daemon_action('redeploy', 'rgw.' + daemon_id)
            assert wait(cephadm_module,
                        c) == f"Scheduled to redeploy rgw.{daemon_id} on host 'test'"

            for what in ('start', 'stop', 'restart'):
                c = cephadm_module.daemon_action(what, 'rgw.' + daemon_id)
                assert wait(cephadm_module,
                            c) == f"Scheduled to {what} rgw.{daemon_id} on host 'test'"

            # Make sure _check_daemons does a redeploy due to a monmap change:
            cephadm_module._store['_ceph_get/mon_map'] = {
                'modified': datetime.datetime.utcnow().strftime(CEPH_DATEFMT),
                'fsid': 'foobar',
            }
            cephadm_module.notify('mon_map', None)

            CephadmServe(cephadm_module)._check_daemons()

@contextmanager
def with_host(m: CephadmOrchestrator, name, refresh_hosts=True):
    # type: (CephadmOrchestrator, str, bool) -> Iterator[None]
    wait(m, m.add_host(HostSpec(hostname=name)))
    if refresh_hosts:
        CephadmServe(m)._refresh_hosts_and_daemons()
    yield
    wait(m, m.remove_host(name))

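# Typical nesting of the with_host() helper above (the host name is
# hypothetical): the host is registered and refreshed on entry and removed
# again on exit.
def test_with_host_sketch(cephadm_module: CephadmOrchestrator):
    with with_host(cephadm_module, 'test'):
        assert [h.hostname for h in wait(cephadm_module, cephadm_module.get_hosts())] == ['test']
    assert wait(cephadm_module, cephadm_module.get_hosts()) == []
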
def test_daemon_check_post(self, cephadm_module: CephadmOrchestrator):
    with with_host(cephadm_module, 'test'):
        with with_service(cephadm_module, ServiceSpec(service_type='grafana'),
                          CephadmOrchestrator.apply_grafana, 'test'):

            # Make sure _check_daemons does a redeploy due to a monmap change:
            cephadm_module.mock_store_set('_ceph_get', 'mon_map', {
                'modified': datetime.datetime.utcnow().strftime(CEPH_DATEFMT),
                'fsid': 'foobar',
            })
            cephadm_module.notify('mon_map', None)
            cephadm_module.mock_store_set('_ceph_get', 'mgr_map', {
                'modules': ['dashboard']
            })

            with mock.patch("cephadm.module.CephadmOrchestrator.mon_command") as _mon_cmd:
                CephadmServe(cephadm_module)._check_daemons()
                _mon_cmd.assert_any_call({
                    'prefix': 'dashboard set-grafana-api-url',
                    'value': 'https://test:3000'
                })

def test_remove_daemon(self, cephadm_module):
    with with_host(cephadm_module, 'test'):
        CephadmServe(cephadm_module)._refresh_host_daemons('test')
        c = cephadm_module.list_daemons()
        wait(cephadm_module, c)
        c = cephadm_module.remove_daemons(['rgw.myrgw.myhost.myid'])
        out = wait(cephadm_module, c)
        assert out == ["Removed rgw.myrgw.myhost.myid from host 'test'"]

def test_migrate_scheduler(cephadm_module: CephadmOrchestrator):
    with with_host(cephadm_module, 'host1', refresh_hosts=False):
        with with_host(cephadm_module, 'host2', refresh_hosts=False):

            # emulate the old scheduler:
            c = cephadm_module.apply_rgw(
                ServiceSpec('rgw', 'r.z', placement=PlacementSpec(host_pattern='*', count=2))
            )
            assert wait(cephadm_module, c) == 'Scheduled rgw.r.z update...'

            # with pytest.raises(OrchestratorError, match="cephadm migration still ongoing. Please wait, until the migration is complete."):
            CephadmServe(cephadm_module)._apply_all_services()

            cephadm_module.migration_current = 0
            cephadm_module.migration.migrate()

            # assert we need all daemons.
            assert cephadm_module.migration_current == 0

            CephadmServe(cephadm_module)._refresh_hosts_and_daemons()
            receive_agent_metadata_all_hosts(cephadm_module)
            cephadm_module.migration.migrate()

            CephadmServe(cephadm_module)._apply_all_services()

            out = {o.hostname for o in wait(cephadm_module, cephadm_module.list_daemons())}
            assert out == {'host1', 'host2'}

            c = cephadm_module.apply_rgw(
                ServiceSpec('rgw', 'r.z', placement=PlacementSpec(host_pattern='host1', count=2))
            )
            assert wait(cephadm_module, c) == 'Scheduled rgw.r.z update...'

            # A hack: make sure Migration thinks we have already updated all daemons.
            cephadm_module.cache.last_daemon_update['host1'] = datetime_now()
            cephadm_module.cache.last_daemon_update['host2'] = datetime_now()

            cephadm_module.migration_current = 0
            cephadm_module.migration.migrate()
            assert cephadm_module.migration_current >= 2

            out = [o.spec.placement for o in wait(cephadm_module,
                                                  cephadm_module.describe_service())]
            assert out == [PlacementSpec(count=2, hosts=[
                HostPlacementSpec(hostname='host1', network='', name=''),
                HostPlacementSpec(hostname='host2', network='', name='')])]

@contextmanager
def with_host(m: CephadmOrchestrator, name, addr='1::4', refresh_hosts=True):
    # type: (CephadmOrchestrator, str, str, bool) -> Iterator[None]
    with mock.patch("cephadm.utils.resolve_ip", return_value=addr):
        wait(m, m.add_host(HostSpec(hostname=name)))
        if refresh_hosts:
            CephadmServe(m)._refresh_hosts_and_daemons()
            receive_agent_metadata(m, name)
        yield
        wait(m, m.remove_host(name))

def test_daemon_check(self, cephadm_module: CephadmOrchestrator, action):
    with with_host(cephadm_module, 'test'):
        with with_service(cephadm_module, ServiceSpec(service_type='grafana'),
                          CephadmOrchestrator.apply_grafana, 'test') as d_names:
            [daemon_name] = d_names

            cephadm_module._schedule_daemon_action(daemon_name, action)

            assert cephadm_module.cache.get_scheduled_daemon_action(
                'test', daemon_name) == action

            CephadmServe(cephadm_module)._check_daemons()

            assert cephadm_module.cache.get_scheduled_daemon_action('test', daemon_name) is None

def receive_agent_metadata(m: CephadmOrchestrator, host: str,
                           ops: Optional[List[str]] = None) -> None:
    to_update: Dict[str, Callable[[str, Any], None]] = {
        'ls': m._process_ls_output,
        'gather-facts': m.cache.update_host_facts,
        'list-networks': m.cache.update_host_networks,
    }
    if ops:
        for op in ops:
            out = CephadmServe(m)._run_cephadm_json(host, cephadmNoImage, op, [])
            to_update[op](host, out)
    m.cache.last_daemon_update[host] = datetime_now()
    m.cache.last_facts_update[host] = datetime_now()
    m.cache.last_network_update[host] = datetime_now()
    m.cache.metadata_up_to_date[host] = True

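# Usage sketch for receive_agent_metadata() above (the function name is
# hypothetical): the valid `ops` values are the keys of `to_update`; passing
# all three simulates a full agent report so the serve loop treats the
# host's metadata as current.
def refresh_metadata_sketch(m: CephadmOrchestrator, host: str) -> None:
    receive_agent_metadata(m, host, ops=['ls', 'gather-facts', 'list-networks'])
    assert m.cache.metadata_up_to_date[host]
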
def test_rgw_update(self, cephadm_module):
    with with_host(cephadm_module, 'host1'):
        with with_host(cephadm_module, 'host2'):
            ps = PlacementSpec(hosts=['host1'], count=1)
            c = cephadm_module.add_rgw(
                RGWSpec(rgw_realm='realm', rgw_zone='zone1', placement=ps))
            [out] = wait(cephadm_module, c)
            match_glob(out, "Deployed rgw.realm.zone1.host1.* on host 'host1'")

            ps = PlacementSpec(hosts=['host1', 'host2'], count=2)
            r = CephadmServe(cephadm_module)._apply_service(
                RGWSpec(rgw_realm='realm', rgw_zone='zone1', placement=ps))
            assert r

            assert_rm_daemon(cephadm_module, 'rgw.realm.zone1', 'host1')
            assert_rm_daemon(cephadm_module, 'rgw.realm.zone1', 'host2')

def test_offline(self, _check, _get_connection, cephadm_module):
    _check.return_value = '{}', '', 0
    _get_connection.return_value = mock.Mock(), mock.Mock()
    with with_host(cephadm_module, 'test'):
        _get_connection.side_effect = HostNotFound
        code, out, err = cephadm_module.check_host('test')
        assert out == ''
        assert "Host 'test' not found" in err

        out = wait(cephadm_module, cephadm_module.get_hosts())[0].to_json()
        assert out == HostSpec('test', 'test', status='Offline').to_json()

        _get_connection.side_effect = None
        assert CephadmServe(cephadm_module)._check_host('test') is None
        out = wait(cephadm_module, cephadm_module.get_hosts())[0].to_json()
        assert out == HostSpec('test', 'test').to_json()

def zap_osd(self, osd: "OSD") -> str:
    """Zaps all devices that are associated with an OSD"""
    if osd.hostname is not None:
        out, err, code = CephadmServe(self.mgr)._run_cephadm(
            osd.hostname, 'osd', 'ceph-volume',
            ['--', 'lvm', 'zap', '--destroy', '--osd-id', str(osd.osd_id)],
            error_ok=True)
        self.mgr.cache.invalidate_host_devices(osd.hostname)
        if code:
            raise OrchestratorError('Zap failed: %s' % '\n'.join(out + err))
        return '\n'.join(out + err)
    raise OrchestratorError(f"Failed to zap OSD {osd.osd_id} because host was unknown")

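# For reference, zap_osd() above effectively runs the following on the host
# (the invocation is assembled by _run_cephadm; shown only as an illustration):
#
#   cephadm ceph-volume -- lvm zap --destroy --osd-id <osd_id>
#
# A hedged caller sketch, assuming the removal queue exposes the util object
# that owns zap_osd() (as `rm_util` in test_remove_osds above):
def zap_sketch(removal_queue, osd: "OSD") -> str:
    return removal_queue.rm_util.zap_osd(osd)
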
def test_upgrade_run(self, use_repo_digest, cephadm_module: CephadmOrchestrator):
    with with_host(cephadm_module, 'test', refresh_hosts=False):
        cephadm_module.set_container_image('global', 'image')

        if use_repo_digest:
            cephadm_module.use_repo_digest = True

        CephadmServe(cephadm_module).convert_tags_to_repo_digest()

        _, image, _ = cephadm_module.check_mon_command({
            'prefix': 'config get',
            'who': 'global',
            'key': 'container_image',
        })
        if use_repo_digest:
            assert image == 'image@repo_digest'
        else:
            assert image == 'image'

@contextmanager
def with_service(cephadm_module: CephadmOrchestrator, spec: ServiceSpec,
                 meth, host: str) -> Iterator[List[str]]:
    if spec.placement.is_empty():
        spec.placement = PlacementSpec(hosts=[host], count=1)
    c = meth(cephadm_module, spec)
    assert wait(cephadm_module, c) == f'Scheduled {spec.service_name()} update...'
    specs = [d.spec for d in wait(cephadm_module, cephadm_module.describe_service())]
    assert spec in specs

    CephadmServe(cephadm_module)._apply_all_services()

    dds = wait(cephadm_module, cephadm_module.list_daemons())
    own_dds = [dd for dd in dds if dd.service_name() == spec.service_name()]
    assert own_dds

    yield [dd.name() for dd in own_dds]

    assert_rm_service(cephadm_module, spec.service_name())

def test_apply_osd_save(self, _run_cephadm, cephadm_module: CephadmOrchestrator):
    _run_cephadm.return_value = ('{}', '', 0)
    with with_host(cephadm_module, 'test'):

        spec = DriveGroupSpec(
            service_id='foo',
            placement=PlacementSpec(
                host_pattern='*',
            ),
            data_devices=DeviceSelection(
                all=True
            )
        )

        c = cephadm_module.apply([spec])
        assert wait(cephadm_module, c) == ['Scheduled osd.foo update...']

        inventory = Devices([
            Device(
                '/dev/sdb',
                available=True
            ),
        ])

        cephadm_module.cache.update_host_devices_networks('test', inventory.devices, {})

        _run_cephadm.return_value = (['{}'], '', 0)

        assert CephadmServe(cephadm_module)._apply_all_services() is False

        _run_cephadm.assert_any_call(
            'test', 'osd', 'ceph-volume',
            ['--config-json', '-', '--', 'lvm', 'batch',
             '--no-auto', '/dev/sdb', '--yes', '--no-systemd'],
            env_vars=['CEPH_VOLUME_OSDSPEC_AFFINITY=foo'], error_ok=True,
            stdin='{"config": "", "keyring": ""}')

        _run_cephadm.assert_called_with(
            'test', 'osd', 'ceph-volume', ['--', 'lvm', 'list', '--format', 'json'])

def _do_upgrade(self):
    # type: () -> None
    if not self.upgrade_state:
        logger.debug('_do_upgrade no state, exiting')
        return

    target_image = self.target_image
    target_id = self.upgrade_state.target_id
    target_digests = self.upgrade_state.target_digests
    target_version = self.upgrade_state.target_version

    first = False
    if not target_id or not target_version or not target_digests:
        # need to learn the container hash
        logger.info('Upgrade: First pull of %s' % target_image)
        self.upgrade_info_str = 'Doing first pull of %s image' % (target_image)
        try:
            target_id, target_version, target_digests = CephadmServe(
                self.mgr)._get_container_image_info(target_image)
        except OrchestratorError as e:
            self._fail_upgrade('UPGRADE_FAILED_PULL', {
                'severity': 'warning',
                'summary': 'Upgrade: failed to pull target image',
                'count': 1,
                'detail': [str(e)],
            })
            return
        if not target_version:
            self._fail_upgrade('UPGRADE_FAILED_PULL', {
                'severity': 'warning',
                'summary': 'Upgrade: failed to pull target image',
                'count': 1,
                'detail': ['unable to extract ceph version from container'],
            })
            return
        self.upgrade_state.target_id = target_id
        # extract the version portion of 'ceph version {version} ({sha1})'
        self.upgrade_state.target_version = target_version.split(' ')[2]
        self.upgrade_state.target_digests = target_digests
        self._save_upgrade_state()
        target_image = self.target_image
        first = True

    if target_digests is None:
        target_digests = []
    if target_version.startswith('ceph version '):
        # tolerate/fix upgrade state from older version
        self.upgrade_state.target_version = target_version.split(' ')[2]
        target_version = self.upgrade_state.target_version
    target_major, target_minor, target_patch = target_version.split('.')
    target_major_name = self.mgr.lookup_release_name(int(target_major))

    if first:
        logger.info('Upgrade: Target is version %s (%s)' % (
            target_version, target_major_name))
        logger.info('Upgrade: Target container is %s, digests %s' % (
            target_image, target_digests))

    version_error = self._check_target_version(target_version)
    if version_error:
        self._fail_upgrade('UPGRADE_BAD_TARGET_VERSION', {
            'severity': 'error',
            'summary': f'Upgrade: cannot upgrade/downgrade to {target_version}',
            'count': 1,
            'detail': [version_error],
        })
        return

    image_settings = self.get_distinct_container_image_settings()

    daemons = [d for d in self.mgr.cache.get_daemons()
               if d.daemon_type in CEPH_UPGRADE_ORDER]
    done = 0
    for daemon_type in CEPH_UPGRADE_ORDER:
        logger.debug('Upgrade: Checking %s daemons' % daemon_type)

        need_upgrade_self = False
        need_upgrade: List[Tuple[DaemonDescription, bool]] = []
        need_upgrade_deployer: List[Tuple[DaemonDescription, bool]] = []
        for d in daemons:
            if d.daemon_type != daemon_type:
                continue
            assert d.daemon_type is not None
            assert d.daemon_id is not None
            correct_digest = False
            if (any(d in target_digests for d in (d.container_image_digests or []))
                    or d.daemon_type in MONITORING_STACK_TYPES):
                logger.debug('daemon %s.%s container digest correct' % (
                    daemon_type, d.daemon_id))
                correct_digest = True
                if any(d in target_digests for d in (d.deployed_by or [])):
                    logger.debug('daemon %s.%s deployed by correct version' % (
                        d.daemon_type, d.daemon_id))
                    done += 1
                    continue

            if self.mgr.daemon_is_self(d.daemon_type, d.daemon_id):
                logger.info('Upgrade: Need to upgrade myself (mgr.%s)' %
                            self.mgr.get_mgr_id())
                need_upgrade_self = True
                continue

            if correct_digest:
                logger.debug('daemon %s.%s not deployed by correct version' % (
                    d.daemon_type, d.daemon_id))
                need_upgrade_deployer.append((d, True))
            else:
                logger.debug('daemon %s.%s not correct (%s, %s, %s)' % (
                    daemon_type, d.daemon_id,
                    d.container_image_name, d.container_image_digests, d.version))
                need_upgrade.append((d, False))

        if not need_upgrade_self:
            # only after the mgr itself is upgraded can we expect daemons to have
            # deployed_by == target_digests
            need_upgrade += need_upgrade_deployer

        # prepare filesystems for daemon upgrades?
        if (daemon_type == 'mds'
                and need_upgrade
                and not self._prepare_for_mds_upgrade(
                    target_major, [d_entry[0] for d_entry in need_upgrade])):
            return

        if need_upgrade:
            self.upgrade_info_str = 'Currently upgrading %s daemons' % (daemon_type)

        to_upgrade: List[Tuple[DaemonDescription, bool]] = []
        known_ok_to_stop: List[str] = []
        for d_entry in need_upgrade:
            d = d_entry[0]
            assert d.daemon_type is not None
            assert d.daemon_id is not None
            assert d.hostname is not None

            if not d.container_image_id:
                if d.container_image_name == target_image:
                    logger.debug('daemon %s has unknown container_image_id '
                                 'but has correct image name' % (d.name()))
                    continue

            if known_ok_to_stop:
                if d.name() in known_ok_to_stop:
                    logger.info(f'Upgrade: {d.name()} is also safe to restart')
                    to_upgrade.append(d_entry)
                continue

            if d.daemon_type in ['mon', 'osd', 'mds']:
                # NOTE: known_ok_to_stop is an output argument for
                # _wait_for_ok_to_stop
                if not self._wait_for_ok_to_stop(d, known_ok_to_stop):
                    return

            to_upgrade.append(d_entry)

            # if we don't have a list of others to consider, stop now
            if not known_ok_to_stop:
                break

        num = 1
        for d_entry in to_upgrade:
            d = d_entry[0]
            assert d.daemon_type is not None
            assert d.daemon_id is not None
            assert d.hostname is not None

            self._update_upgrade_progress(done / len(daemons))

            # make sure host has latest container image
            out, errs, code = CephadmServe(self.mgr)._run_cephadm(
                d.hostname, '', 'inspect-image', [],
                image=target_image, no_fsid=True, error_ok=True)
            if code or not any(d in target_digests
                               for d in json.loads(''.join(out)).get('repo_digests', [])):
                logger.info('Upgrade: Pulling %s on %s' % (target_image, d.hostname))
                self.upgrade_info_str = 'Pulling %s image on host %s' % (
                    target_image, d.hostname)
                out, errs, code = CephadmServe(self.mgr)._run_cephadm(
                    d.hostname, '', 'pull', [],
                    image=target_image, no_fsid=True, error_ok=True)
                if code:
                    self._fail_upgrade('UPGRADE_FAILED_PULL', {
                        'severity': 'warning',
                        'summary': 'Upgrade: failed to pull target image',
                        'count': 1,
                        'detail': ['failed to pull %s on host %s' % (
                            target_image, d.hostname)],
                    })
                    return
                r = json.loads(''.join(out))
                if not any(d in target_digests for d in r.get('repo_digests', [])):
                    logger.info('Upgrade: image %s pull on %s got new digests %s '
                                '(not %s), restarting' % (
                                    target_image, d.hostname,
                                    r['repo_digests'], target_digests))
                    self.upgrade_info_str = 'Image %s pull on %s got new digests %s (not %s), restarting' % (
                        target_image, d.hostname, r['repo_digests'], target_digests)
                    self.upgrade_state.target_digests = r['repo_digests']
                    self._save_upgrade_state()
                    return

                self.upgrade_info_str = 'Currently upgrading %s daemons' % (daemon_type)

            if len(to_upgrade) > 1:
                logger.info('Upgrade: Updating %s.%s (%d/%d)' % (
                    d.daemon_type, d.daemon_id, num, len(to_upgrade)))
            else:
                logger.info('Upgrade: Updating %s.%s' % (
                    d.daemon_type, d.daemon_id))
            action = 'Upgrading' if not d_entry[1] else 'Redeploying'
            try:
                daemon_spec = CephadmDaemonDeploySpec.from_daemon_description(d)
                self.mgr._daemon_action(
                    daemon_spec,
                    'redeploy',
                    image=target_image if not d_entry[1] else None)
            except Exception as e:
                self._fail_upgrade('UPGRADE_REDEPLOY_DAEMON', {
                    'severity': 'warning',
                    'summary': f'{action} daemon {d.name()} on host {d.hostname} failed.',
                    'count': 1,
                    'detail': [f'Upgrade daemon: {d.name()}: {e}'],
                })
                return
            num += 1

        if to_upgrade:
            return

        # complete mon upgrade?
        if daemon_type == 'mon':
            if not self.mgr.get("have_local_config_map"):
                logger.info('Upgrade: Restarting mgr now that mons are running pacific')
                need_upgrade_self = True

        if need_upgrade_self:
            try:
                self.mgr.mgr_service.fail_over()
            except OrchestratorError as e:
                self._fail_upgrade('UPGRADE_NO_STANDBY_MGR', {
                    'severity': 'warning',
                    'summary': f'Upgrade: {e}',
                    'count': 1,
                    'detail': [
                        'The upgrade process needs to upgrade the mgr, '
                        'but it needs at least one standby to proceed.',
                    ],
                })
                return

            return  # unreachable code, as fail_over never returns
        elif daemon_type == 'mgr':
            if 'UPGRADE_NO_STANDBY_MGR' in self.mgr.health_checks:
                del self.mgr.health_checks['UPGRADE_NO_STANDBY_MGR']
                self.mgr.set_health_checks(self.mgr.health_checks)

        # make sure 'ceph versions' agrees
        ret, out_ver, err = self.mgr.check_mon_command({
            'prefix': 'versions',
        })
        j = json.loads(out_ver)
        for version, count in j.get(daemon_type, {}).items():
            short_version = version.split(' ')[2]
            if short_version != target_version:
                logger.warning(
                    'Upgrade: %d %s daemon(s) are %s != target %s' % (
                        count, daemon_type, short_version, target_version))

        # push down configs
        daemon_type_section = name_to_config_section(daemon_type)
        if image_settings.get(daemon_type_section) != target_image:
            logger.info('Upgrade: Setting container_image for all %s' % daemon_type)
            self.mgr.set_container_image(daemon_type_section, target_image)
        to_clean = []
        for section in image_settings.keys():
            if section.startswith(name_to_config_section(daemon_type) + '.'):
                to_clean.append(section)
        if to_clean:
            logger.debug('Upgrade: Cleaning up container_image for %s' % to_clean)
            for section in to_clean:
                ret, image, err = self.mgr.check_mon_command({
                    'prefix': 'config rm',
                    'name': 'container_image',
                    'who': section,
                })

        logger.debug('Upgrade: All %s daemons are up to date.' % daemon_type)

        # complete osd upgrade?
        if daemon_type == 'osd':
            osdmap = self.mgr.get("osd_map")
            osd_min_name = osdmap.get("require_osd_release", "argonaut")
            osd_min = ceph_release_to_major(osd_min_name)
            if osd_min < int(target_major):
                logger.info(
                    f'Upgrade: Setting require_osd_release to {target_major} {target_major_name}')
                ret, _, err = self.mgr.check_mon_command({
                    'prefix': 'osd require-osd-release',
                    'release': target_major_name,
                })

        # complete mds upgrade?
        if daemon_type == 'mds' and self.upgrade_state.fs_original_max_mds:
            for i in self.mgr.get("fs_map")['filesystems']:
                fs_id = i["id"]
                fs_name = i['mdsmap']['fs_name']
                new_max = self.upgrade_state.fs_original_max_mds.get(fs_id)
                if new_max:
                    self.mgr.log.info('Upgrade: Scaling up filesystem %s max_mds to %d' % (
                        fs_name, new_max))
                    ret, _, err = self.mgr.check_mon_command({
                        'prefix': 'fs set',
                        'fs_name': fs_name,
                        'var': 'max_mds',
                        'val': str(new_max),
                    })
            self.upgrade_state.fs_original_max_mds = {}
            self._save_upgrade_state()

    # clean up
    logger.info('Upgrade: Finalizing container_image settings')
    self.mgr.set_container_image('global', target_image)

    for daemon_type in CEPH_UPGRADE_ORDER:
        ret, image, err = self.mgr.check_mon_command({
            'prefix': 'config rm',
            'name': 'container_image',
            'who': name_to_config_section(daemon_type),
        })

    logger.info('Upgrade: Complete!')
    if self.upgrade_state.progress_id:
        self.mgr.remote('progress', 'complete',
                        self.upgrade_state.progress_id)
    self.upgrade_state = None
    self._save_upgrade_state()
    return

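# Minimal self-contained sketch of the digest test used throughout
# _do_upgrade() above: a daemon counts as already upgraded when any of its
# container image digests appears in the upgrade target digests. The helper
# name is hypothetical; the values below are made up.
from typing import List, Optional

def digests_match(daemon_digests: Optional[List[str]],
                  target_digests: Optional[List[str]]) -> bool:
    # both lists may be None while metadata is still being gathered
    return any(d in (target_digests or []) for d in (daemon_digests or []))

assert digests_match(['sha256:abc'], ['sha256:abc', 'sha256:def'])
assert not digests_match(None, ['sha256:abc'])
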
def process_removal_queue(self) -> None:
    """
    Performs actions in the _serve() loop to remove an OSD
    when the removal criteria are met.

    We can't hold self.lock, as we're calling _remove_daemon in the loop.
    """

    # make sure that we don't run on OSDs that are not in the cluster anymore.
    self.cleanup()

    # find osds that are ok-to-stop and not yet draining
    ready_to_drain_osds = self._ready_to_drain_osds()
    if ready_to_drain_osds:
        # start draining those
        _ = [osd.start_draining() for osd in ready_to_drain_osds]

    all_osds = self.all_osds()

    logger.debug(f"{self.queue_size()} OSDs are scheduled "
                 f"for removal: {all_osds}")

    # Check all osds for their state and take action (remove, purge etc)
    new_queue: Set[OSD] = set()
    for osd in all_osds:  # type: OSD
        if not osd.force:
            # skip criteria
            if not osd.is_empty:
                logger.debug(f"{osd} is not empty yet. Waiting a bit more")
                new_queue.add(osd)
                continue

            if not osd.safe_to_destroy():
                logger.debug(f"{osd} is not safe-to-destroy yet. Waiting a bit more")
                new_queue.add(osd)
                continue

        # abort criteria
        if not osd.down():
            # also remove it from the remove_osd list and set a health_check warning?
            raise orchestrator.OrchestratorError(f"Could not mark {osd} down")

        # stop and remove daemon
        assert osd.hostname is not None

        if self.mgr.cache.has_daemon(f'osd.{osd.osd_id}'):
            CephadmServe(self.mgr)._remove_daemon(f'osd.{osd.osd_id}', osd.hostname)
            logger.info(f"Successfully removed {osd} on {osd.hostname}")
        else:
            logger.info(f"Daemon {osd} on {osd.hostname} was already removed")

        if osd.replace:
            # mark destroyed in osdmap
            if not osd.destroy():
                raise orchestrator.OrchestratorError(f"Could not destroy {osd}")
            logger.info(f"Successfully destroyed old {osd} on {osd.hostname}; "
                        f"ready for replacement")
        else:
            # purge from osdmap
            if not osd.purge():
                raise orchestrator.OrchestratorError(f"Could not purge {osd}")
            logger.info(f"Successfully purged {osd} on {osd.hostname}")

        if osd.zap:
            # throws an exception if the zap fails
            logger.info(f"Zapping devices for {osd} on {osd.hostname}")
            osd.do_zap()
            logger.info(f"Successfully zapped devices for {osd} on {osd.hostname}")

        logger.debug(f"Removing {osd} from the queue.")

    # self could change while this is processing (osds get added from the CLI)
    # The new set is: 'an intersection of all osds that are still not empty/removed
    # (new_queue) and osds that were added while this method was executed'
    with self.lock:
        self.osds.intersection_update(new_queue)
        self._save_to_store()

def create_single_host(self, drive_group: DriveGroupSpec, host: str,
                       cmd: str, replace_osd_ids: List[str],
                       env_vars: Optional[List[str]] = None) -> str:
    out, err, code = self._run_ceph_volume_command(host, cmd, env_vars=env_vars)

    if code == 1 and ', it is already prepared' in '\n'.join(err):
        # HACK: when we create against an existing LV, ceph-volume
        # returns an error and the above message. To make this
        # command idempotent, tolerate this "error" and continue.
        logger.debug('the device was already prepared; continuing')
        code = 0
    if code:
        raise RuntimeError(
            'cephadm exited with an error code: %d, stderr:%s' % (
                code, '\n'.join(err)))

    # check result
    out, err, code = CephadmServe(self.mgr)._run_cephadm(
        host, 'osd', 'ceph-volume',
        [
            '--',
            'lvm', 'list',
            '--format', 'json',
        ])
    before_osd_uuid_map = self.mgr.get_osd_uuid_map(only_up=True)
    try:
        osds_elems = json.loads('\n'.join(out))
    except ValueError:
        logger.exception('Cannot decode JSON: \'%s\'' % '\n'.join(out))
        osds_elems = {}
    fsid = self.mgr._cluster_fsid
    osd_uuid_map = self.mgr.get_osd_uuid_map()
    created = []
    for osd_id, osds in osds_elems.items():
        for osd in osds:
            if osd['tags']['ceph.cluster_fsid'] != fsid:
                logger.debug('mismatched fsid, skipping %s' % osd)
                continue
            if osd_id in before_osd_uuid_map and osd_id not in replace_osd_ids:
                # this OSD already existed before this call and is not part of
                # the replacement operation, so skip it
                continue
            if osd_id not in osd_uuid_map:
                logger.debug('osd id {} does not exist in cluster'.format(osd_id))
                continue
            if osd_uuid_map.get(osd_id) != osd['tags']['ceph.osd_fsid']:
                logger.debug('mismatched osd uuid (cluster has %s, osd '
                             'has %s)' % (osd_uuid_map.get(osd_id),
                                          osd['tags']['ceph.osd_fsid']))
                continue

            created.append(osd_id)
            daemon_spec: CephadmDaemonSpec = CephadmDaemonSpec(
                daemon_id=osd_id,
                host=host,
                daemon_type='osd',
            )
            daemon_spec.final_config, daemon_spec.deps = self.generate_config(daemon_spec)
            CephadmServe(self.mgr)._create_daemon(
                daemon_spec,
                osd_uuid_map=osd_uuid_map)

    if created:
        self.mgr.cache.invalidate_host_devices(host)
        return "Created osd(s) %s on host '%s'" % (','.join(created), host)
    else:
        return "Created no osd(s) on host %s; already created?" % host
