Example #1
    def test_daemon_ok_to_stop(self, ok_to_stop,
                               cephadm_module: CephadmOrchestrator):
        spec = ServiceSpec('mds',
                           service_id='fsname',
                           placement=PlacementSpec(hosts=['host1', 'host2']))
        with with_host(cephadm_module,
                       'host1'), with_host(cephadm_module, 'host2'):
            c = cephadm_module.apply_mds(spec)
            out = wait(cephadm_module, c)
            match_glob(out, "Scheduled mds.fsname update...")
            CephadmServe(cephadm_module)._apply_all_services()

            [daemon] = cephadm_module.cache.daemons['host1'].keys()

            spec.placement.set_hosts(['host2'])

            ok_to_stop.side_effect = False

            c = cephadm_module.apply_mds(spec)
            out = wait(cephadm_module, c)
            match_glob(out, "Scheduled mds.fsname update...")
            CephadmServe(cephadm_module)._apply_all_services()

            ok_to_stop.assert_called_with([daemon[4:]])

            assert_rm_daemon(cephadm_module, spec.service_name(),
                             'host1')  # verifies ok-to-stop
            assert_rm_daemon(cephadm_module, spec.service_name(), 'host2')
Example #2
    def deploy_osd_daemons_for_existing_osds(
            self,
            host: str,
            service_name: str,
            replace_osd_ids: Optional[List[str]] = None) -> str:

        if replace_osd_ids is None:
            replace_osd_ids = OsdIdClaims(self.mgr).filtered_by_host(host)
            assert replace_osd_ids is not None
        # check result
        osds_elems: dict = CephadmServe(self.mgr)._run_cephadm_json(
            host, 'osd', 'ceph-volume', [
                '--',
                'lvm',
                'list',
                '--format',
                'json',
            ])
        before_osd_uuid_map = self.mgr.get_osd_uuid_map(only_up=True)
        fsid = self.mgr._cluster_fsid
        osd_uuid_map = self.mgr.get_osd_uuid_map()
        created = []
        for osd_id, osds in osds_elems.items():
            for osd in osds:
                if osd['type'] == 'db':
                    continue
                if osd['tags']['ceph.cluster_fsid'] != fsid:
                    logger.debug('mismatched fsid, skipping %s' % osd)
                    continue
                if osd_id in before_osd_uuid_map and osd_id not in replace_osd_ids:
                    # if it exists but is part of the replacement operation, don't skip
                    continue
                if osd_id not in osd_uuid_map:
                    logger.debug(
                        'osd id {} does not exist in cluster'.format(osd_id))
                    continue
                if osd_uuid_map.get(osd_id) != osd['tags']['ceph.osd_fsid']:
                    logger.debug('mismatched osd uuid (cluster has %s, osd '
                                 'has %s)' % (osd_uuid_map.get(osd_id),
                                              osd['tags']['ceph.osd_fsid']))
                    continue

                created.append(osd_id)
                daemon_spec: CephadmDaemonDeploySpec = CephadmDaemonDeploySpec(
                    service_name=service_name,
                    daemon_id=osd_id,
                    host=host,
                    daemon_type='osd',
                )
                daemon_spec.final_config, daemon_spec.deps = self.generate_config(
                    daemon_spec)
                CephadmServe(self.mgr)._create_daemon(
                    daemon_spec, osd_uuid_map=osd_uuid_map)

        if created:
            self.mgr.cache.invalidate_host_devices(host)
            self.mgr.cache.invalidate_autotune(host)
            return "Created osd(s) %s on host '%s'" % (','.join(created), host)
        else:
            return "Created no osd(s) on host %s; already created?" % host
Example #3
def assert_rm_service(cephadm: CephadmOrchestrator, srv_name):
    mon_or_mgr = cephadm.spec_store[srv_name].spec.service_type in ('mon', 'mgr')
    if mon_or_mgr:
        assert 'Unable' in wait(cephadm, cephadm.remove_service(srv_name))
        return
    assert wait(cephadm, cephadm.remove_service(srv_name)) == f'Removed service {srv_name}'
    assert cephadm.spec_store[srv_name].deleted is not None
    CephadmServe(cephadm)._check_daemons()
    CephadmServe(cephadm)._apply_all_services()
    assert cephadm.spec_store[srv_name].deleted
    unmanaged = cephadm.spec_store[srv_name].spec.unmanaged
    CephadmServe(cephadm)._purge_deleted_services()
    if not unmanaged:  # cause then we're not deleting daemons
        assert srv_name not in cephadm.spec_store, f'{cephadm.spec_store[srv_name]!r}'
Example #4
    def test_etc_ceph(self, _check, _get_connection, cephadm_module):
        _get_connection.return_value = mock.Mock(), mock.Mock()
        _check.return_value = '{}', '', 0

        assert cephadm_module.manage_etc_ceph_ceph_conf is False

        with with_host(cephadm_module, 'test'):
            assert not cephadm_module.cache.host_needs_new_etc_ceph_ceph_conf(
                'test')

        with with_host(cephadm_module, 'test'):
            cephadm_module.set_module_option('manage_etc_ceph_ceph_conf', True)
            cephadm_module.config_notify()
            assert cephadm_module.manage_etc_ceph_ceph_conf == True

            CephadmServe(cephadm_module)._refresh_hosts_and_daemons()
            _check.assert_called_with(ANY, ['dd', 'of=/etc/ceph/ceph.conf'],
                                      stdin=b'')

            assert not cephadm_module.cache.host_needs_new_etc_ceph_ceph_conf(
                'test')

            # set extra config and expect that we deploy another ceph.conf
            cephadm_module._set_extra_ceph_conf('[mon]\nk=v')
            CephadmServe(cephadm_module)._refresh_hosts_and_daemons()
            _check.assert_called_with(ANY, ['dd', 'of=/etc/ceph/ceph.conf'],
                                      stdin=b'\n\n[mon]\nk=v\n')

            # reload
            cephadm_module.cache.last_etc_ceph_ceph_conf = {}
            cephadm_module.cache.load()

            assert not cephadm_module.cache.host_needs_new_etc_ceph_ceph_conf(
                'test')

            # Make sure _check_daemons does a redeploy due to monmap change:
            cephadm_module.mock_store_set(
                '_ceph_get', 'mon_map', {
                    'modified': datetime.datetime.utcnow().strftime(CEPH_DATEFMT),
                    'fsid': 'foobar',
                })
            cephadm_module.notify('mon_map', mock.MagicMock())
            assert cephadm_module.cache.host_needs_new_etc_ceph_ceph_conf(
                'test')
            cephadm_module.cache.last_etc_ceph_ceph_conf = {}
            cephadm_module.cache.load()
            assert cephadm_module.cache.host_needs_new_etc_ceph_ceph_conf(
                'test')
Example #5
    def test_monitoring_ports(self, _run_cephadm, cephadm_module: CephadmOrchestrator):
        _run_cephadm.side_effect = async_side_effect(('{}', '', 0))

        with with_host(cephadm_module, 'test'):

            yaml_str = """service_type: alertmanager
service_name: alertmanager
placement:
    count: 1
spec:
    port: 4200
"""
            yaml_file = yaml.safe_load(yaml_str)
            spec = ServiceSpec.from_json(yaml_file)

            with patch("cephadm.services.monitoring.AlertmanagerService.generate_config", return_value=({}, [])):
                with with_service(cephadm_module, spec):

                    CephadmServe(cephadm_module)._check_daemons()

                    _run_cephadm.assert_called_with(
                        'test', 'alertmanager.test', 'deploy', [
                            '--name', 'alertmanager.test',
                            '--meta-json', '{"service_name": "alertmanager", "ports": [4200, 9094], "ip": null, "deployed_by": [], "rank": null, "rank_generation": null}',
                            '--config-json', '-',
                            '--tcp-ports', '4200 9094',
                            '--reconfig'
                        ],
                        stdin='{}',
                        image='')
Example #6
    def test_daemon_check_extra_config(self, _run_cephadm,
                                       cephadm_module: CephadmOrchestrator):
        _run_cephadm.return_value = ('{}', '', 0)

        with with_host(cephadm_module, 'test'):

            # Also testing deploying mons without explicit network placement
            cephadm_module.check_mon_command({
                'prefix': 'config set',
                'who': 'mon',
                'name': 'public_network',
                'value': '127.0.0.0/8'
            })

            cephadm_module.cache.update_host_devices_networks(
                'test', [], {
                    "127.0.0.0/8": ["127.0.0.1"],
                })

            with with_service(cephadm_module, ServiceSpec(service_type='mon'),
                              CephadmOrchestrator.apply_mon,
                              'test') as d_names:
                [daemon_name] = d_names

                cephadm_module._set_extra_ceph_conf('[mon]\nk=v')

                CephadmServe(cephadm_module)._check_daemons()

                _run_cephadm.assert_called_with(
                    'test',
                    'mon.test',
                    'deploy',
                    ['--name', 'mon.test', '--reconfig', '--config-json', '-'],
                    stdin='{"config": "\\n\\n[mon]\\nk=v\\n", "keyring": ""}',
                    image='')
Example #7
@contextmanager
def with_service(cephadm_module: CephadmOrchestrator,
                 spec: ServiceSpec,
                 meth=None,
                 host: str = '',
                 status_running=False) -> Iterator[List[str]]:
    if spec.placement.is_empty() and host:
        spec.placement = PlacementSpec(hosts=[host], count=1)
    if meth is not None:
        c = meth(cephadm_module, spec)
        assert wait(cephadm_module,
                    c) == f'Scheduled {spec.service_name()} update...'
    else:
        c = cephadm_module.apply([spec])
        assert wait(cephadm_module,
                    c) == [f'Scheduled {spec.service_name()} update...']

    specs = [
        d.spec for d in wait(cephadm_module, cephadm_module.describe_service())
    ]
    assert spec in specs

    CephadmServe(cephadm_module)._apply_all_services()

    if status_running:
        make_daemons_running(cephadm_module, spec.service_name())

    dds = wait(cephadm_module, cephadm_module.list_daemons())
    own_dds = [dd for dd in dds if dd.service_name() == spec.service_name()]
    if host and spec.service_type != 'osd':
        assert own_dds

    yield [dd.name() for dd in own_dds]

    assert_rm_service(cephadm_module, spec.service_name())
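A minimal usage sketch of the with_service helper above, assuming the same cephadm_module fixture, the with_host helper, and the imports used throughout these examples; the expected grafana daemon name is an illustrative assumption.

def test_with_service_sketch(cephadm_module: CephadmOrchestrator):
    # sketch only: deploy one grafana daemon on 'test'; with_service removes it again on exit
    with with_host(cephadm_module, 'test'):
        with with_service(cephadm_module, ServiceSpec(service_type='grafana'),
                          CephadmOrchestrator.apply_grafana, 'test') as d_names:
            # with_service yields the names of the daemons it created
            assert d_names == ['grafana.test']  # assumed daemon name on host 'test'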
Example #8
    def test_mgr_update(self, cephadm_module):
        with with_host(cephadm_module, 'test'):
            ps = PlacementSpec(hosts=['test:0.0.0.0=a'], count=1)
            r = CephadmServe(cephadm_module)._apply_service(ServiceSpec('mgr', placement=ps))
            assert r

            assert_rm_daemon(cephadm_module, 'mgr.a', 'test')
Example #9
    def _run_ceph_volume_command(self,
                                 host: str,
                                 cmd: str,
                                 env_vars: Optional[List[str]] = None
                                 ) -> Tuple[List[str], List[str], int]:
        self.mgr.inventory.assert_host(host)

        # get bootstrap key
        ret, keyring, err = self.mgr.check_mon_command({
            'prefix': 'auth get',
            'entity': 'client.bootstrap-osd',
        })

        j = json.dumps({
            'config': self.mgr.get_minimal_ceph_conf(),
            'keyring': keyring,
        })

        split_cmd = cmd.split(' ')
        _cmd = ['--config-json', '-', '--']
        _cmd.extend(split_cmd)
        out, err, code = CephadmServe(self.mgr)._run_cephadm(host,
                                                             'osd',
                                                             'ceph-volume',
                                                             _cmd,
                                                             env_vars=env_vars,
                                                             stdin=j,
                                                             error_ok=True)
        return out, err, code
Example #10
    def test_ceph_volume_no_filter_for_batch(self, _run_cephadm, cephadm_module: CephadmOrchestrator):
        _run_cephadm.return_value = ('{}', '', 0)

        error_message = """cephadm exited with an error code: 1, stderr:/usr/bin/podman:stderr usage: ceph-volume inventory [-h] [--format {plain,json,json-pretty}] [path]/usr/bin/podman:stderr ceph-volume inventory: error: unrecognized arguments: --filter-for-batch
Traceback (most recent call last):
  File "<stdin>", line 6112, in <module>
  File "<stdin>", line 1299, in _infer_fsid
  File "<stdin>", line 1382, in _infer_image
  File "<stdin>", line 3612, in command_ceph_volume
  File "<stdin>", line 1061, in call_throws"""

        with with_host(cephadm_module, 'test'):
            _run_cephadm.reset_mock()
            _run_cephadm.side_effect = OrchestratorError(error_message)

            s = CephadmServe(cephadm_module)._refresh_host_devices('test')
            assert s == 'host test `cephadm ceph-volume` failed: ' + error_message

            assert _run_cephadm.mock_calls == [
                mock.call('test', 'osd', 'ceph-volume',
                          ['--', 'inventory', '--format=json', '--filter-for-batch'], image='',
                          no_fsid=False),
                mock.call('test', 'osd', 'ceph-volume',
                          ['--', 'inventory', '--format=json'], image='',
                          no_fsid=False),
            ]
Example #11
    def test_remove_osds(self, cephadm_module):
        with with_host(cephadm_module, 'test'):
            CephadmServe(cephadm_module)._refresh_host_daemons('test')
            c = cephadm_module.list_daemons()
            wait(cephadm_module, c)

            c = cephadm_module.remove_daemons(['osd.0'])
            out = wait(cephadm_module, c)
            assert out == ["Removed osd.0 from host 'test'"]

            cephadm_module.to_remove_osds.enqueue(
                OSD(osd_id=0,
                    replace=False,
                    force=False,
                    hostname='test',
                    fullname='osd.0',
                    process_started_at=datetime_now(),
                    remove_util=cephadm_module.to_remove_osds.rm_util))
            cephadm_module.to_remove_osds.process_removal_queue()
            assert cephadm_module.to_remove_osds == OSDRemovalQueue(
                cephadm_module)

            c = cephadm_module.remove_osds_status()
            out = wait(cephadm_module, c)
            assert out == []
Example #12
    def test_offline(self, check_execute_command, execute_command,
                     cephadm_module):
        check_execute_command.return_value = ''
        execute_command.return_value = '', '', 0

        if not AsyncMock:
            # can't run this test if we could not import AsyncMock
            return
        mock_connect = AsyncMock(return_value='')
        with mock.patch("asyncssh.connect",
                        new=mock_connect) as asyncssh_connect:
            with with_host(cephadm_module, 'test'):
                asyncssh_connect.side_effect = ConnectionLost('reason')
                code, out, err = cephadm_module.check_host('test')
                assert out == ''
                assert "Host 'test' not found" in err

                out = wait(cephadm_module,
                           cephadm_module.get_hosts())[0].to_json()
                assert out == HostSpec('test', '1::4',
                                       status='Offline').to_json()

                asyncssh_connect.return_value = mock.MagicMock()
                asyncssh_connect.side_effect = None
                assert CephadmServe(cephadm_module)._check_host('test') is None
                out = wait(cephadm_module,
                           cephadm_module.get_hosts())[0].to_json()
                assert out == HostSpec('test', '1::4').to_json()
Example #13
    def test_daemon_action_fail(self, cephadm_module: CephadmOrchestrator):
        cephadm_module.service_cache_timeout = 10
        with with_host(cephadm_module, 'test'):
            with with_daemon(cephadm_module,
                             RGWSpec(service_id='myrgw.foobar'),
                             CephadmOrchestrator.add_rgw, 'test') as daemon_id:
                with mock.patch('ceph_module.BaseMgrModule._ceph_send_command'
                                ) as _ceph_send_command:

                    _ceph_send_command.side_effect = Exception("myerror")

                    # Make sure _check_daemons does a redeploy due to monmap change:
                    cephadm_module.mock_store_set(
                        '_ceph_get', 'mon_map', {
                            'modified': datetime.datetime.utcnow().strftime(CEPH_DATEFMT),
                            'fsid': 'foobar',
                        })
                    cephadm_module.notify('mon_map', None)

                    CephadmServe(cephadm_module)._check_daemons()

                    evs = [
                        e.message
                        for e in cephadm_module.events.get_for_daemon(
                            f'rgw.{daemon_id}')
                    ]

                    assert 'myerror' in ''.join(evs)
Example #14
    def test_daemon_action(self, cephadm_module: CephadmOrchestrator):
        cephadm_module.service_cache_timeout = 10
        with with_host(cephadm_module, 'test'):
            with with_daemon(cephadm_module,
                             RGWSpec(service_id='myrgw.foobar'),
                             CephadmOrchestrator.add_rgw, 'test') as daemon_id:

                c = cephadm_module.daemon_action('redeploy',
                                                 'rgw.' + daemon_id)
                assert wait(
                    cephadm_module, c
                ) == f"Scheduled to redeploy rgw.{daemon_id} on host 'test'"

                for what in ('start', 'stop', 'restart'):
                    c = cephadm_module.daemon_action(what, 'rgw.' + daemon_id)
                    assert wait(
                        cephadm_module, c
                    ) == F"Scheduled to {what} rgw.{daemon_id} on host 'test'"

                # Make sure _check_daemons does a redeploy due to monmap change:
                cephadm_module._store['_ceph_get/mon_map'] = {
                    'modified': datetime.datetime.utcnow().strftime(CEPH_DATEFMT),
                    'fsid': 'foobar',
                }
                cephadm_module.notify('mon_map', None)

                CephadmServe(cephadm_module)._check_daemons()
Example #15
@contextmanager
def with_host(m: CephadmOrchestrator, name, refresh_hosts=True):
    # type: (CephadmOrchestrator, str) -> None
    wait(m, m.add_host(HostSpec(hostname=name)))
    if refresh_hosts:
        CephadmServe(m)._refresh_hosts_and_daemons()
    yield
    wait(m, m.remove_host(name))
Example #16
    def test_daemon_check_post(self, cephadm_module: CephadmOrchestrator):
        with with_host(cephadm_module, 'test'):
            with with_service(cephadm_module,
                              ServiceSpec(service_type='grafana'),
                              CephadmOrchestrator.apply_grafana, 'test'):

                # Make sure _check_daemons does a redeploy due to monmap change:
                cephadm_module.mock_store_set(
                    '_ceph_get', 'mon_map', {
                        'modified': datetime.datetime.utcnow().strftime(CEPH_DATEFMT),
                        'fsid': 'foobar',
                    })
                cephadm_module.notify('mon_map', None)
                cephadm_module.mock_store_set('_ceph_get', 'mgr_map',
                                              {'modules': ['dashboard']})

                with mock.patch(
                        "cephadm.module.CephadmOrchestrator.mon_command"
                ) as _mon_cmd:
                    CephadmServe(cephadm_module)._check_daemons()
                    _mon_cmd.assert_any_call({
                        'prefix': 'dashboard set-grafana-api-url',
                        'value': 'https://test:3000'
                    })
Example #17
    def test_remove_daemon(self, cephadm_module):
        with with_host(cephadm_module, 'test'):
            CephadmServe(cephadm_module)._refresh_host_daemons('test')
            c = cephadm_module.list_daemons()
            wait(cephadm_module, c)
            c = cephadm_module.remove_daemons(['rgw.myrgw.myhost.myid'])
            out = wait(cephadm_module, c)
            assert out == ["Removed rgw.myrgw.myhost.myid from host 'test'"]
Example #18
def test_migrate_scheduler(cephadm_module: CephadmOrchestrator):
    with with_host(cephadm_module, 'host1', refresh_hosts=False):
        with with_host(cephadm_module, 'host2', refresh_hosts=False):

            # emulate the old scheduler:
            c = cephadm_module.apply_rgw(
                ServiceSpec('rgw', 'r.z', placement=PlacementSpec(host_pattern='*', count=2))
            )
            assert wait(cephadm_module, c) == 'Scheduled rgw.r.z update...'

            # with pytest.raises(OrchestratorError, match="cephadm migration still ongoing. Please wait, until the migration is complete."):
            CephadmServe(cephadm_module)._apply_all_services()

            cephadm_module.migration_current = 0
            cephadm_module.migration.migrate()
            # assert we need all daemons.
            assert cephadm_module.migration_current == 0

            CephadmServe(cephadm_module)._refresh_hosts_and_daemons()
            receive_agent_metadata_all_hosts(cephadm_module)
            cephadm_module.migration.migrate()

            CephadmServe(cephadm_module)._apply_all_services()

            out = {o.hostname for o in wait(cephadm_module, cephadm_module.list_daemons())}
            assert out == {'host1', 'host2'}

            c = cephadm_module.apply_rgw(
                ServiceSpec('rgw', 'r.z', placement=PlacementSpec(host_pattern='host1', count=2))
            )
            assert wait(cephadm_module, c) == 'Scheduled rgw.r.z update...'

            # Sorry for this hack, but I need to make sure Migration thinks
            # we have already updated all daemons.
            cephadm_module.cache.last_daemon_update['host1'] = datetime_now()
            cephadm_module.cache.last_daemon_update['host2'] = datetime_now()

            cephadm_module.migration_current = 0
            cephadm_module.migration.migrate()
            assert cephadm_module.migration_current >= 2

            out = [o.spec.placement for o in wait(
                cephadm_module, cephadm_module.describe_service())]
            assert out == [PlacementSpec(count=2, hosts=[HostPlacementSpec(
                hostname='host1', network='', name=''), HostPlacementSpec(hostname='host2', network='', name='')])]
Example #19
@contextmanager
def with_host(m: CephadmOrchestrator, name, addr='1::4', refresh_hosts=True):
    # type: (CephadmOrchestrator, str) -> None
    with mock.patch("cephadm.utils.resolve_ip", return_value=addr):
        wait(m, m.add_host(HostSpec(hostname=name)))
        if refresh_hosts:
            CephadmServe(m)._refresh_hosts_and_daemons()
            receive_agent_metadata(m, name)
        yield
        wait(m, m.remove_host(name))
Example #20
    def test_daemon_check(self, cephadm_module: CephadmOrchestrator, action):
        with with_host(cephadm_module, 'test'):
            with with_service(cephadm_module, ServiceSpec(service_type='grafana'), CephadmOrchestrator.apply_grafana, 'test') as d_names:
                [daemon_name] = d_names

                cephadm_module._schedule_daemon_action(daemon_name, action)

                assert cephadm_module.cache.get_scheduled_daemon_action(
                    'test', daemon_name) == action

                CephadmServe(cephadm_module)._check_daemons()

                assert cephadm_module.cache.get_scheduled_daemon_action('test', daemon_name) is None
Example #21
def receive_agent_metadata(m: CephadmOrchestrator, host: str, ops: List[str] = None) -> None:
    to_update: Dict[str, Callable[[str, Any], None]] = {
        'ls': m._process_ls_output,
        'gather-facts': m.cache.update_host_facts,
        'list-networks': m.cache.update_host_networks,
    }
    if ops:
        for op in ops:
            out = CephadmServe(m)._run_cephadm_json(host, cephadmNoImage, op, [])
            to_update[op](host, out)
    m.cache.last_daemon_update[host] = datetime_now()
    m.cache.last_facts_update[host] = datetime_now()
    m.cache.last_network_update[host] = datetime_now()
    m.cache.metadata_up_to_date[host] = True
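Example #18 above also calls receive_agent_metadata_all_hosts, which is not shown on this page. A plausible sketch, assuming it simply applies receive_agent_metadata to every host known to the cache (the real fixture may differ):

def receive_agent_metadata_all_hosts(m: CephadmOrchestrator) -> None:
    # assumption: mark agent metadata fresh for every cached host
    for host in m.cache.get_hosts():
        receive_agent_metadata(m, host)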
Example #22
    def test_rgw_update(self, cephadm_module):
        with with_host(cephadm_module, 'host1'):
            with with_host(cephadm_module, 'host2'):
                ps = PlacementSpec(hosts=['host1'], count=1)
                c = cephadm_module.add_rgw(
                    RGWSpec(rgw_realm='realm', rgw_zone='zone1', placement=ps))
                [out] = wait(cephadm_module, c)
                match_glob(out, "Deployed rgw.realm.zone1.host1.* on host 'host1'")

                ps = PlacementSpec(hosts=['host1', 'host2'], count=2)
                r = CephadmServe(cephadm_module)._apply_service(
                    RGWSpec(rgw_realm='realm', rgw_zone='zone1', placement=ps))
                assert r

                assert_rm_daemon(cephadm_module, 'rgw.realm.zone1', 'host1')
                assert_rm_daemon(cephadm_module, 'rgw.realm.zone1', 'host2')
Example #23
    def test_offline(self, _check, _get_connection, cephadm_module):
        _check.return_value = '{}', '', 0
        _get_connection.return_value = mock.Mock(), mock.Mock()
        with with_host(cephadm_module, 'test'):
            _get_connection.side_effect = HostNotFound
            code, out, err = cephadm_module.check_host('test')
            assert out == ''
            assert "Host 'test' not found" in err

            out = wait(cephadm_module, cephadm_module.get_hosts())[0].to_json()
            assert out == HostSpec('test', 'test', status='Offline').to_json()

            _get_connection.side_effect = None
            assert CephadmServe(cephadm_module)._check_host('test') is None
            out = wait(cephadm_module, cephadm_module.get_hosts())[0].to_json()
            assert out == HostSpec('test', 'test').to_json()
Example #24
    def zap_osd(self, osd: "OSD") -> str:
        "Zaps all devices that are associated with an OSD"
        if osd.hostname is not None:
            out, err, code = CephadmServe(self.mgr)._run_cephadm(
                osd.hostname,
                'osd',
                'ceph-volume',
                ['--', 'lvm', 'zap', '--destroy', '--osd-id',
                 str(osd.osd_id)],
                error_ok=True)
            self.mgr.cache.invalidate_host_devices(osd.hostname)
            if code:
                raise OrchestratorError('Zap failed: %s' %
                                        '\n'.join(out + err))
            return '\n'.join(out + err)
        raise OrchestratorError(
            f"Failed to zap OSD {osd.osd_id} because host was unknown")
Example #25
    def test_upgrade_run(self, use_repo_digest, cephadm_module: CephadmOrchestrator):
        with with_host(cephadm_module, 'test', refresh_hosts=False):
            cephadm_module.set_container_image('global', 'image')
            if use_repo_digest:
                cephadm_module.use_repo_digest = True

                CephadmServe(cephadm_module).convert_tags_to_repo_digest()

            _, image, _ = cephadm_module.check_mon_command({
                'prefix': 'config get',
                'who': 'global',
                'key': 'container_image',
            })
            if use_repo_digest:
                assert image == 'image@repo_digest'
            else:
                assert image == 'image'
Example #26
@contextmanager
def with_service(cephadm_module: CephadmOrchestrator, spec: ServiceSpec, meth, host: str) -> Iterator[List[str]]:
    if spec.placement.is_empty():
        spec.placement = PlacementSpec(hosts=[host], count=1)
    c = meth(cephadm_module, spec)
    assert wait(cephadm_module, c) == f'Scheduled {spec.service_name()} update...'
    specs = [d.spec for d in wait(cephadm_module, cephadm_module.describe_service())]
    assert spec in specs

    CephadmServe(cephadm_module)._apply_all_services()

    dds = wait(cephadm_module, cephadm_module.list_daemons())
    own_dds = [dd for dd in dds if dd.service_name() == spec.service_name()]
    assert own_dds

    yield [dd.name() for dd in own_dds]

    assert_rm_service(cephadm_module, spec.service_name())
Example #27
    def test_apply_osd_save(self, _run_cephadm, cephadm_module: CephadmOrchestrator):
        _run_cephadm.return_value = ('{}', '', 0)
        with with_host(cephadm_module, 'test'):

            spec = DriveGroupSpec(
                service_id='foo',
                placement=PlacementSpec(
                    host_pattern='*',
                ),
                data_devices=DeviceSelection(
                    all=True
                )
            )

            c = cephadm_module.apply([spec])
            assert wait(cephadm_module, c) == ['Scheduled osd.foo update...']

            inventory = Devices([
                Device(
                    '/dev/sdb',
                    available=True
                ),
            ])

            cephadm_module.cache.update_host_devices_networks('test', inventory.devices, {})

            _run_cephadm.return_value = (['{}'], '', 0)

            assert CephadmServe(cephadm_module)._apply_all_services() == False

            _run_cephadm.assert_any_call(
                'test', 'osd', 'ceph-volume',
                ['--config-json', '-', '--', 'lvm', 'batch',
                    '--no-auto', '/dev/sdb', '--yes', '--no-systemd'],
                env_vars=['CEPH_VOLUME_OSDSPEC_AFFINITY=foo'], error_ok=True, stdin='{"config": "", "keyring": ""}')
            _run_cephadm.assert_called_with(
                'test', 'osd', 'ceph-volume', ['--', 'lvm', 'list', '--format', 'json'])
Example #28
    def _do_upgrade(self):
        # type: () -> None
        if not self.upgrade_state:
            logger.debug('_do_upgrade no state, exiting')
            return

        target_image = self.target_image
        target_id = self.upgrade_state.target_id
        target_digests = self.upgrade_state.target_digests
        target_version = self.upgrade_state.target_version

        first = False
        if not target_id or not target_version or not target_digests:
            # need to learn the container hash
            logger.info('Upgrade: First pull of %s' % target_image)
            self.upgrade_info_str = 'Doing first pull of %s image' % (
                target_image)
            try:
                target_id, target_version, target_digests = CephadmServe(
                    self.mgr)._get_container_image_info(target_image)
            except OrchestratorError as e:
                self._fail_upgrade(
                    'UPGRADE_FAILED_PULL', {
                        'severity': 'warning',
                        'summary': 'Upgrade: failed to pull target image',
                        'count': 1,
                        'detail': [str(e)],
                    })
                return
            if not target_version:
                self._fail_upgrade(
                    'UPGRADE_FAILED_PULL', {
                        'severity': 'warning',
                        'summary': 'Upgrade: failed to pull target image',
                        'count': 1,
                        'detail': ['unable to extract ceph version from container'],
                    })
                return
            self.upgrade_state.target_id = target_id
            # extract the version portion of 'ceph version {version} ({sha1})'
            self.upgrade_state.target_version = target_version.split(' ')[2]
            self.upgrade_state.target_digests = target_digests
            self._save_upgrade_state()
            target_image = self.target_image
            first = True

        if target_digests is None:
            target_digests = []
        if target_version.startswith('ceph version '):
            # tolerate/fix upgrade state from older version
            self.upgrade_state.target_version = target_version.split(' ')[2]
            target_version = self.upgrade_state.target_version
        target_major, target_minor, target_patch = target_version.split('.')
        target_major_name = self.mgr.lookup_release_name(int(target_major))

        if first:
            logger.info('Upgrade: Target is version %s (%s)' %
                        (target_version, target_major_name))
            logger.info('Upgrade: Target container is %s, digests %s' %
                        (target_image, target_digests))

        version_error = self._check_target_version(target_version)
        if version_error:
            self._fail_upgrade(
                'UPGRADE_BAD_TARGET_VERSION', {
                    'severity': 'error',
                    'summary':
                    f'Upgrade: cannot upgrade/downgrade to {target_version}',
                    'count': 1,
                    'detail': [version_error],
                })
            return

        image_settings = self.get_distinct_container_image_settings()

        daemons = [
            d for d in self.mgr.cache.get_daemons()
            if d.daemon_type in CEPH_UPGRADE_ORDER
        ]
        done = 0
        for daemon_type in CEPH_UPGRADE_ORDER:
            logger.debug('Upgrade: Checking %s daemons' % daemon_type)

            need_upgrade_self = False
            need_upgrade: List[Tuple[DaemonDescription, bool]] = []
            need_upgrade_deployer: List[Tuple[DaemonDescription, bool]] = []
            for d in daemons:
                if d.daemon_type != daemon_type:
                    continue
                assert d.daemon_type is not None
                assert d.daemon_id is not None
                correct_digest = False
                if (any(d in target_digests
                        for d in (d.container_image_digests or []))
                        or d.daemon_type in MONITORING_STACK_TYPES):
                    logger.debug('daemon %s.%s container digest correct' %
                                 (daemon_type, d.daemon_id))
                    correct_digest = True
                    if any(d in target_digests for d in (d.deployed_by or [])):
                        logger.debug(
                            'daemon %s.%s deployed by correct version' %
                            (d.daemon_type, d.daemon_id))
                        done += 1
                        continue

                if self.mgr.daemon_is_self(d.daemon_type, d.daemon_id):
                    logger.info('Upgrade: Need to upgrade myself (mgr.%s)' %
                                self.mgr.get_mgr_id())
                    need_upgrade_self = True
                    continue

                if correct_digest:
                    logger.debug(
                        'daemon %s.%s not deployed by correct version' %
                        (d.daemon_type, d.daemon_id))
                    need_upgrade_deployer.append((d, True))
                else:
                    logger.debug(
                        'daemon %s.%s not correct (%s, %s, %s)' %
                        (daemon_type, d.daemon_id, d.container_image_name,
                         d.container_image_digests, d.version))
                    need_upgrade.append((d, False))

            if not need_upgrade_self:
                # only after the mgr itself is upgraded can we expect daemons to have
                # deployed_by == target_digests
                need_upgrade += need_upgrade_deployer

            # prepare filesystems for daemon upgrades?
            if (daemon_type == 'mds' and need_upgrade
                    and not self._prepare_for_mds_upgrade(
                        target_major, [d_entry[0]
                                       for d_entry in need_upgrade])):
                return

            if need_upgrade:
                self.upgrade_info_str = 'Currently upgrading %s daemons' % (
                    daemon_type)

            to_upgrade: List[Tuple[DaemonDescription, bool]] = []
            known_ok_to_stop: List[str] = []
            for d_entry in need_upgrade:
                d = d_entry[0]
                assert d.daemon_type is not None
                assert d.daemon_id is not None
                assert d.hostname is not None

                if not d.container_image_id:
                    if d.container_image_name == target_image:
                        logger.debug(
                            'daemon %s has unknown container_image_id but has correct image name'
                            % (d.name()))
                        continue

                if known_ok_to_stop:
                    if d.name() in known_ok_to_stop:
                        logger.info(
                            f'Upgrade: {d.name()} is also safe to restart')
                        to_upgrade.append(d_entry)
                    continue

                if d.daemon_type in ['mon', 'osd', 'mds']:
                    # NOTE: known_ok_to_stop is an output argument for
                    # _wait_for_ok_to_stop
                    if not self._wait_for_ok_to_stop(d, known_ok_to_stop):
                        return

                to_upgrade.append(d_entry)

                # if we don't have a list of others to consider, stop now
                if not known_ok_to_stop:
                    break

            num = 1
            for d_entry in to_upgrade:
                d = d_entry[0]
                assert d.daemon_type is not None
                assert d.daemon_id is not None
                assert d.hostname is not None

                self._update_upgrade_progress(done / len(daemons))

                # make sure host has latest container image
                out, errs, code = CephadmServe(self.mgr)._run_cephadm(
                    d.hostname,
                    '',
                    'inspect-image', [],
                    image=target_image,
                    no_fsid=True,
                    error_ok=True)
                if code or not any(d in target_digests
                                   for d in json.loads(''.join(out)).get(
                                       'repo_digests', [])):
                    logger.info('Upgrade: Pulling %s on %s' %
                                (target_image, d.hostname))
                    self.upgrade_info_str = 'Pulling %s image on host %s' % (
                        target_image, d.hostname)
                    out, errs, code = CephadmServe(self.mgr)._run_cephadm(
                        d.hostname,
                        '',
                        'pull', [],
                        image=target_image,
                        no_fsid=True,
                        error_ok=True)
                    if code:
                        self._fail_upgrade(
                            'UPGRADE_FAILED_PULL', {
                                'severity': 'warning',
                                'summary': 'Upgrade: failed to pull target image',
                                'count': 1,
                                'detail': [
                                    'failed to pull %s on host %s' %
                                    (target_image, d.hostname)
                                ],
                            })
                        return
                    r = json.loads(''.join(out))
                    if not any(d in target_digests
                               for d in r.get('repo_digests', [])):
                        logger.info(
                            'Upgrade: image %s pull on %s got new digests %s (not %s), restarting'
                            % (target_image, d.hostname, r['repo_digests'],
                               target_digests))
                        self.upgrade_info_str = 'Image %s pull on %s got new digests %s (not %s), restarting' % (
                            target_image, d.hostname, r['repo_digests'],
                            target_digests)
                        self.upgrade_state.target_digests = r['repo_digests']
                        self._save_upgrade_state()
                        return

                    self.upgrade_info_str = 'Currently upgrading %s daemons' % (
                        daemon_type)

                if len(to_upgrade) > 1:
                    logger.info(
                        'Upgrade: Updating %s.%s (%d/%d)' %
                        (d.daemon_type, d.daemon_id, num, len(to_upgrade)))
                else:
                    logger.info('Upgrade: Updating %s.%s' %
                                (d.daemon_type, d.daemon_id))
                action = 'Upgrading' if not d_entry[1] else 'Redeploying'
                try:
                    daemon_spec = CephadmDaemonDeploySpec.from_daemon_description(
                        d)
                    self.mgr._daemon_action(
                        daemon_spec,
                        'redeploy',
                        image=target_image if not d_entry[1] else None)
                except Exception as e:
                    self._fail_upgrade(
                        'UPGRADE_REDEPLOY_DAEMON', {
                            'severity': 'warning',
                            'summary':
                            f'{action} daemon {d.name()} on host {d.hostname} failed.',
                            'count': 1,
                            'detail': [f'Upgrade daemon: {d.name()}: {e}'],
                        })
                    return
                num += 1
            if to_upgrade:
                return

            # complete mon upgrade?
            if daemon_type == 'mon':
                if not self.mgr.get("have_local_config_map"):
                    logger.info(
                        'Upgrade: Restarting mgr now that mons are running pacific'
                    )
                    need_upgrade_self = True

            if need_upgrade_self:
                try:
                    self.mgr.mgr_service.fail_over()
                except OrchestratorError as e:
                    self._fail_upgrade(
                        'UPGRADE_NO_STANDBY_MGR', {
                            'severity': 'warning',
                            'summary': f'Upgrade: {e}',
                            'count': 1,
                            'detail': [
                                'The upgrade process needs to upgrade the mgr, '
                                'but it needs at least one standby to proceed.',
                            ],
                        })
                    return

                return  # unreachable code, as fail_over never returns
            elif daemon_type == 'mgr':
                if 'UPGRADE_NO_STANDBY_MGR' in self.mgr.health_checks:
                    del self.mgr.health_checks['UPGRADE_NO_STANDBY_MGR']
                    self.mgr.set_health_checks(self.mgr.health_checks)

            # make sure 'ceph versions' agrees
            ret, out_ver, err = self.mgr.check_mon_command({
                'prefix': 'versions',
            })
            j = json.loads(out_ver)
            for version, count in j.get(daemon_type, {}).items():
                short_version = version.split(' ')[2]
                if short_version != target_version:
                    logger.warning(
                        'Upgrade: %d %s daemon(s) are %s != target %s' %
                        (count, daemon_type, short_version, target_version))

            # push down configs
            daemon_type_section = name_to_config_section(daemon_type)
            if image_settings.get(daemon_type_section) != target_image:
                logger.info('Upgrade: Setting container_image for all %s' %
                            daemon_type)
                self.mgr.set_container_image(daemon_type_section, target_image)
            to_clean = []
            for section in image_settings.keys():
                if section.startswith(
                        name_to_config_section(daemon_type) + '.'):
                    to_clean.append(section)
            if to_clean:
                logger.debug('Upgrade: Cleaning up container_image for %s' %
                             to_clean)
                for section in to_clean:
                    ret, image, err = self.mgr.check_mon_command({
                        'prefix': 'config rm',
                        'name': 'container_image',
                        'who': section,
                    })

            logger.debug('Upgrade: All %s daemons are up to date.' %
                         daemon_type)

            # complete osd upgrade?
            if daemon_type == 'osd':
                osdmap = self.mgr.get("osd_map")
                osd_min_name = osdmap.get("require_osd_release", "argonaut")
                osd_min = ceph_release_to_major(osd_min_name)
                if osd_min < int(target_major):
                    logger.info(
                        f'Upgrade: Setting require_osd_release to {target_major} {target_major_name}'
                    )
                    ret, _, err = self.mgr.check_mon_command({
                        'prefix': 'osd require-osd-release',
                        'release': target_major_name,
                    })

            # complete mds upgrade?
            if daemon_type == 'mds' and self.upgrade_state.fs_original_max_mds:
                for i in self.mgr.get("fs_map")['filesystems']:
                    fs_id = i["id"]
                    fs_name = i['mdsmap']['fs_name']
                    new_max = self.upgrade_state.fs_original_max_mds.get(fs_id)
                    if new_max:
                        self.mgr.log.info(
                            'Upgrade: Scaling up filesystem %s max_mds to %d' %
                            (fs_name, new_max))
                        ret, _, err = self.mgr.check_mon_command({
                            'prefix': 'fs set',
                            'fs_name': fs_name,
                            'var': 'max_mds',
                            'val': str(new_max),
                        })

                self.upgrade_state.fs_original_max_mds = {}
                self._save_upgrade_state()

        # clean up
        logger.info('Upgrade: Finalizing container_image settings')
        self.mgr.set_container_image('global', target_image)

        for daemon_type in CEPH_UPGRADE_ORDER:
            ret, image, err = self.mgr.check_mon_command({
                'prefix': 'config rm',
                'name': 'container_image',
                'who': name_to_config_section(daemon_type),
            })

        logger.info('Upgrade: Complete!')
        if self.upgrade_state.progress_id:
            self.mgr.remote('progress', 'complete',
                            self.upgrade_state.progress_id)
        self.upgrade_state = None
        self._save_upgrade_state()
        return
Example #29
    def process_removal_queue(self) -> None:
        """
        Performs actions in the _serve() loop to remove an OSD
        when criteria are met.

        We can't hold self.lock, as we're calling _remove_daemon in the loop.
        """

        # make sure that we don't run on OSDs that are not in the cluster anymore.
        self.cleanup()

        # find osds that are ok-to-stop and not yet draining
        ready_to_drain_osds = self._ready_to_drain_osds()
        if ready_to_drain_osds:
            # start draining those
            _ = [osd.start_draining() for osd in ready_to_drain_osds]

        all_osds = self.all_osds()

        logger.debug(f"{self.queue_size()} OSDs are scheduled "
                     f"for removal: {all_osds}")

        # Check all osds for their state and take action (remove, purge etc)
        new_queue: Set[OSD] = set()
        for osd in all_osds:  # type: OSD
            if not osd.force:
                # skip criteria
                if not osd.is_empty:
                    logger.debug(f"{osd} is not empty yet. Waiting a bit more")
                    new_queue.add(osd)
                    continue

            if not osd.safe_to_destroy():
                logger.debug(
                    f"{osd} is not safe-to-destroy yet. Waiting a bit more")
                new_queue.add(osd)
                continue

            # abort criteria
            if not osd.down():
                # also remove it from the remove_osd list and set a health_check warning?
                raise orchestrator.OrchestratorError(
                    f"Could not mark {osd} down")

            # stop and remove daemon
            assert osd.hostname is not None

            if self.mgr.cache.has_daemon(f'osd.{osd.osd_id}'):
                CephadmServe(self.mgr)._remove_daemon(f'osd.{osd.osd_id}',
                                                      osd.hostname)
                logger.info(f"Successfully removed {osd} on {osd.hostname}")
            else:
                logger.info(
                    f"Daemon {osd} on {osd.hostname} was already removed")

            if osd.replace:
                # mark destroyed in osdmap
                if not osd.destroy():
                    raise orchestrator.OrchestratorError(
                        f"Could not destroy {osd}")
                logger.info(
                    f"Successfully destroyed old {osd} on {osd.hostname}; ready for replacement"
                )
            else:
                # purge from osdmap
                if not osd.purge():
                    raise orchestrator.OrchestratorError(
                        f"Could not purge {osd}")
                logger.info(f"Successfully purged {osd} on {osd.hostname}")

            if osd.zap:
                # throws an exception if the zap fails
                logger.info(f"Zapping devices for {osd} on {osd.hostname}")
                osd.do_zap()
                logger.info(
                    f"Successfully zapped devices for {osd} on {osd.hostname}")

            logger.debug(f"Removing {osd} from the queue.")

        # self could change while this is processing (osds get added from the CLI)
        # The new set is: 'an intersection of all osds that are still not empty/removed (new_queue) and
        # osds that were added while this method was executed'
        with self.lock:
            self.osds.intersection_update(new_queue)
            self._save_to_store()
Example #30
    def create_single_host(self,
                           drive_group: DriveGroupSpec,
                           host: str,
                           cmd: str,
                           replace_osd_ids: List[str],
                           env_vars: Optional[List[str]] = None) -> str:
        out, err, code = self._run_ceph_volume_command(host,
                                                       cmd,
                                                       env_vars=env_vars)

        if code == 1 and ', it is already prepared' in '\n'.join(err):
            # HACK: when we create against an existing LV, ceph-volume
            # returns an error and the above message.  To make this
            # command idempotent, tolerate this "error" and continue.
            logger.debug('the device was already prepared; continuing')
            code = 0
        if code:
            raise RuntimeError(
                'cephadm exited with an error code: %d, stderr:%s' %
                (code, '\n'.join(err)))

        # check result
        out, err, code = CephadmServe(self.mgr)._run_cephadm(
            host, 'osd', 'ceph-volume', [
                '--',
                'lvm',
                'list',
                '--format',
                'json',
            ])
        before_osd_uuid_map = self.mgr.get_osd_uuid_map(only_up=True)
        try:
            osds_elems = json.loads('\n'.join(out))
        except ValueError:
            logger.exception('Cannot decode JSON: \'%s\'' % '\n'.join(out))
            osds_elems = {}
        fsid = self.mgr._cluster_fsid
        osd_uuid_map = self.mgr.get_osd_uuid_map()
        created = []
        for osd_id, osds in osds_elems.items():
            for osd in osds:
                if osd['tags']['ceph.cluster_fsid'] != fsid:
                    logger.debug('mismatched fsid, skipping %s' % osd)
                    continue
                if osd_id in before_osd_uuid_map and osd_id not in replace_osd_ids:
                    # if it exists but is part of the replacement operation, don't skip
                    continue
                if osd_id not in osd_uuid_map:
                    logger.debug(
                        'osd id {} does not exist in cluster'.format(osd_id))
                    continue
                if osd_uuid_map.get(osd_id) != osd['tags']['ceph.osd_fsid']:
                    logger.debug('mismatched osd uuid (cluster has %s, osd '
                                 'has %s)' % (osd_uuid_map.get(osd_id),
                                              osd['tags']['ceph.osd_fsid']))
                    continue

                created.append(osd_id)
                daemon_spec: CephadmDaemonSpec = CephadmDaemonSpec(
                    daemon_id=osd_id,
                    host=host,
                    daemon_type='osd',
                )
                daemon_spec.final_config, daemon_spec.deps = self.generate_config(
                    daemon_spec)
                CephadmServe(self.mgr)._create_daemon(
                    daemon_spec, osd_uuid_map=osd_uuid_map)

        if created:
            self.mgr.cache.invalidate_host_devices(host)
            return "Created osd(s) %s on host '%s'" % (','.join(created), host)
        else:
            return "Created no osd(s) on host %s; already created?" % host