def test_scenarios_cluster_config(self):
    """Running all scenarios should flag the dangerous rabbitmq
    cluster partition-handling "ignore" setting as an issue."""
    YScenarioChecker()()
    expected = ('Cluster partition handling is currently set to "ignore". '
                'This is potentially dangerous and a setting of '
                '"pause_minority" is recommended.')
    raised = next(iter(IssuesStore().load().values()))
    self.assertEqual([entry['desc'] for entry in raised], [expected])
def test_lp1936136(self, mock_cli, mock_cephbase, mock_kernelbase,
                   mock_cset_config, mock_ceph_config):
    """Expect bug LP 1936136 to be reported for bcache-backed Ceph OSDs.

    Simulates ceph-osd 14.2.22, OSDs on bcache, cache_available_percent=69
    (< 70), bluefs_buffered_io=true and kernel 5.3 (< 5.4), then checks
    that the expected bug message is raised.

    NOTE: the first parameter was previously misspelled 'mocl_cli';
    decorator-injected mocks are positional so the rename is safe.
    """
    def fake_ceph_config(key):
        # Only bluefs_buffered_io matters to this scenario; any other
        # key lookup falls through and returns None.
        if key == 'bluefs_buffered_io':
            return 'true'

    mock_cli.return_value = mock.MagicMock()
    mock_cli.return_value.dpkg_l.return_value = \
        ["ii ceph-osd 14.2.22-0ubuntu0.20.04.2 amd64"]
    mock_cset_config.return_value = mock.MagicMock()
    # bcache cacheset cache_available_percent just below the threshold
    mock_cset_config.return_value.get.return_value = 69
    mock_ceph_config.return_value = mock.MagicMock()
    mock_ceph_config.return_value.get.side_effect = fake_ceph_config
    mock_cephbase.return_value = mock.MagicMock()
    mock_cephbase.return_value.local_osds_use_bcache = True
    mock_kernelbase.return_value = mock.MagicMock()
    mock_kernelbase.return_value.version = '5.3'
    YScenarioChecker()()
    msg = ('This host has Ceph OSDs using bcache block devices and may be '
           'vulnerable to bcache bug LP 1936136 since '
           'bcache cache_available_percent is lt 70 (actual=69). The '
           'current workaround is to set bluefs_buffered_io=false in Ceph '
           'or upgrade to a kernel >= 5.4.')
    issues = list(IssuesManager().load_bugs().values())[0]
    self.assertEqual([issue['desc'] for issue in issues], [msg])
def test_flow_lookup_checks_p2(self, mock_cli):
    """OVS lost-packet scenario, part 2.

    Both a non-zero "lost" count from dpctl/show and dpif lost-packet
    entries in ovs-vswitchd.log are present, so the extended message
    is expected.
    """
    cli = mock.MagicMock()
    mock_cli.return_value = cli
    cli.ovs_appctl_dpctl_show.return_value = \
        ['lookups: hit:39017272903 missed:137481120 lost:54691089']
    with tempfile.TemporaryDirectory() as dtmp:
        setup_config(DATA_ROOT=dtmp)
        logfile = os.path.join(dtmp,
                               'var/log/openvswitch/ovs-vswitchd.log')
        os.makedirs(os.path.dirname(logfile))
        with open(logfile, 'w') as fd:
            fd.write(DPIF_LOST_PACKETS_LOGS)

        YScenarioChecker()()
        expected = ('OVS datapath is reporting a non-zero amount of '
                    '"lost" packets (total=54691089) which implies that '
                    'packets destined for userspace (e.g. vm tap) are '
                    'being dropped. ovs-vswitchd is also reporting large '
                    'numbers of dropped packets within a 24h period '
                    '(look for '
                    '"system@ovs-system: lost packet on port channel"). '
                    'This could be caused by '
                    'overloaded system cores blocking ovs threads from '
                    'delivering packets in time. Please check ovs-appctl '
                    'dpctl/show to see if the number of lost packets is '
                    'still increasing.')
        raised = next(iter(IssuesStore().load().values()))
        self.assertEqual([entry['desc'] for entry in raised], [expected])
def test_bug_check_lp1959649(self, mock_cephdaemon, mock_helper):
    """Bug LP 1959649 should be raised when ceph-osd 15.2.7 uses the
    rocksdb_original bluestore_volume_selection_policy."""
    helper = mock.MagicMock()
    mock_helper.return_value = helper
    helper.dpkg_l.return_value = \
        ["ii ceph-osd 15.2.7-0ubuntu0.20.04.2 amd64"]
    daemon = mock.MagicMock()
    mock_cephdaemon.return_value = daemon
    daemon.bluestore_volume_selection_policy = ['rocksdb_original']
    YScenarioChecker()()
    expected_desc = ('This host is vulnerable to known bug '
                     'https://tracker.ceph.com/issues/38745. RocksDB '
                     'needs more space than the leveled space available '
                     'so it is using storage from the data disk. Please '
                     'set bluestore_volume_selection_policy of all OSDs '
                     'to use_some_extra')
    expected = {
        'bugs-detected': [{
            'context': {'passes': True},
            'desc': expected_desc,
            'id': 'https://bugs.launchpad.net/bugs/1959649',
            'origin': 'storage.01part'
        }]
    }
    self.assertEqual(IssuesManager().load_bugs(), expected)
def test_osd_messenger_v2_protocol(self):
    """An OSD not bound to a v2 messenger address should raise an issue."""
    YScenarioChecker()()
    expected = ("This Ceph cluster has 1 OSD(s) that do not bind to a v2 "
                "messenger address. This will cause unexpected behaviour "
                "and should be resolved asap.")
    raised = next(iter(IssuesManager().load_issues().values()))
    self.assertEqual([entry['desc'] for entry in raised], [expected])
def test_1943937(self):
    """Bug LP 1943937 (stuck rabbitmq queues) should be detected from a
    queue.declare not_found/timeout channel exception in the broker log."""
    with tempfile.TemporaryDirectory() as dtmp:
        setup_config(DATA_ROOT=dtmp)
        logfile = os.path.join(dtmp,
                               'var/log/rabbitmq/'
                               '[email protected]')
        os.makedirs(os.path.dirname(logfile))
        with open(logfile, 'w') as fd:
            fd.write("operation queue.declare caused a channel exception "
                     "not_found: failed to perform operation on queue "
                     "'test_exchange_queue' in vhost "
                     "'nagios-rabbitmq-server-0' due to timeout")

        YScenarioChecker()()
        expected_desc = ('Known RabbitMQ issue where queues get stuck and '
                         'clients trying to use them will just keep '
                         'timing out. This stops many services in the '
                         'cloud from working correctly. Resolution '
                         'requires you to stop all RabbitMQ servers '
                         'before starting them all again at the same '
                         'time. A rolling restart or restarting them '
                         'simultaneously will not work. See bug for more '
                         'detail.')
        expected = {
            'bugs-detected': [{
                'id': 'https://bugs.launchpad.net/bugs/1943937',
                'desc': expected_desc,
                'origin': 'rabbitmq.01part'
            }]
        }
        self.assertEqual(IssuesManager().load_bugs(), expected)
def test_unattended_upgrades(self):
    """Enabled unattended upgrades should be reported as an issue."""
    YScenarioChecker()()
    expected = ('Unattended upgrades are enabled which can lead to '
                'uncontrolled changes to this environment. If maintenance '
                'windows are required please consider disabling unattended '
                'upgrades.')
    raised = next(iter(IssuesStore().load().values()))
    self.assertEqual([entry['desc'] for entry in raised], [expected])
def test_filestore_to_bluestore_upgrade(self, mock_ceph_config):
    """A journal device left over in ceph.conf while Bluestore is enabled
    should raise an issue.

    NOTE(review): "a still a" in the expected message mirrors the text
    produced by the scenario definition - do not fix it here alone.
    """
    config = mock.MagicMock()
    mock_ceph_config.return_value = config
    # Any key lookup yields a journal path, simulating a lingering
    # filestore journal entry.
    config.get = lambda args: '/journal/path'
    YScenarioChecker()()
    expected = ("Ceph Bluestore is enabled yet there is a still a journal "
                "device configured in ceph.conf - please check")
    raised = next(iter(IssuesManager().load_issues().values()))
    self.assertEqual([entry['desc'] for entry in raised], [expected])
def test_ssd_osds_no_discard(self):
    """Scenario is disabled upstream; assertions kept for when it is
    re-enabled."""
    # skipTest raises immediately, so everything below is currently inert.
    self.skipTest("scenario currently disabled until fixed")

    YScenarioChecker()()
    expected = [("This host has osds with device_class 'ssd' but "
                 "Bluestore discard is not enabled. The recommendation "
                 "is to set 'bdev enable discard true'.")]
    raised = next(iter(IssuesManager().load_issues().values()))
    self.assertEqual([entry['desc'] for entry in raised], expected)
def test_juju_ceph_no_bcache_tuning(self):
    """Juju-managed Ceph OSDs on bcache without the bcache-tuning charm
    should raise an issue."""
    YScenarioChecker()()
    expected = ("This host is running Juju-managed Ceph OSDs that are "
                "using bcache devices yet the bcache-tuning charm was not "
                "detected. It is recommended to use the bcache-tuning "
                "charm to ensure optimal bcache configuration.")
    raised = next(iter(IssuesManager().load_issues().values()))
    self.assertEqual([entry['desc'] for entry in raised], [expected])
def test_bdev(self):
    """A bcache bdev with writeback_percent below 10 should be flagged."""
    with tempfile.TemporaryDirectory() as dtmp:
        self.setup_bcachefs(dtmp, bdev_error=True)
        setup_config(DATA_ROOT=dtmp)
        YScenarioChecker()()
        expected = ('bcache config writeback_percent expected to be ge '
                    '10 but actual=1.')
        raised = next(iter(IssuesManager().load_issues().values()))
        self.assertEqual([entry['desc'] for entry in raised], [expected])
def test_bluefs_size(self):
    """Oversized OSD metadata should be reported as a potential instance
    of ceph tracker issue 45903."""
    YScenarioChecker()()
    expected = ('Found 3 Ceph OSDs with metadata size larger than 10G. '
                'This could be the result of a compaction failure/bug and '
                'this host may be affected by '
                'https://tracker.ceph.com/issues/45903. A workaround '
                '(>= Nautilus) is to manually compact using '
                "'ceph-bluestore-tool'.")
    raised = next(iter(IssuesManager().load_bugs().values()))
    self.assertEqual([entry['desc'] for entry in raised], [expected])
def test_unresponsive_mgr_p1(self):
    """Incomplete sosreport ceph plugin data should raise the
    unresponsive mon/mgr issue (sosreport variant)."""
    YScenarioChecker()()
    expected = ("One or more sosreport ceph plugins contain incomplete "
                "data. This usually indicates a problem with ceph "
                "mon/mgr. Please check ceph-mon.log and retry commands to "
                "see if they are still unresponsive. Restarting ceph-mon "
                "and ceph-mgr might resolve this.")
    raised = next(iter(IssuesManager().load_issues().values()))
    self.assertEqual([entry['desc'] for entry in raised], [expected])
def test_required_osd_release(self, mock_helper):
    """Mismatched OSD major versions against require_osd_release should
    raise an issue."""
    helper = mock.MagicMock()
    mock_helper.return_value = helper
    helper.ceph_versions.return_value = \
        CEPH_VERSIONS_MISMATCHED_MAJOR.split('\n')
    YScenarioChecker()()
    expected = ("Ceph cluster config 'require_osd_release' is set to "
                "'octopus' but not all OSDs are on that version - please "
                "check.")
    raised = next(iter(IssuesManager().load_issues().values()))
    self.assertEqual([entry['desc'] for entry in raised], [expected])
def test_laggy_pgs(self, mock_helper):
    """Laggy/wait PGs in the pg dump should raise an issue."""
    helper = mock.MagicMock()
    mock_helper.return_value = helper
    helper.ceph_pg_dump_json_decoded.return_value = PG_DUMP_JSON_DECODED
    YScenarioChecker()()
    expected = ('Ceph cluster is reporting 1 laggy/wait PGs. This '
                'suggests a potential network or storage issue - please '
                'check.')
    raised = next(iter(IssuesManager().load_issues().values()))
    self.assertEqual([entry['desc'] for entry in raised], [expected])
def test_unresponsive_mgr_p2(self):
    """Incomplete ceph command output should raise the unresponsive
    mon/mgr issue (command variant)."""
    YScenarioChecker()()
    expected = ("Some ceph commands are returning incomplete data. This "
                "usually indicates a problem with ceph mon/mgr. Please "
                "check ceph-mon.log and retry commands to see if they are "
                "still unresponsive. Restarting ceph-mon and ceph-mgr "
                "might resolve this.")
    raised = next(iter(IssuesManager().load_issues().values()))
    self.assertEqual([entry['desc'] for entry in raised], [expected])
def test_ceph_versions_mismatch_p2(self, mock_helper):
    """Mons running a lower version than other daemons should raise an
    issue."""
    helper = mock.MagicMock()
    mock_helper.return_value = helper
    helper.ceph_versions.return_value = \
        CEPH_VERSIONS_MISMATCHED_MINOR_MONS_UNALIGNED.split('\n')
    YScenarioChecker()()
    expected = ('One or more Ceph mons has a version lower than other '
                'daemons e.g. ceph-osd running in the cluster. This can '
                'cause unexpected behaviour and should be resolved as '
                'soon as possible. Check full summary output for current '
                'versions.')
    raised = next(iter(IssuesManager().load_issues().values()))
    self.assertEqual([entry['desc'] for entry in raised], [expected])
def test_ceph_versions_mismatch_p1(self, mock_helper):
    """Minor version misalignment across daemons should raise an issue."""
    helper = mock.MagicMock()
    mock_helper.return_value = helper
    helper.ceph_versions.return_value = \
        CEPH_VERSIONS_MISMATCHED_MINOR.split('\n')
    YScenarioChecker()()
    expected = ('Ceph daemon versions are not aligned across the cluster. '
                'This could be the result of an incomplete or failed '
                'cluster upgrade. All daemons, except the clients, should '
                'ideally be on the same version for ceph to function '
                'correctly.')
    raised = next(iter(IssuesManager().load_issues().values()))
    self.assertEqual([entry['desc'] for entry in raised], [expected])
def test_plugin_timeouts(self):
    """Timed-out sosreport plugins should raise a SOSReportWarning.

    NOTE(review): the duplicated "have have" matches the message emitted
    by the scenario definition - fixing it requires changing both sides.
    """
    with tempfile.TemporaryDirectory() as dtmp:
        self.setup_timed_out_plugins(dtmp)
        YScenarioChecker()()
        expected = ('The following sosreport plugins have have timed out '
                    'and may have incomplete data: networking, system')
        raised = next(iter(IssuesStore().load().values()))
        self.assertEqual([entry['type'] for entry in raised],
                         [SOSReportWarning('').name])
        self.assertEqual([entry['desc'] for entry in raised], [expected])
def test_crushmap_bucket_checks_mixed_buckets(self, mock_helper):
    """Mixed bucket types in the crush map should raise an issue."""
    helper = mock.MagicMock()
    mock_helper.return_value = helper
    helper.ceph_osd_crush_dump_json_decoded.return_value = \
        json.loads(CEPH_OSD_CRUSH_DUMP)
    YScenarioChecker()()
    expected = ("Mixed crush bucket types identified in buckets "
                "'default'. This can cause data distribution to become "
                "skewed - please check crush map.")
    raised = next(iter(IssuesManager().load_issues().values()))
    self.assertEqual([entry['desc'] for entry in raised], [expected])
def test_ceph_pg_imbalance(self, mock_helper):
    """Imbalanced OSD pg counts should raise both the hard-limit and the
    optimal-range issues."""
    self.setup_fake_cli_osds_imbalanced_pgs(mock_helper)
    YScenarioChecker()()
    expected = [
        ('Found some Ceph osd(s) with > 500 pgs - this is close to the '
         'hard limit at which point they will stop creating pgs and '
         'fail - please investigate.'),
        ('Found some Ceph osd(s) whose pg count is > 30% outside the '
         'optimal range of 50-200 pgs. This could indicate poor data '
         'distribution across the cluster and result in '
         'performance degradation.'),
    ]
    raised = next(iter(IssuesManager().load_issues().values()))
    self.assertEqual([entry['desc'] for entry in raised], expected)
def test_scenario_bluefs_spillover(self, mock_helper):
    """BlueFS spillover in ceph health detail should be reported as a
    known bug."""
    helper = mock.MagicMock()
    mock_helper.return_value = helper
    helper.ceph_health_detail_json_decoded.return_value = \
        " experiencing BlueFS spillover"
    YScenarioChecker()()
    expected = ('Identified known Ceph bug. RocksDB needs more space '
                'than the leveled space available. See '
                'www.mail-archive.com/[email protected]/msg05782.html '
                'for more background information.')
    raised = next(iter(IssuesManager().load_bugs().values()))
    self.assertEqual([entry['desc'] for entry in raised], [expected])
def test_scenario_osd_maps_backlog_too_large(self, mock_helper):
    """A large backlog of pinned osdmaps should raise an issue."""
    # 5496 pinned maps is above whatever threshold the scenario uses.
    report = {'osdmap_manifest': {'pinned_maps': range(5496)}}
    helper = mock.MagicMock()
    mock_helper.return_value = helper
    helper.ceph_report_json_decoded.return_value = report
    YScenarioChecker()()
    expected = ("This Ceph cluster has 5496 pinned osdmaps. This can "
                "affect ceph-mon performance and may also indicate bugs "
                "such as https://tracker.ceph.com/issues/44184 and "
                "https://tracker.ceph.com/issues/47290.")
    raised = next(iter(IssuesManager().load_issues().values()))
    self.assertEqual([entry['desc'] for entry in raised], [expected])
def test_scenarios_cpufreq(self):
    """Ceph OSD hosts not using the performance cpufreq governor should
    raise an issue."""
    YScenarioChecker()()
    expected = ('This node has Ceph OSDs running on it but is not using '
                'cpufreq scaling_governor in "performance" mode '
                '(actual=powersave). This is not recommended and can '
                'result in performance degradation. To fix this you can '
                'install cpufrequtils, set "GOVERNOR=performance" in '
                '/etc/default/cpufrequtils and run systemctl restart '
                'cpufrequtils. You will also need to stop and disable '
                'the ondemand systemd service in order for changes to '
                'persist.')
    raised = next(iter(IssuesManager().load_issues().values()))
    self.assertEqual([entry['desc'] for entry in raised], [expected])
def test_flow_lookup_checks_p1(self, mock_cli):
    """OVS lost-packet scenario, part 1: only dpctl/show stats are
    available, so the short message is expected."""
    cli = mock.MagicMock()
    mock_cli.return_value = cli
    cli.ovs_appctl_dpctl_show.return_value = \
        ['lookups: hit:39017272903 missed:137481120 lost:54691089']
    YScenarioChecker()()
    expected = ('OVS datapath is reporting a non-zero amount of "lost" '
                'packets (total=54691089) which implies that packets '
                'destined for userspace (e.g. vm tap) are being dropped. '
                'Please check ovs-appctl dpctl/show to see if the number '
                'of lost packets is still increasing.')
    raised = next(iter(IssuesStore().load().values()))
    self.assertEqual([entry['desc'] for entry in raised], [expected])
def test_unit_checks(self, mock_cli):
    """Juju unit leadership errors should only be raised when they fall
    within the scenario's 7-day window relative to the mocked date."""
    cli = mock.MagicMock()
    mock_cli.return_value = cli
    with tempfile.TemporaryDirectory() as dtmp:
        setup_config(DATA_ROOT=dtmp)
        logfile = os.path.join(dtmp, 'var/log/juju/unit-keystone-2.log')
        os.makedirs(os.path.dirname(logfile))
        with open(logfile, 'w') as fd:
            fd.write(UNIT_LEADERSHIP_ERROR)

        # A "current" date outside the age limit: no issue expected.
        cli.date.return_value = "2021-09-25 00:00:00"
        YScenarioChecker()()
        self.assertEqual(IssuesStore().load(), {})

        # Within the window: the issue should fire.
        cli.date.return_value = "2021-09-17 00:00:00"
        YScenarioChecker()()
        expected = ("Juju unit(s) 'keystone' are showing leadership "
                    "errors in their logs from the last 7 days. Please "
                    "investigate.")
        raised = next(iter(IssuesStore().load().values()))
        self.assertEqual([entry['desc'] for entry in raised], [expected])
def test_oom_killer_invoked(self):
    """An oom-killer entry in kern.log should raise an issue."""
    with tempfile.TemporaryDirectory() as dtmp:
        setup_config(DATA_ROOT=dtmp)
        os.makedirs(os.path.join(dtmp, 'var/log'))
        klog = os.path.join(dtmp, 'var/log/kern.log')
        with open(klog, 'w') as fd:
            fd.write(KERNLOG_OOM)

        YScenarioChecker()()
        expected = ('1 reports of oom-killer invoked in kern.log - '
                    'please check.')
        raised = next(iter(IssuesStore().load().values()))
        self.assertEqual([entry['desc'] for entry in raised], [expected])
def test_large_omap_objects(self, mock_cli):
    """Large omap objects reported by the pg dump should raise an issue."""
    cli = mock.MagicMock()
    mock_cli.return_value = cli
    cli.ceph_pg_dump_json_decoded.return_value = PG_DUMP_JSON_DECODED
    YScenarioChecker()()
    expected = ("Large omap objects found in pgs '2.f'. "
                "This is usually resolved by deep-scrubbing the pgs. "
                "Check config options "
                "'osd_deep_scrub_large_omap_object_key_threshold' and "
                "'osd_deep_scrub_large_omap_object_value_sum_threshold' "
                "to find whether the values of these keys are too high. "
                "See full summary for more detail.")
    raised = next(iter(IssuesManager().load_issues().values()))
    self.assertEqual([entry['desc'] for entry in raised], [expected])
def test_nf_conntrack_full(self):
    """An 'nf_conntrack: table full' entry in kern.log should raise an
    issue."""
    with tempfile.TemporaryDirectory() as dtmp:
        setup_config(DATA_ROOT=dtmp)
        os.makedirs(os.path.join(dtmp, 'var/log'))
        klog = os.path.join(dtmp, 'var/log/kern.log')
        with open(klog, 'w') as fd:
            fd.write(KERNLOG_NF_CONNTRACK_FULL)

        YScenarioChecker()()
        expected = ("1 reports of 'nf_conntrack: table full' detected in "
                    "kern.log - please check.")
        raised = next(iter(IssuesStore().load().values()))
        self.assertEqual([entry['desc'] for entry in raised], [expected])
def test_cacheset(self):
    """A faulty bcache cacheset should raise both a config issue and the
    LP 1900438 bug."""
    with tempfile.TemporaryDirectory() as dtmp:
        self.setup_bcachefs(dtmp, cacheset_error=True)
        setup_config(DATA_ROOT=dtmp)
        YScenarioChecker()()
        expected_bug = ('bcache cache_available_percent is 33 (i.e. '
                        'approx. 30%) which implies this node could be '
                        'suffering from bug LP 1900438 - please check.')
        expected_issue = ('bcache cacheset config '
                          'congested_write_threshold_us '
                          'expected to be eq 0 but actual=100.')
        raised_issues = next(iter(IssuesManager().load_issues().values()))
        self.assertEqual([entry['desc'] for entry in raised_issues],
                         [expected_issue])
        raised_bugs = next(iter(IssuesManager().load_bugs().values()))
        self.assertEqual([entry['desc'] for entry in raised_bugs],
                         [expected_bug])