def test_detached_parts(self, prometheus_operator_spec, clickhouse_operator_spec, chi):
    """Scenario for the ClickHouseDetachedParts alert.

    Creates the default test table, then uses ``alerts.wait_alert_state`` twice
    on the pod/service pair returned by ``alerts.random_pod_choice_for_callbacks``:
    first with a callback that inserts rows and detaches the newest part
    (alert expected to be reported as firing), then with a callback that
    re-attaches every detached part (alert expected to stop firing).
    The test table is dropped at the end.
    """
    clickhouse.create_table_on_cluster(chi)
    detached_pod, detached_svc, _, _ = alerts.random_pod_choice_for_callbacks(chi)
    chi_name = chi["metadata"]["name"]

    def detach_latest_part():
        # Insert 100 rows, look up the most recently modified part and detach it.
        clickhouse.query(
            chi_name,
            "INSERT INTO default.test SELECT now(), number FROM numbers(100)",
            pod=detached_pod,
        )
        latest_part = clickhouse.query(
            chi_name,
            sql="SELECT name FROM system.parts WHERE database='default' AND table='test' ORDER BY modification_time DESC LIMIT 1",
            pod=detached_pod,
        )
        clickhouse.query(
            chi_name,
            f"ALTER TABLE default.test DETACH PART '{latest_part}'",
            pod=detached_pod,
        )

    def reattach_detached_parts():
        # Build one multi-statement query that attaches every part listed in
        # system.detached_parts with an empty reason; skip the call when
        # nothing is listed.
        listing = clickhouse.query(
            chi_name,
            "SELECT name FROM system.detached_parts WHERE database='default' AND table='test' AND reason=''",
            pod=detached_pod,
        )
        attach_sql = "".join(
            f"ALTER TABLE default.test ATTACH PART '{part}';"
            for part in listing.splitlines()
        )
        if attach_sql.strip():
            clickhouse.query(chi_name, attach_sql, pod=detached_pod)

    with When("check ClickHouseDetachedParts firing"):
        is_firing = alerts.wait_alert_state(
            "ClickHouseDetachedParts",
            "firing",
            True,
            labels={"hostname": detached_svc, "chi": chi_name},
            time_range='30s',
            callback=detach_latest_part,
        )
        assert is_firing, error("can't get ClickHouseDetachedParts alert in firing state")

    with Then("check ClickHouseDetachedParts gone away"):
        is_resolved = alerts.wait_alert_state(
            "ClickHouseDetachedParts",
            "firing",
            False,
            labels={"hostname": detached_svc},
            callback=reattach_detached_parts,
        )
        assert is_resolved, error("can't check ClickHouseDetachedParts alert is gone away")

    clickhouse.drop_table_on_cluster(chi)
def test_read_only_replica(self, prometheus_operator_spec, clickhouse_operator_spec, chi):
    """Scenario for the ClickHouseReadonlyReplica alert.

    Creates the replicated table ``default.test_repl``. The callback passed to
    the "firing" wait kills PID 1 inside ``zookeeper-0`` and attempts an insert
    through the selected service. After the alert is reported as firing and
    then as no longer firing, the scenario waits for ``zookeeper-0`` to be
    Running/ready and to answer ``ruok`` with ``imok`` (at most 11 probes,
    2 seconds apart), restarts and syncs replicas through both services and
    drops the table.
    """
    _, read_only_svc, _, other_svc = alerts.random_pod_choice_for_callbacks(chi)
    chi_name = chi["metadata"]["name"]
    table_schema = (
        "(event_time DateTime, test UInt64) "
        "ENGINE ReplicatedMergeTree('/clickhouse/tables/{installation}-{shard}/test_repl', '{replica}') ORDER BY tuple()"
    )
    clickhouse.create_table_on_cluster(chi, 'all-replicated', 'default.test_repl', table_schema)

    def kill_zookeeper_and_insert():
        # Killing PID 1 is allowed to fail; the insert is issued through the
        # error-tolerant query helper.
        kubectl.launch(
            f"exec -n {kubectl.namespace} zookeeper-0 -- sh -c \"kill 1\"",
            ok_to_fail=True,
        )
        clickhouse.query_with_error(
            chi_name,
            "INSERT INTO default.test_repl VALUES(now(),rand())",
            host=read_only_svc,
        )

    with Then("check ClickHouseReadonlyReplica firing"):
        is_firing = alerts.wait_alert_state(
            "ClickHouseReadonlyReplica",
            "firing",
            True,
            labels={"hostname": read_only_svc},
            time_range='30s',
            sleep_time=settings.prometheus_scrape_interval,
            callback=kill_zookeeper_and_insert,
        )
        assert is_firing, error("can't get ClickHouseReadonlyReplica alert in firing state")

    with Then("check ClickHouseReadonlyReplica gone away"):
        is_resolved = alerts.wait_alert_state(
            "ClickHouseReadonlyReplica",
            "firing",
            False,
            labels={"hostname": read_only_svc},
        )
        assert is_resolved, error("can't check ClickHouseReadonlyReplica alert is gone away")

    kubectl.wait_pod_status("zookeeper-0", "Running", ns=kubectl.namespace)
    kubectl.wait_jsonpath(
        "pod",
        "zookeeper-0",
        "{.status.containerStatuses[0].ready}",
        "true",
        ns=kubectl.namespace,
    )

    # Probe ZooKeeper with the four-letter `ruok` command until it answers `imok`.
    last_attempt = 10
    for attempt in range(last_attempt + 1):
        probe_output = kubectl.launch(
            f"exec -n {kubectl.namespace} zookeeper-0 -- sh -c \"echo ruok | nc 127.0.0.1 2181\"",
            ok_to_fail=True,
        )
        if "imok" in probe_output:
            break
        if attempt == last_attempt:
            fail(f"invalid zookeeper status after {attempt} retries")
        with Then("zookeper is not ready, wait 2 seconds"):
            time.sleep(2)

    # Same resync statement through both services, read-only one first.
    for svc in (read_only_svc, other_svc):
        clickhouse.query_with_error(
            chi_name,
            "SYSTEM RESTART REPLICAS; SYSTEM SYNC REPLICA default.test_repl",
            host=svc,
            timeout=240,
        )

    clickhouse.drop_table_on_cluster(chi, 'all-replicated', 'default.test_repl')
def insert_replicated_data(chi, create_tables, insert_tables, pod="chi-test-cluster-for-zk-default-0-1-0"):
    """Create replicated tables if missing and insert 1000 rows into some of them.

    Args:
        chi: installation manifest; ``chi['metadata']['name']`` is passed to
            ``clickhouse.query`` as the installation name.
        create_tables: iterable of table names (without the ``default.``
            prefix) to create with ``if_not_exists=True``.
        insert_tables: iterable of table names (without the ``default.``
            prefix) that receive one 1000-row insert each.
        pod: pod on which the INSERT statements are executed. Defaults to the
            previously hard-coded pod name, so existing callers are unaffected.
    """
    with When(f'create if not exists replicated tables {create_tables}'):
        for table in create_tables:
            clickhouse.create_table_on_cluster(
                chi,
                'all-sharded',
                f'default.{table}',
                # Doubled braces keep {shard}/{replica} literal in the emitted DDL.
                f"(id UInt64) ENGINE=ReplicatedMergeTree('/clickhouse/tables/default.{table}/{{shard}}','{{replica}}') ORDER BY (id)",
                if_not_exists=True,
            )

    with When(f'insert tables data {insert_tables}'):
        chi_name = chi['metadata']['name']
        for table in insert_tables:
            clickhouse.query(
                chi_name,
                f'INSERT INTO default.{table} SELECT rand()+number FROM numbers(1000)',
                pod=pod,
            )
def test_replicas_max_absolute_delay(self, prometheus_operator_spec, clickhouse_operator_spec, chi):
    """Scenario for the ClickHouseReplicasMaxAbsoluteDelay alert.

    Creates the replicated table ``default.test_repl``. The callback passed to
    the "firing" wait stops fetches for that table on one pod and inserts
    100000 rows on another pod. Once the alert is reported as firing, fetches
    are started again and replicas are restarted and synced through the
    stopped replica's service; the alert is then expected to stop firing and
    the table is dropped.
    """
    stop_replica_pod, stop_replica_svc, insert_pod, _ = alerts.random_pod_choice_for_callbacks(chi)
    clickhouse.create_table_on_cluster(
        chi,
        'all-replicated',
        'default.test_repl',
        "(event_time DateTime, test UInt64) "
        "ENGINE ReplicatedMergeTree('/clickhouse/tables/{installation}-{shard}/test_repl', '{replica}') ORDER BY tuple()",
    )
    # NOTE(review): hard-coded here, while other scenarios in this file read
    # settings.prometheus_scrape_interval — confirm whether 15 is intentional.
    prometheus_scrape_interval = 15

    def stop_fetches_and_insert_to_replicated_table():
        with When(f"stop replica fetches on {stop_replica_svc}"):
            # Stopping fetches is allowed to fail; the insert on the other pod is not.
            sql = "SYSTEM STOP FETCHES default.test_repl"
            kubectl.launch(
                f"exec -n {kubectl.namespace} {stop_replica_pod} -c clickhouse-pod -- clickhouse-client -q \"{sql}\"",
                ok_to_fail=True,
                timeout=600,
            )
            sql = "INSERT INTO default.test_repl SELECT now(), number FROM numbers(100000)"
            kubectl.launch(
                f"exec -n {kubectl.namespace} {insert_pod} -c clickhouse-pod -- clickhouse-client -q \"{sql}\"",
            )

    with Then("check ClickHouseReplicasMaxAbsoluteDelay firing"):
        fired = alerts.wait_alert_state(
            "ClickHouseReplicasMaxAbsoluteDelay",
            "firing",
            True,
            labels={"hostname": stop_replica_svc},
            time_range='60s',
            sleep_time=prometheus_scrape_interval * 2,
            callback=stop_fetches_and_insert_to_replicated_table,
        )
        # The message previously named ClickHouseReadonlyReplica (copy-paste slip).
        assert fired, error("can't get ClickHouseReplicasMaxAbsoluteDelay alert in firing state")

    clickhouse.query(
        chi["metadata"]["name"],
        "SYSTEM START FETCHES; SYSTEM RESTART REPLICAS; SYSTEM SYNC REPLICA default.test_repl",
        host=stop_replica_svc,
        timeout=240,
    )

    with Then("check ClickHouseReplicasMaxAbsoluteDelay gone away"):
        resolved = alerts.wait_alert_state(
            "ClickHouseReplicasMaxAbsoluteDelay",
            "firing",
            False,
            labels={"hostname": stop_replica_svc},
        )
        assert resolved, error("can't check ClickHouseReplicasMaxAbsoluteDelay alert is gone away")

    clickhouse.drop_table_on_cluster(chi, 'all-replicated', 'default.test_repl')
def test_insert_related_alerts(self, prometheus_operator_spec, clickhouse_operator_spec, chi):
    """Scenario for the insert-related alerts.

    Phase 1 stops merges on one service and inserts 150 single-row parts, then
    expects ClickHouseDelayedInsertThrottling, ClickHouseMaxPartCountForPartition
    and ClickHouseLowInsertedRowsPerQuery to be reported as firing and, after
    merges are started again, to stop firing.
    Phase 2 repeats the inserts with 300 parts on the other service and expects
    ClickHouseRejectedInsert to fire and then stop firing, after which merges
    are started again and the test table is dropped.
    """
    clickhouse.create_table_on_cluster(chi)
    _, delayed_svc, _, rejected_svc = alerts.random_pod_choice_for_callbacks(chi)

    scrape_interval = settings.prometheus_scrape_interval
    # default values in system.merge_tree_settings
    parts_to_throw_insert = 300
    parts_to_delay_insert = 150
    chi_name = chi["metadata"]["name"]

    def insert_many_parts(part_count, target_svc):
        """Stop merges and insert `part_count` one-row blocks through `target_svc`."""
        stop_merges = "SYSTEM STOP MERGES default.test;"
        min_block = "SET max_block_size=1; SET max_insert_block_size=1; SET min_insert_block_size_rows=1;"
        bulk_sql = (
            stop_merges
            + min_block
            + f"INSERT INTO default.test(event_time, test) SELECT now(),number FROM system.numbers LIMIT {part_count};"
        )
        single_row_sql = (
            min_block
            + "INSERT INTO default.test(event_time, test) SELECT now(), number FROM system.numbers LIMIT 1;"
        )
        with When(f"Insert to MergeTree table {part_count} parts"):
            clickhouse.query(chi_name, bulk_sql, host=target_svc, ns=kubectl.namespace)
            # One more insert on top of the limit; it goes through the
            # error-tolerant query helper.
            clickhouse.query_with_error(chi_name, single_row_sql, host=target_svc, ns=kubectl.namespace)

            with Then(f"wait prometheus_scrape_interval={scrape_interval}*2 sec"):
                time.sleep(scrape_interval * 2)

            with Then("after 21.8 InsertedRows include system.* rows"):
                for _ in range(35):
                    clickhouse.query_with_error(chi_name, single_row_sql, host=target_svc, ns=kubectl.namespace)

    def start_merges(target_svc):
        clickhouse.query(chi_name, "SYSTEM START MERGES default.test", host=target_svc, ns=kubectl.namespace)

    # Phase 1: delayed inserts.
    delayed_alerts = (
        ("ClickHouseDelayedInsertThrottling", "60s"),
        ("ClickHouseMaxPartCountForPartition", "90s"),
        ("ClickHouseLowInsertedRowsPerQuery", "120s"),
    )
    insert_many_parts(parts_to_delay_insert, delayed_svc)

    for alert_name, window in delayed_alerts:
        with Then(f"check {alert_name} firing"):
            is_firing = alerts.wait_alert_state(
                alert_name,
                "firing",
                True,
                labels={"hostname": delayed_svc},
                time_range=window,
            )
            assert is_firing, error(f"can't get {alert_name} alert in firing state")

    start_merges(delayed_svc)

    for alert_name, _ in delayed_alerts:
        with Then(f"check {alert_name} gone away"):
            is_resolved = alerts.wait_alert_state(
                alert_name,
                "firing",
                False,
                labels={"hostname": delayed_svc},
            )
            assert is_resolved, error(f"can't check {alert_name} alert is gone away")

    # Phase 2: rejected inserts.
    insert_many_parts(parts_to_throw_insert, rejected_svc)

    with Then("check ClickHouseRejectedInsert firing"):
        is_firing = alerts.wait_alert_state(
            "ClickHouseRejectedInsert",
            "firing",
            True,
            labels={"hostname": rejected_svc},
            time_range="30s",
            sleep_time=settings.prometheus_scrape_interval,
        )
        assert is_firing, error("can't get ClickHouseRejectedInsert alert in firing state")

    with Then("check ClickHouseRejectedInsert gone away"):
        is_resolved = alerts.wait_alert_state(
            "ClickHouseRejectedInsert",
            "firing",
            False,
            labels={"hostname": rejected_svc},
        )
        assert is_resolved, error("can't check ClickHouseRejectedInsert alert is gone away")

    start_merges(rejected_svc)
    clickhouse.drop_table_on_cluster(chi)