def test_zookeeper_hardware_exceptions():
    pod1, svc1, pod2, svc2 = random_pod_choice_for_callbacks()
    chi_name = chi["metadata"]["name"]

    def restart_zookeeper():
        kubectl.launch(
            f"exec -n {kubectl.namespace} zookeeper-0 -- sh -c \"kill 1\"",
            ok_to_fail=True,
        )
        clickhouse.query_with_error(chi_name, "SELECT name, path FROM system.zookeeper WHERE path='/'", host=svc1)
        clickhouse.query_with_error(chi_name, "SELECT name, path FROM system.zookeeper WHERE path='/'", host=svc2)

    with Then("check ClickHouseZooKeeperHardwareExceptions firing"):
        for svc in (svc1, svc2):
            fired = wait_alert_state("ClickHouseZooKeeperHardwareExceptions", "firing", True, labels={"hostname": svc},
                                     time_range='40s', sleep_time=5, callback=restart_zookeeper)
            assert fired, error("can't get ClickHouseZooKeeperHardwareExceptions alert in firing state")

    kubectl.wait_pod_status("zookeeper-0", "Running", ns=kubectl.namespace)
    kubectl.wait_jsonpath("pod", "zookeeper-0", "{.status.containerStatuses[0].ready}", "true", ns=kubectl.namespace)

    with Then("check ClickHouseZooKeeperHardwareExceptions gone away"):
        for svc in (svc1, svc2):
            resolved = wait_alert_state("ClickHouseZooKeeperHardwareExceptions", "firing", False, labels={"hostname": svc})
            assert resolved, error("can't check ClickHouseZooKeeperHardwareExceptions alert is gone away")
def test_read_only_replica():
    read_only_pod, read_only_svc, other_pod, other_svc = random_pod_choice_for_callbacks()
    chi_name = chi["metadata"]["name"]
    create_table_on_cluster(
        'all-replicated', 'default.test_repl',
        '(event_time DateTime, test UInt64) ENGINE ReplicatedMergeTree(\'/clickhouse/tables/{installation}-{shard}/{uuid}/test_repl\', \'{replica}\') ORDER BY tuple()'
    )

    def restart_zookeeper():
        kubectl.launch(
            f"exec -n {kubectl.namespace} zookeeper-0 -- sh -c \"kill 1\"",
            ok_to_fail=True,
        )
        clickhouse.query_with_error(chi_name, "INSERT INTO default.test_repl VALUES(now(),rand())", host=read_only_svc)

    with Then("check ClickHouseReadonlyReplica firing"):
        fired = wait_alert_state("ClickHouseReadonlyReplica", "firing", True, labels={"hostname": read_only_svc},
                                 time_range='30s', sleep_time=5, callback=restart_zookeeper)
        assert fired, error("can't get ClickHouseReadonlyReplica alert in firing state")

    with Then("check ClickHouseReadonlyReplica gone away"):
        resolved = wait_alert_state("ClickHouseReadonlyReplica", "firing", False, labels={"hostname": read_only_svc})
        assert resolved, error("can't check ClickHouseReadonlyReplica alert is gone away")

    kubectl.wait_pod_status("zookeeper-0", "Running", ns=kubectl.namespace)
    kubectl.wait_jsonpath("pod", "zookeeper-0", "{.status.containerStatuses[0].ready}", "true", ns=kubectl.namespace)

    clickhouse.query_with_error(
        chi_name, "SYSTEM RESTART REPLICAS; SYSTEM SYNC REPLICA default.test_repl",
        host=read_only_svc, timeout=240
    )
    clickhouse.query_with_error(
        chi_name, "SYSTEM RESTART REPLICAS; SYSTEM SYNC REPLICA default.test_repl",
        host=other_svc, timeout=240
    )

    drop_table_on_cluster('all-replicated', 'default.test_repl')
def test_distributed_connection_exceptions():
    delayed_pod, delayed_svc, restarted_pod, restarted_svc = random_pod_choice_for_callbacks()
    create_distributed_table_on_cluster()

    def reboot_clickhouse_and_distributed_execution():
        # we need around 70 delayed files to trigger the alert
        insert_sql = 'INSERT INTO default.test_distr(event_time, test) SELECT now(), number FROM system.numbers LIMIT 10000'
        select_sql = 'SELECT count() FROM default.test_distr'
        with Then("reboot clickhouse-server pod"):
            kubectl.launch(
                f"exec -n {kubectl.namespace} {restarted_pod} -c clickhouse -- kill 1",
                ok_to_fail=True,
            )
        with And("Insert to distributed table"):
            clickhouse.query(chi["metadata"]["name"], insert_sql, host=delayed_pod, ns=kubectl.namespace)
        with And("Select from distributed table"):
            clickhouse.query_with_error(chi["metadata"]["name"], select_sql, host=delayed_pod, ns=kubectl.namespace)

    with When("check ClickHouseDistributedConnectionExceptions firing"):
        fired = wait_alert_state("ClickHouseDistributedConnectionExceptions", "firing", True,
                                 labels={"hostname": delayed_svc, "chi": chi["metadata"]["name"]}, time_range='30s',
                                 callback=reboot_clickhouse_and_distributed_execution)
        assert fired, error("can't get ClickHouseDistributedConnectionExceptions alert in firing state")

    with Then("check ClickHouseDistributedConnectionExceptions gone away"):
        resolved = wait_alert_state("ClickHouseDistributedConnectionExceptions", "firing", False, labels={"hostname": delayed_svc})
        assert resolved, error("can't check ClickHouseDistributedConnectionExceptions alert is gone away")

    kubectl.wait_pod_status(restarted_pod, "Running", ns=kubectl.namespace)
    kubectl.wait_jsonpath("pod", restarted_pod, "{.status.containerStatuses[0].ready}", "true", ns=kubectl.namespace)
    drop_distributed_table_on_cluster()
def wait_when_zookeeper_up():
    kubectl.wait_pod_status(zookeeper_pod, "Running", ns=kubectl.namespace)
    kubectl.wait_jsonpath("pod", zookeeper_pod, "{.status.containerStatuses[0].ready}", "true", ns=kubectl.namespace)
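
# A minimal companion sketch, not part of the original suite: the helper name
# `wait_when_zookeeper_ready` and the `retries` parameter are hypothetical. It assumes the
# same module-level names used above and below (`kubectl`, `zookeeper_pod`, `Then`, `fail`,
# `time`) and simply factors out the `echo ruok | nc` polling loop that
# test_read_only_replica() below runs inline, waiting until ZooKeeper answers "imok".
def wait_when_zookeeper_ready(retries=10):
    for i in range(retries + 1):
        zookeeper_status = kubectl.launch(
            f"exec -n {kubectl.namespace} {zookeeper_pod} -- sh -c \"echo ruok | nc 127.0.0.1 2181\"",
            ok_to_fail=True,
        )
        if "imok" in zookeeper_status:
            break
        elif i == retries:
            fail(f"invalid zookeeper status after {i} retries")
        with Then("zookeeper is not ready, wait 2 seconds"):
            time.sleep(2)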
def test_read_only_replica(self):
    read_only_pod, read_only_svc, other_pod, other_svc = alerts.random_pod_choice_for_callbacks(chi)
    chi_name = chi["metadata"]["name"]
    clickhouse.create_table_on_cluster(
        chi, 'all-replicated', 'default.test_repl',
        '(event_time DateTime, test UInt64) ' +
        'ENGINE ReplicatedMergeTree(\'/clickhouse/tables/{installation}-{shard}/test_repl\', \'{replica}\') ORDER BY tuple()'
    )

    def restart_zookeeper():
        kubectl.launch(
            f"exec -n {kubectl.namespace} zookeeper-0 -- sh -c \"kill 1\"",
            ok_to_fail=True,
        )
        clickhouse.query_with_error(chi_name, "INSERT INTO default.test_repl VALUES(now(),rand())", host=read_only_svc)

    with Then("check ClickHouseReadonlyReplica firing"):
        fired = alerts.wait_alert_state(
            "ClickHouseReadonlyReplica", "firing", True, labels={"hostname": read_only_svc},
            time_range='30s', sleep_time=settings.prometheus_scrape_interval, callback=restart_zookeeper)
        assert fired, error("can't get ClickHouseReadonlyReplica alert in firing state")

    with Then("check ClickHouseReadonlyReplica gone away"):
        resolved = alerts.wait_alert_state("ClickHouseReadonlyReplica", "firing", False, labels={"hostname": read_only_svc})
        assert resolved, error("can't check ClickHouseReadonlyReplica alert is gone away")

    kubectl.wait_pod_status("zookeeper-0", "Running", ns=kubectl.namespace)
    kubectl.wait_jsonpath("pod", "zookeeper-0", "{.status.containerStatuses[0].ready}", "true", ns=kubectl.namespace)

    for i in range(11):
        zookeeper_status = kubectl.launch(
            f"exec -n {kubectl.namespace} zookeeper-0 -- sh -c \"echo ruok | nc 127.0.0.1 2181\"",
            ok_to_fail=True)
        if "imok" in zookeeper_status:
            break
        elif i == 10:
            fail(f"invalid zookeeper status after {i} retries")
        with Then("zookeeper is not ready, wait 2 seconds"):
            time.sleep(2)

    clickhouse.query_with_error(
        chi_name, "SYSTEM RESTART REPLICAS; SYSTEM SYNC REPLICA default.test_repl",
        host=read_only_svc, timeout=240)
    clickhouse.query_with_error(
        chi_name, "SYSTEM RESTART REPLICAS; SYSTEM SYNC REPLICA default.test_repl",
        host=other_svc, timeout=240)

    clickhouse.drop_table_on_cluster(chi, 'all-replicated', 'default.test_repl')