def test_read_only_replica():
    """Check that ClickHouseReadonlyReplica fires while zookeeper is down and resolves after recovery."""
    read_only_pod, read_only_svc, other_pod, other_svc = random_pod_choice_for_callbacks()
    chi_name = chi["metadata"]["name"]
    create_table_on_cluster(
        'all-replicated', 'default.test_repl',
        '(event_time DateTime, test UInt64) ENGINE ReplicatedMergeTree(\'/clickhouse/tables/{installation}-{shard}/{uuid}/test_repl\', \'{replica}\') ORDER BY tuple()'
    )

    def kill_zookeeper():
        # Killing PID 1 restarts the zookeeper container; replicas lose their sessions.
        kubectl.launch(
            f"exec -n {kubectl.namespace} zookeeper-0 -- sh -c \"kill 1\"",
            ok_to_fail=True,
        )

    clickhouse.query_with_error(chi_name, "INSERT INTO default.test_repl VALUES(now(),rand())", host=read_only_svc)

    with Then("check ClickHouseReadonlyReplica firing"):
        alert_fired = wait_alert_state(
            "ClickHouseReadonlyReplica", "firing", True,
            labels={"hostname": read_only_svc},
            time_range='30s', sleep_time=5, callback=kill_zookeeper,
        )
        assert alert_fired, error("can't get ClickHouseReadonlyReplica alert in firing state")
    with Then("check ClickHouseReadonlyReplica gone away"):
        alert_resolved = wait_alert_state(
            "ClickHouseReadonlyReplica", "firing", False,
            labels={"hostname": read_only_svc},
        )
        assert alert_resolved, error("can't check ClickHouseReadonlyReplica alert is gone away")

    kubectl.wait_pod_status("zookeeper-0", "Running", ns=kubectl.namespace)
    kubectl.wait_jsonpath("pod", "zookeeper-0", "{.status.containerStatuses[0].ready}", "true", ns=kubectl.namespace)

    # Re-attach both replicas after zookeeper is back.
    sync_sql = "SYSTEM RESTART REPLICAS; SYSTEM SYNC REPLICA default.test_repl"
    for svc in (read_only_svc, other_svc):
        clickhouse.query_with_error(chi_name, sync_sql, host=svc, timeout=240)

    drop_table_on_cluster('all-replicated', 'default.test_repl')
def test_zookeeper_hardware_exceptions():
    """ClickHouseZooKeeperHardwareExceptions should fire on both chosen replicas during a zookeeper restart, then clear."""
    pod1, svc1, pod2, svc2 = random_pod_choice_for_callbacks()
    chi_name = chi["metadata"]["name"]

    def kill_zookeeper():
        # Restart the zookeeper container by killing PID 1; errors are tolerated.
        kubectl.launch(
            f"exec -n {kubectl.namespace} zookeeper-0 -- sh -c \"kill 1\"",
            ok_to_fail=True,
        )

    # Touch system.zookeeper on both replicas to provoke hardware exceptions.
    for svc in (svc1, svc2):
        clickhouse.query_with_error(chi_name, "SELECT name, path FROM system.zookeeper WHERE path='/'", host=svc)

    with Then("check ClickHouseZooKeeperHardwareExceptions firing"):
        for svc in (svc1, svc2):
            fired = wait_alert_state(
                "ClickHouseZooKeeperHardwareExceptions", "firing", True,
                labels={"hostname": svc}, time_range='40s', sleep_time=5,
                callback=kill_zookeeper,
            )
            assert fired, error("can't get ClickHouseZooKeeperHardwareExceptions alert in firing state")

    kubectl.wait_pod_status("zookeeper-0", "Running", ns=kubectl.namespace)
    kubectl.wait_jsonpath("pod", "zookeeper-0", "{.status.containerStatuses[0].ready}", "true", ns=kubectl.namespace)

    with Then("check ClickHouseZooKeeperHardwareExceptions gone away"):
        for svc in (svc1, svc2):
            resolved = wait_alert_state("ClickHouseZooKeeperHardwareExceptions", "firing", False, labels={"hostname": svc})
            assert resolved, error("can't check ClickHouseZooKeeperHardwareExceptions alert is gone away")
def require_zookeeper(manifest='zookeeper-1-node-1GB-for-tests-only.yaml', force_install=False):
    """Install the given zookeeper manifest when no zookeeper service exists (or when forced), and wait for it."""
    with Given("Install Zookeeper if missing"):
        zk_absent = kubectl.get_count("service", name="zookeeper") == 0
        if force_install or zk_absent:
            config = util.get_full_path(f"../deploy/zookeeper/quick-start-persistent-volume/{manifest}")
            kubectl.apply(config)
            kubectl.wait_object("pod", "zookeeper-0")
            kubectl.wait_pod_status("zookeeper-0", "Running")
def test_distributed_connection_exceptions():
    """DistributedConnectionExceptions should fire when a remote shard reboots mid-query, then resolve."""
    delayed_pod, delayed_svc, restarted_pod, restarted_svc = random_pod_choice_for_callbacks()
    create_distributed_table_on_cluster()

    def reboot_clickhouse_and_distributed_execution():
        # we need 70 delayed files for catch
        insert_sql = 'INSERT INTO default.test_distr(event_time, test) SELECT now(), number FROM system.numbers LIMIT 10000'
        select_sql = 'SELECT count() FROM default.test_distr'
        with Then("reboot clickhouse-server pod"):
            kubectl.launch(
                f"exec -n {kubectl.namespace} {restarted_pod} -c clickhouse -- kill 1",
                ok_to_fail=True,
            )
        with And("Insert to distributed table"):
            clickhouse.query(chi["metadata"]["name"], insert_sql, host=delayed_pod, ns=kubectl.namespace)
        with And("Select from distributed table"):
            clickhouse.query_with_error(chi["metadata"]["name"], select_sql, host=delayed_pod, ns=kubectl.namespace)

    with When("check ClickHouseDistributedConnectionExceptions firing"):
        fired = wait_alert_state(
            "ClickHouseDistributedConnectionExceptions", "firing", True,
            labels={"hostname": delayed_svc, "chi": chi["metadata"]["name"]},
            time_range='30s',
            callback=reboot_clickhouse_and_distributed_execution,
        )
        assert fired, error("can't get ClickHouseDistributedConnectionExceptions alert in firing state")
    with Then("check DistributedConnectionExpections gone away"):
        resolved = wait_alert_state(
            "ClickHouseDistributedConnectionExceptions", "firing", False,
            labels={"hostname": delayed_svc},
        )
        assert resolved, error("can't check ClickHouseDistributedConnectionExceptions alert is gone away")

    kubectl.wait_pod_status(restarted_pod, "Running", ns=kubectl.namespace)
    kubectl.wait_jsonpath("pod", restarted_pod, "{.status.containerStatuses[0].ready}", "true", ns=kubectl.namespace)
    drop_distributed_table_on_cluster()
def wait_when_zookeeper_up():
    """Block until the zookeeper pod is Running and its first container reports ready."""
    ns = kubectl.namespace
    kubectl.wait_pod_status(zookeeper_pod, "Running", ns=ns)
    ready_jsonpath = "{.status.containerStatuses[0].ready}"
    kubectl.wait_jsonpath("pod", zookeeper_pod, ready_jsonpath, "true", ns=ns)
def restart_operator(ns=settings.operator_namespace, timeout=60):
    """Delete the clickhouse-operator pod and wait until its replacement reaches Running."""
    label = "-l app=clickhouse-operator"

    def current_operator_pod():
        # First pod matching the operator label selector.
        return kubectl.get("pod", name="", ns=ns, label=label)["items"][0]["metadata"]["name"]

    kubectl.launch(f"delete pod {current_operator_pod()}", ns=ns, timeout=timeout)
    kubectl.wait_object("pod", name="", ns=ns, label=label)
    # Re-resolve the name: the replacement pod has a different generated suffix.
    kubectl.wait_pod_status(current_operator_pod(), "Running", ns=ns)
def test_distributed_files_to_insert():
    """ClickHouseDistributedFilesToInsertHigh should fire while distributed sends are stopped
    and the remote replica keeps restarting, then resolve after sends are re-enabled.

    Inserts into the Distributed table accumulate as delayed .bin files on disk while
    SENDS are stopped; the loop keeps inserting until >55 delayed files are observed
    (via both system.metrics and the on-disk file count).
    """
    delayed_pod, delayed_svc, restarted_pod, restarted_svc = random_pod_choice_for_callbacks()
    create_distributed_table_on_cluster()

    insert_sql = 'INSERT INTO default.test_distr(event_time, test) SELECT now(), number FROM system.numbers LIMIT 1000'
    clickhouse.query(
        chi["metadata"]["name"], 'SYSTEM STOP DISTRIBUTED SENDS default.test_distr',
        pod=delayed_pod, ns=kubectl.namespace
    )

    files_to_insert_from_metrics = 0
    files_to_insert_from_disk = 0
    tries = 0
    # we need more than 50 delayed files for catch
    while files_to_insert_from_disk <= 55 and files_to_insert_from_metrics <= 55 and tries < 500:
        kubectl.launch(
            f"exec -n {kubectl.namespace} {restarted_pod} -c clickhouse -- kill 1",
            ok_to_fail=True,
        )
        clickhouse.query(chi["metadata"]["name"], insert_sql, pod=delayed_pod, host=delayed_pod, ns=kubectl.namespace)
        files_to_insert_from_metrics = clickhouse.query(
            chi["metadata"]["name"], "SELECT value FROM system.metrics WHERE metric='DistributedFilesToInsert'",
            pod=delayed_pod, ns=kubectl.namespace
        )
        files_to_insert_from_metrics = int(files_to_insert_from_metrics)
        files_to_insert_from_disk = int(kubectl.launch(
            f"exec -n {kubectl.namespace} {delayed_pod} -c clickhouse -- bash -c 'ls -la /var/lib/clickhouse/data/default/test_distr/*/*.bin 2>/dev/null | wc -l'",
            ok_to_fail=False,
        ))
        # BUG FIX: `tries` was never incremented, so the `tries < 500` safety cap
        # could never stop the loop when delayed files failed to accumulate.
        tries += 1

    with When("reboot clickhouse-server pod"):
        fired = wait_alert_state(
            "ClickHouseDistributedFilesToInsertHigh", "firing", True,
            labels={"hostname": delayed_svc, "chi": chi["metadata"]["name"]}
        )
        assert fired, error("can't get ClickHouseDistributedFilesToInsertHigh alert in firing state")

    kubectl.wait_pod_status(restarted_pod, "Running", ns=kubectl.namespace)

    # Re-enable distributed sends so the delayed files drain and the alert clears.
    clickhouse.query(
        chi["metadata"]["name"], 'SYSTEM START DISTRIBUTED SENDS default.test_distr',
        pod=delayed_pod, ns=kubectl.namespace
    )

    with Then("check ClickHouseDistributedFilesToInsertHigh gone away"):
        resolved = wait_alert_state(
            "ClickHouseDistributedFilesToInsertHigh", "firing", False,
            labels={"hostname": delayed_svc})
        assert resolved, error("can't check ClickHouseDistributedFilesToInsertHigh alert is gone away")
    drop_distributed_table_on_cluster()
def test_zookeeper_rescale(self):
    """Replicated data must survive scaling zookeeper 1 -> 3 -> 1 nodes: an insert
    between each rescale should land, totaling 3000 rows at the end."""
    with When('create replicated table'):
        clickhouse.create_table_on_cluster(
            chi, 'all-sharded', 'default.zk_repl',
            '(id UInt64) ENGINE=ReplicatedMergeTree(\'/clickhouse/tables/default.zk_repl/{shard}\',\'{replica}\') ORDER BY (id)'
        )
    with Then('insert data x1'):
        clickhouse.query(
            chi['metadata']['name'], 'INSERT INTO default.zk_repl SELECT number FROM numbers(1000)',
            pod="chi-test-cluster-for-zk-default-0-0-0")
    with Then('scale up zookeeper to 3 nodes'):
        util.require_zookeeper('zookeeper-3-nodes-1GB-for-tests-only.yaml', force_install=True)
        for zk_pod in ('zookeeper-0', 'zookeeper-1', 'zookeeper-2'):
            kubectl.wait_pod_status(zk_pod, 'Running', settings.test_namespace)
    with Then('insert data x2'):
        clickhouse.query(
            chi['metadata']['name'], 'INSERT INTO default.zk_repl SELECT number*2 FROM numbers(1000)',
            pod="chi-test-cluster-for-zk-default-0-1-0")
    with Then('scale down zookeeper to 1 nodes'):
        util.require_zookeeper('zookeeper-1-node-1GB-for-tests-only.yaml', force_install=True)
        kubectl.wait_pod_status('zookeeper-0', 'Running', settings.test_namespace)
    # BUG FIX: this step was mislabeled 'insert data x2' although it inserts the third batch.
    with Then('insert data x3'):
        clickhouse.query(
            chi['metadata']['name'], 'INSERT INTO default.zk_repl SELECT number*3 FROM numbers(1000)',
            pod="chi-test-cluster-for-zk-default-0-0-0")
    assert clickhouse.query(
        chi['metadata']['name'], 'SELECT count() FROM default.zk_repl',
        pod="chi-test-cluster-for-zk-default-0-1-0"
    ) == '3000', "Invalid rows after 3x1000 inserts"
    clickhouse.drop_table_on_cluster(chi, 'all-sharded', 'default.zk_repl')
def test_014():
    # End-to-end replication scenario: schema objects must propagate to added
    # replicas, removed replicas must disappear from system.replicas, and the
    # cluster must recover read-write mode after a Zookeeper outage.
    require_zookeeper()
    create_table = """ CREATE TABLE test_local(a Int8) Engine = ReplicatedMergeTree('/clickhouse/{installation}/{cluster}/tables/{shard}/{database}/{table}', '{replica}') PARTITION BY tuple() ORDER BY a """.replace('\r', '').replace('\n', '')
    config = "configs/test-014-replication-1.yaml"
    # NOTE: `chi` here is a local string (the installation name), shadowing any module-level `chi`.
    chi = manifest.get_chi_name(util.get_full_path(config))
    cluster = "default"
    kubectl.create_and_check(
        config=config,
        check={
            "apply_templates": {
                settings.clickhouse_template,
                "templates/tpl-persistent-volume-100Mi.yaml",
            },
            "object_counts": {
                "statefulset": 2,
                "pod": 2,
                "service": 3,
            },
            "do_not_delete": 1,
        })
    # Captured to later verify the first pod is NOT restarted by re-configuration.
    start_time = kubectl.get_field("pod", f"chi-{chi}-{cluster}-0-0-0", ".status.startTime")
    schema_objects = ['test_local', 'test_view', 'test_mv', 'a_view']
    with Given("Create schema objects"):
        clickhouse.query(chi, create_table, host=f"chi-{chi}-{cluster}-0-0")
        clickhouse.query(chi, "CREATE VIEW test_view as SELECT * from test_local", host=f"chi-{chi}-{cluster}-0-0")
        clickhouse.query(chi, "CREATE VIEW a_view as SELECT * from test_view", host=f"chi-{chi}-{cluster}-0-0")
        clickhouse.query(
            chi,
            "CREATE MATERIALIZED VIEW test_mv Engine = Log as SELECT * from test_local",
            host=f"chi-{chi}-{cluster}-0-0")
        clickhouse.query(
            chi,
            "CREATE DICTIONARY test_dict (a Int8, b Int8) PRIMARY KEY a SOURCE(CLICKHOUSE(host 'localhost' port 9000 table 'test_local' user 'default')) LAYOUT(FLAT()) LIFETIME(0)",
            host=f"chi-{chi}-{cluster}-0-0")
    with Given(
            "Replicated table is created on a first replica and data is inserted"
    ):
        clickhouse.query(chi, "INSERT INTO test_local values(1)", host=f"chi-{chi}-{cluster}-0-0")
        with When("Table is created on the second replica"):
            clickhouse.query(chi, create_table, host=f"chi-{chi}-{cluster}-0-1")
            # Give some time for replication to catch up
            time.sleep(10)
            with Then("Data should be replicated"):
                out = clickhouse.query(chi, "SELECT a FROM test_local", host=f"chi-{chi}-{cluster}-0-1")
                assert out == "1"
    with When("Add one more replica"):
        kubectl.create_and_check(
            config="configs/test-014-replication-2.yaml",
            check={
                "pod_count": 3,
                "do_not_delete": 1,
            })
        # Give some time for replication to catch up
        time.sleep(10)
        # Existing pod must not be restarted when a replica is added.
        new_start_time = kubectl.get_field("pod", f"chi-{chi}-{cluster}-0-0-0", ".status.startTime")
        assert start_time == new_start_time
        with Then("Schema objects should be migrated to the new replica"):
            for obj in schema_objects:
                out = clickhouse.query(
                    chi,
                    f"SELECT count() FROM system.tables WHERE name = '{obj}'",
                    host=f"chi-{chi}-{cluster}-0-2")
                assert out == "1"
            # Check dictionary
            out = clickhouse.query(
                chi,
                f"SELECT count() FROM system.dictionaries WHERE name = 'test_dict'",
                host=f"chi-{chi}-{cluster}-0-2")
            assert out == "1"
        with And("Replicated table should have the data"):
            out = clickhouse.query(chi, "SELECT a FROM test_local", host=f"chi-{chi}-{cluster}-0-2")
            assert out == "1"
    with When("Remove replica"):
        kubectl.create_and_check(
            config=config,
            check={
                "pod_count": 1,
                "do_not_delete": 1,
            })
        # Scale-down must also leave the first pod untouched.
        new_start_time = kubectl.get_field("pod", f"chi-{chi}-{cluster}-0-0-0", ".status.startTime")
        assert start_time == new_start_time
        with Then("Replica needs to be removed from the Zookeeper as well"):
            # NOTE(review): this counts rows in system.replicas on the surviving node;
            # presumably the operator prunes removed replicas from ZK — confirm coverage.
            out = clickhouse.query(
                chi, "SELECT count() FROM system.replicas WHERE table='test_local'")
            assert out == "1"
    with When("Restart Zookeeper pod"):
        with Then("Delete Zookeeper pod"):
            kubectl.launch("delete pod zookeeper-0")
            time.sleep(1)
        with Then(
                "Insert into the table while there is no Zookeeper -- table should be in readonly mode"
        ):
            out = clickhouse.query_with_error(
                chi, "INSERT INTO test_local values(2)")
            assert "Table is in readonly mode" in out
        with Then("Wait for Zookeeper pod to come back"):
            kubectl.wait_object("pod", "zookeeper-0")
            kubectl.wait_pod_status("zookeeper-0", "Running")
        with Then(
                "Wait for ClickHouse to reconnect to Zookeeper and switch to read-write mode"
        ):
            time.sleep(30)
        # with Then("Restart clickhouse pods"):
        #     kubectl("delete pod chi-test-014-replication-default-0-0-0")
        #     kubectl("delete pod chi-test-014-replication-default-0-1-0")
        with Then("Table should be back to normal"):
            clickhouse.query(chi, "INSERT INTO test_local values(3)")
    kubectl.delete_chi("test-014-replication")
def test_read_only_replica(self):
    """ClickHouseReadonlyReplica must fire while zookeeper is down and resolve once
    zookeeper answers "imok" and the replicas are restarted/synced."""
    read_only_pod, read_only_svc, other_pod, other_svc = alerts.random_pod_choice_for_callbacks(
        chi)
    chi_name = chi["metadata"]["name"]
    clickhouse.create_table_on_cluster(
        chi, 'all-replicated', 'default.test_repl',
        '(event_time DateTime, test UInt64) ' +
        'ENGINE ReplicatedMergeTree(\'/clickhouse/tables/{installation}-{shard}/test_repl\', \'{replica}\') ORDER BY tuple()'
    )

    def restart_zookeeper():
        # Killing PID 1 restarts the zookeeper container and drops client sessions.
        kubectl.launch(
            f"exec -n {kubectl.namespace} zookeeper-0 -- sh -c \"kill 1\"",
            ok_to_fail=True,
        )

    clickhouse.query_with_error(
        chi_name, "INSERT INTO default.test_repl VALUES(now(),rand())",
        host=read_only_svc)

    with Then("check ClickHouseReadonlyReplica firing"):
        fired = alerts.wait_alert_state(
            "ClickHouseReadonlyReplica", "firing", True,
            labels={"hostname": read_only_svc},
            time_range='30s',
            sleep_time=settings.prometheus_scrape_interval,
            callback=restart_zookeeper)
        assert fired, error(
            "can't get ClickHouseReadonlyReplica alert in firing state")
    with Then("check ClickHouseReadonlyReplica gone away"):
        resolved = alerts.wait_alert_state(
            "ClickHouseReadonlyReplica", "firing", False,
            labels={"hostname": read_only_svc})
        assert resolved, error(
            "can't check ClickHouseReadonlyReplica alert is gone away")

    kubectl.wait_pod_status("zookeeper-0", "Running", ns=kubectl.namespace)
    kubectl.wait_jsonpath("pod", "zookeeper-0", "{.status.containerStatuses[0].ready}", "true",
                          ns=kubectl.namespace)

    # Poll zookeeper's 4-letter-word health command until it answers "imok",
    # failing after 11 attempts.
    for i in range(11):
        zookeeper_status = kubectl.launch(
            f"exec -n {kubectl.namespace} zookeeper-0 -- sh -c \"echo ruok | nc 127.0.0.1 2181\"",
            ok_to_fail=True)
        if "imok" in zookeeper_status:
            break
        elif i == 10:
            fail(f"invalid zookeeper status after {i} retries")
        # BUG FIX: step description had a typo ("zookeper").
        with Then("zookeeper is not ready, wait 2 seconds"):
            time.sleep(2)

    clickhouse.query_with_error(
        chi_name, "SYSTEM RESTART REPLICAS; SYSTEM SYNC REPLICA default.test_repl",
        host=read_only_svc, timeout=240)
    clickhouse.query_with_error(
        chi_name, "SYSTEM RESTART REPLICAS; SYSTEM SYNC REPLICA default.test_repl",
        host=other_svc, timeout=240)

    clickhouse.drop_table_on_cluster(chi, 'all-replicated', 'default.test_repl')