Example #1
def test_zookeeper_hardware_exceptions():
    pod1, svc1, pod2, svc2 = random_pod_choice_for_callbacks()
    chi_name = chi["metadata"]["name"]

    def restart_zookeeper():
        kubectl.kubectl(
            f"exec -n {kubectl.namespace} zookeeper-0 -- sh -c \"kill 1\"",
            ok_to_fail=True,
        )
        clickhouse.clickhouse_query_with_error(chi_name, "SELECT name, path FROM system.zookeeper WHERE path='/'", host=svc1)
        clickhouse.clickhouse_query_with_error(chi_name, "SELECT name, path FROM system.zookeeper WHERE path='/'", host=svc2)

    with Then("check ZooKeeperHardwareExceptions firing"):
        for svc in (svc1, svc2):
            fired = wait_alert_state("ZooKeeperHardwareExceptions", "firing", True, labels={"hostname": svc},
                                     time_range='30s', sleep_time=5, callback=restart_zookeeper)
            assert fired, error("can't get ZooKeeperHardwareExceptions alert in firing state")

    kubectl.kube_wait_pod_status("zookeeper-0", "Running", ns=kubectl.namespace)
    kubectl.kube_wait_jsonpath("pod", "zookeeper-0", "{.status.containerStatuses[0].ready}", "true",
                               ns=kubectl.namespace)

    with Then("check ZooKeeperHardwareExceptions gone away"):
        for svc in (svc1, svc2):
            resolved = wait_alert_state("ZooKeeperHardwareExceptions", "firing", False, labels={"hostname": svc})
            assert resolved, error("can't check ZooKeeperHardwareExceptions alert is gone away")
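For context: all four examples rely on a wait_alert_state helper that is not shown on this page. Below is a minimal sketch of what such a helper could look like, assuming it polls the Prometheus Alertmanager HTTP API (/api/v2/alerts); the ALERTMANAGER_URL, the max_try bound, and the use of the requests library are assumptions, not the test suite's actual implementation.

import time
import requests  # assumed HTTP client; the real helper may use something else

ALERTMANAGER_URL = "http://alertmanager:9093/api/v2/alerts"  # assumed endpoint

def wait_alert_state(alert_name, alert_state, expected_state, labels=None,
                     time_range='60s', sleep_time=10, callback=None, max_try=20):
    # Hypothetical sketch: poll Alertmanager until the alert matches (or stops
    # matching) the expected state. time_range is accepted for signature
    # compatibility with the callers above but is not used in this sketch.
    labels = labels or {}
    for _ in range(max_try):
        if callback is not None:
            callback()  # provoke the condition, e.g. kill a pod or run a query
        alerts = requests.get(ALERTMANAGER_URL).json()
        matched = any(
            a["labels"].get("alertname") == alert_name
            and all(a["labels"].get(k) == v for k, v in labels.items())
            and (alert_state != "firing" or a["status"]["state"] == "active")
            for a in alerts
        )
        if matched == expected_state:
            return True
        time.sleep(sleep_time)
    return False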
Example #2
def test_distributed_connection_exceptions():
    delayed_pod, delayed_svc, restarted_pod, restarted_svc = random_pod_choice_for_callbacks()
    create_distributed_table_on_cluster()

    def reboot_clickhouse_and_distributed_execution():
        # we need 70 delayed files to trigger the alert
        insert_sql = 'INSERT INTO default.test_distr(event_time, test) SELECT now(), number FROM system.numbers LIMIT 10000'
        select_sql = 'SELECT count() FROM default.test_distr'
        with Then("reboot clickhouse-server pod"):
            kubectl.kubectl(
                f"exec -n {kubectl.namespace} {restarted_pod} -c clickhouse -- kill 1",
                ok_to_fail=True,
            )
            with And("Insert to distributed table"):
                clickhouse.clickhouse_query(chi["metadata"]["name"], insert_sql, host=delayed_pod, ns=kubectl.namespace)

            with And("Select from distributed table"):
                clickhouse.clickhouse_query_with_error(chi["metadata"]["name"], select_sql, host=delayed_pod,
                                                       ns=kubectl.namespace)

    with When("check ClickHouseDistributedConnectionExceptions firing"):
        fired = wait_alert_state("ClickHouseDistributedConnectionExceptions", "firing", True,
                                 labels={"hostname": delayed_svc, "chi": chi["metadata"]["name"]}, time_range='30s',
                                 callback=reboot_clickhouse_and_distributed_execution)
        assert fired, error("can't get ClickHouseDistributedConnectionExceptions alert in firing state")

    with Then("check DistributedConnectionExpections gone away"):
        resolved = wait_alert_state("ClickHouseDistributedConnectionExceptions", "firing", False,
                                    labels={"hostname": delayed_svc})
        assert resolved, error("can't check ClickHouseDistributedConnectionExceptions alert is gone away")
    kubectl.kube_wait_pod_status(restarted_pod, "Running", ns=kubectl.namespace)
    kubectl.kube_wait_jsonpath("pod", restarted_pod, "{.status.containerStatuses[0].ready}", "true",
                               ns=kubectl.namespace)
    drop_distributed_table_on_cluster()
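Example #2 and Example #4 also assume create_distributed_table_on_cluster / drop_distributed_table_on_cluster helpers. A plausible sketch follows; the cluster name 'all-sharded', the underlying default.test_local table, the exact DDL, and calling clickhouse_query without an explicit host are assumptions chosen only to match the default.test_distr schema used in the INSERT statements above.

def create_distributed_table_on_cluster(cluster='all-sharded'):
    # Assumed DDL: a local MergeTree table on every node plus a Distributed
    # table routing inserts across the cluster.
    chi_name = chi["metadata"]["name"]
    clickhouse.clickhouse_query(
        chi_name,
        f"CREATE TABLE default.test_local ON CLUSTER '{cluster}' "
        "(event_time DateTime, test UInt64) ENGINE = MergeTree() ORDER BY tuple()",
        ns=kubectl.namespace,
    )
    clickhouse.clickhouse_query(
        chi_name,
        f"CREATE TABLE default.test_distr ON CLUSTER '{cluster}' AS default.test_local "
        f"ENGINE = Distributed('{cluster}', default, test_local, test)",
        ns=kubectl.namespace,
    )


def drop_distributed_table_on_cluster(cluster='all-sharded'):
    chi_name = chi["metadata"]["name"]
    clickhouse.clickhouse_query(
        chi_name, f"DROP TABLE IF EXISTS default.test_distr ON CLUSTER '{cluster}'", ns=kubectl.namespace,
    )
    clickhouse.clickhouse_query(
        chi_name, f"DROP TABLE IF EXISTS default.test_local ON CLUSTER '{cluster}'", ns=kubectl.namespace,
    )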
Example #3
def test_read_only_replica():
    read_only_pod, read_only_svc, other_pod, other_svc = random_pod_choice_for_callbacks()
    chi_name = chi["metadata"]["name"]
    create_replicated_table_on_cluster()

    def restart_zookeeper():
        kubectl.kubectl(
            f"exec -n {kubectl.namespace} zookeeper-0 -- sh -c \"kill 1\"",
            ok_to_fail=True,
        )
        clickhouse.clickhouse_query_with_error(chi_name, "INSERT INTO default.test_repl VALUES(now(),rand())", host=read_only_svc)

    with Then("check ClickHouseReadonlyReplica firing"):
        fired = wait_alert_state("ClickHouseReadonlyReplica", "firing", True, labels={"hostname": read_only_svc},
                                 time_range='30s', sleep_time=5, callback=restart_zookeeper)
        assert fired, error("can't get ClickHouseReadonlyReplica alert in firing state")
    with Then("check ClickHouseReadonlyReplica gone away"):
        resolved = wait_alert_state("ClickHouseReadonlyReplica", "firing", False, labels={"hostname": read_only_svc})
        assert resolved, error("can't check ClickHouseReadonlyReplica alert is gone away")

    kubectl.kube_wait_pod_status("zookeeper-0", "Running", ns=kubectl.namespace)
    kubectl.kube_wait_jsonpath("pod", "zookeeper-0", "{.status.containerStatuses[0].ready}", "true",
                               ns=kubectl.namespace)

    clickhouse.clickhouse_query_with_error(
        chi_name, "SYSTEM RESTART REPLICAS; SYSTEM SYNC REPLICA default.test_repl",
        host=read_only_svc, timeout=240
    )
    clickhouse.clickhouse_query_with_error(
        chi_name, "SYSTEM RESTART REPLICAS; SYSTEM SYNC REPLICA default.test_repl",
        host=other_svc, timeout=240
    )

    drop_replicated_table_on_cluster()
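Example #3 likewise depends on create_replicated_table_on_cluster / drop_replicated_table_on_cluster. A rough sketch under the same assumptions (cluster name 'all-replicated', no explicit host, a conventional ReplicatedMergeTree path layout) is shown below; it is only meant to illustrate the kind of table the INSERT in the callback targets, not the suite's actual helper.

def create_replicated_table_on_cluster(cluster='all-replicated'):
    # Assumed DDL for the default.test_repl table used in the callback above.
    # {shard} and {replica} are ClickHouse macros substituted on every node.
    clickhouse.clickhouse_query(
        chi["metadata"]["name"],
        f"CREATE TABLE default.test_repl ON CLUSTER '{cluster}' "
        "(event_time DateTime, test UInt64) "
        "ENGINE = ReplicatedMergeTree('/clickhouse/tables/{shard}/default/test_repl', '{replica}') "
        "ORDER BY tuple()",
        ns=kubectl.namespace,
    )


def drop_replicated_table_on_cluster(cluster='all-replicated'):
    clickhouse.clickhouse_query(
        chi["metadata"]["name"],
        f"DROP TABLE IF EXISTS default.test_repl ON CLUSTER '{cluster}'",
        ns=kubectl.namespace,
    )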
Example #4
def test_distributed_files_to_insert():
    delayed_pod, delayed_svc, restarted_pod, restarted_svc = random_pod_choice_for_callbacks()
    create_distributed_table_on_cluster()

    # we need 70 delayed files to trigger the alert
    insert_sql = 'INSERT INTO default.test_distr(event_time, test) SELECT now(), number FROM system.numbers LIMIT 10000'
    # clickhouse.clickhouse_query(
    #     chi["metadata"]["name"], 'SYSTEM STOP DISTRIBUTED SENDS default.test_distr',
    #     host=delayed_svc, ns=kubectl.namespace
    # )

    files_to_insert_from_metrics = 0
    files_to_insert_from_disk = 0
    tries = 0
    while files_to_insert_from_disk < 50 and tries < 500:
        kubectl.kubectl(
            f"exec -n {kubectl.namespace} {restarted_pod} -c clickhouse -- kill 1",
            ok_to_fail=True,
        )
        clickhouse.clickhouse_query(chi["metadata"]["name"], insert_sql, host=delayed_svc, ns=kubectl.namespace)
        files_to_insert_from_metrics = clickhouse.clickhouse_query(
            chi["metadata"]["name"], "SELECT value FROM system.metrics WHERE metric='DistributedFilesToInsert'",
            host=delayed_svc, ns=kubectl.namespace
        )
        files_to_insert_from_metrics = int(files_to_insert_from_metrics)

        files_to_insert_from_disk = int(kubectl.kubectl(
            f"exec -n {kubectl.namespace} {delayed_pod} -c clickhouse -- bash -c 'ls -la /var/lib/clickhouse/data/default/test_distr/*/*.bin 2>/dev/null | wc -l'",
            ok_to_fail=False,
        ))
        tries += 1  # bound the loop with the tries < 500 guard above

    with When("reboot clickhouse-server pod"):
        fired = wait_alert_state("ClickHouseDistributedFilesToInsertHigh", "firing", True,
                                 labels={"hostname": delayed_svc, "chi": chi["metadata"]["name"]})
        assert fired, error("can't get ClickHouseDistributedFilesToInsertHigh alert in firing state")
    # @TODO remove this when https://github.com/ClickHouse/ClickHouse/pull/11220 is merged into the latest docker image
    kubectl.kube_wait_pod_status(restarted_pod, "Running", ns=kubectl.namespace)

    with Then("check ClickHouseClickHouseDistributedFilesToInsertHigh gone away"):
        resolved = wait_alert_state("ClickHouseDistributedFilesToInsertHigh", "firing", False, labels={"hostname": delayed_svc})
        assert resolved, error("can't check ClickHouseDistributedFilesToInsertHigh alert is gone away")

    drop_distributed_table_on_cluster()
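Finally, every example begins with random_pod_choice_for_callbacks(), which returns two distinct (pod, service) pairs from the ClickHouse installation. The sketch below is a guess at how it might work: it assumes kubectl.kubectl() returns the command's stdout (the int() cast in Example #4 suggests it does), that pods carry the clickhouse.altinity.com/chi label, and that a pod named chi-<...>-X-Y-0 is reachable through a service named chi-<...>-X-Y.

import random


def random_pod_choice_for_callbacks():
    # Hypothetical sketch: pick two different ClickHouse pods of this installation
    # and derive the per-replica service each one is exposed through.
    out = kubectl.kubectl(
        f"get pods -n {kubectl.namespace} "
        f"-l clickhouse.altinity.com/chi={chi['metadata']['name']} "
        "-o jsonpath='{.items[*].metadata.name}'"
    )
    first_pod, second_pod = random.sample(out.split(), 2)
    first_svc = first_pod.rsplit('-', 1)[0]   # assumed service naming convention
    second_svc = second_pod.rsplit('-', 1)[0]
    return first_pod, first_svc, second_pod, second_svc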