Code example #1
def test_query_preempted(self):
    priority_pod, priority_svc, _, _ = alerts.random_pod_choice_for_callbacks(
        chi)

    def run_queries_with_priority():
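        # Build 50 "SET priority=N;SELECT ..." chunks separated by ':' so the
        # xargs -d ':' -P 20 pipeline below can fan them out to 20 parallel
        # clickhouse-client invocations running at different priorities.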
        sql = ""
        for i in range(50):
            sql += f"SET priority={i % 20};SELECT uniq(number) FROM numbers(20000000):"
        cmd = f"echo \\\"{sql} SELECT 1\\\" | xargs -i'{{}}' --no-run-if-empty -d ':' -P 20 clickhouse-client --time -m -n -q \\\"{{}}\\\""
        kubectl.launch(f"exec {priority_pod} -- bash -c \"{cmd}\"",
                       timeout=120)
        clickhouse.query(
            chi["metadata"]["name"],
            "SELECT event_time, CurrentMetric_QueryPreempted FROM system.metric_log WHERE CurrentMetric_QueryPreempted > 0",
            host=priority_svc,
        )

    with Then("check ClickHouseQueryPreempted firing"):
        fired = alerts.wait_alert_state(
            "ClickHouseQueryPreempted",
            "firing",
            True,
            labels={"hostname": priority_svc},
            time_range='30s',
            sleep_time=settings.prometheus_scrape_interval,
            callback=run_queries_with_priority)
        assert fired, error(
            "can't get ClickHouseQueryPreempted alert in firing state")
    with Then("check ClickHouseQueryPreempted gone away"):
        resolved = alerts.wait_alert_state("ClickHouseQueryPreempted",
                                           "firing",
                                           False,
                                           labels={"hostname": priority_svc})
        assert resolved, error(
            "can't check ClickHouseQueryPreempted alert is gone away")
Code example #2
def test_longest_running_query(self):
    long_running_pod, long_running_svc, _, _ = alerts.random_pod_choice_for_callbacks(
        chi)
    # 660s = 600s alert trigger threshold + 2 * 30s (twice the Prometheus scrape interval)
    clickhouse.query(
        chi["metadata"]["name"],
        "SELECT now(),sleepEachRow(1),number FROM system.numbers LIMIT 660",
        host=long_running_svc,
        timeout=670)
    with Then("check ClickHouseLongestRunningQuery firing"):
        fired = alerts.wait_alert_state("ClickHouseLongestRunningQuery",
                                        "firing",
                                        True,
                                        labels={"hostname": long_running_svc},
                                        time_range='30s')
        assert fired, error(
            "can't get ClickHouseLongestRunningQuery alert in firing state")
    with Then("check ClickHouseLongestRunningQuery gone away"):
        resolved = alerts.wait_alert_state(
            "ClickHouseLongestRunningQuery",
            "firing",
            False,
            labels={"hostname": long_running_svc})
        assert resolved, error(
            "can't check ClickHouseLongestRunningQuery alert is gone away")
Code example #3
def test_018():
    create_and_check("configs/test-018-configmap.yaml", {
        "pod_count": 1,
        "do_not_delete": 1
    })
    chi_name = "test-018-configmap"

    with Then("user1/networks/ip should be in config"):
        chi = kube_get("chi", chi_name)
        assert "user1/networks/ip" in chi["spec"]["configuration"]["users"]

    start_time = kube_get_field("pod", f"chi-{chi_name}-default-0-0-0",
                                ".status.startTime")

    create_and_check("configs/test-018-configmap-2.yaml", {
        "pod_count": 1,
        "do_not_delete": 1
    })
    with Then("user2/networks should be in config"):
        chi = kube_get("chi", chi_name)
        assert "user2/networks/ip" in chi["spec"]["configuration"]["users"]
        with And("user1/networks/ip should NOT be in config"):
            assert "user1/networks/ip" not in chi["spec"]["configuration"][
                "users"]
        with And("Pod should not be restarted"):
            new_start_time = kube_get_field("pod",
                                            f"chi-{chi_name}-default-0-0-0",
                                            ".status.startTime")
            assert start_time == new_start_time

    kube_delete_chi(chi_name)
Code example #4
def test_020(config="configs/test-020-multi-volume.yaml"):
    chi = get_chi_name(get_full_path(config))
    create_and_check(
        config, {
            "pod_count": 1,
            "pod_volumes": {"/var/lib/clickhouse", "/var/lib/clickhouse2"},
            "do_not_delete": 1
        })

    with When("Create a table and insert 1 row"):
        clickhouse_query(
            chi,
            "create table test_disks(a Int8) Engine = MergeTree() order by a")
        clickhouse_query(chi, "insert into test_disks values (1)")

        with Then("Data should be placed on default disk"):
            out = clickhouse_query(
                chi,
                "select disk_name from system.parts where table='test_disks'")
            assert out == 'default'

    with When("alter table test_disks move partition tuple() to disk 'disk2'"):
        clickhouse_query(
            chi,
            "alter table test_disks move partition tuple() to disk 'disk2'")

        with Then("Data should be placed on disk2"):
            out = clickhouse_query(
                chi,
                "select disk_name from system.parts where table='test_disks'")
            assert out == 'disk2'

    kube_delete_chi(chi)
Code example #5
def test_replicas_max_abosulute_delay():
    stop_replica_pod, stop_replica_svc, insert_pod, insert_svc = random_pod_choice_for_callbacks()
    create_table_on_cluster('all-replicated', 'default.test_repl',
                            '(event_time DateTime, test UInt64) ENGINE ReplicatedMergeTree(\'/clickhouse/tables/{installation}-{shard}/{uuid}/test_repl\', \'{replica}\') ORDER BY tuple()')
    prometheus_scrape_interval = 15

    def restart_clickhouse_and_insert_to_replicated_table():
        with When(f"stop replica fetches on {stop_replica_svc}"):
            sql = "SYSTEM STOP FETCHES default.test_repl"
            kubectl.launch(
                f"exec -n {kubectl.namespace} {stop_replica_pod} -c clickhouse -- clickhouse-client -q \"{sql}\"",
                ok_to_fail=True,
            )
            sql = "INSERT INTO default.test_repl SELECT now(), number FROM numbers(100000)"
            kubectl.launch(
                f"exec -n {kubectl.namespace} {insert_pod} -c clickhouse -- clickhouse-client -q \"{sql}\"",
            )

    with Then("check ClickHouseReplicasMaxAbsoluteDelay firing"):
        fired = wait_alert_state("ClickHouseReplicasMaxAbsoluteDelay", "firing", True, labels={"hostname": stop_replica_svc},
                                 time_range='60s', sleep_time=prometheus_scrape_interval * 2,
                                 callback=restart_clickhouse_and_insert_to_replicated_table)
        assert fired, error("can't get ClickHouseReadonlyReplica alert in firing state")

    clickhouse.query(
        chi["metadata"]["name"],
        "SYSTEM START FETCHES; SYSTEM RESTART REPLICAS; SYSTEM SYNC REPLICA default.test_repl",
        host=stop_replica_svc, timeout=240
    )
    with Then("check ClickHouseReplicasMaxAbsoluteDelay gone away"):
        resolved = wait_alert_state("ClickHouseReplicasMaxAbsoluteDelay", "firing", False, labels={"hostname": stop_replica_svc})
        assert resolved, error("can't check ClickHouseReplicasMaxAbsoluteDelay alert is gone away")

    drop_table_on_cluster('all-replicated', 'default.test_repl')
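The examples in this listing mix two styles of running SQL: the `clickhouse.query(...)` helper and a raw `kubectl exec ... clickhouse-client -q` command. As a rough, self-contained illustration only (not the test framework's actual implementation), such a helper can be approximated with `subprocess`; the namespace, container name, default timeout, and the pod name in the usage comment below are assumptions.

import subprocess


def clickhouse_query_via_pod(pod, sql, ns="test", container="clickhouse", timeout=60):
    """Illustrative sketch: run a SQL statement with clickhouse-client inside a pod."""
    cmd = [
        "kubectl", "exec", "-n", ns, pod, "-c", container, "--",
        "clickhouse-client", "-q", sql,
    ]
    result = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)
    result.check_returncode()  # surface clickhouse-client failures as an exception
    return result.stdout.strip()

# Hypothetical usage:
# clickhouse_query_via_pod("chi-demo-default-0-0-0", "SELECT 1")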
Code example #6
def test_009():
    version_from = "0.6.0"
    version_to = "dev"
    with Given(f"clickhouse-operator {version_from}"):
        set_operator_version(version_from)
        config = get_full_path("configs/test-009-long-name.yaml")
        chi_full_name = get_chi_name(config)
        chi_cut_name = chi_full_name[0:15]

        kube_apply(config)
        kube_wait_objects(chi_cut_name, [1, 1, 2])
        kube_wait_chi_status(chi_full_name, "Completed")

        assert kube_get_count(
            "statefulset",
            label="-l clickhouse.altinity.com/app=chop") == 1, error()

        with Then(f"upgrade operator to {version_to}"):
            set_operator_version(version_to)
            with And("Wait 20 seconds"):
                time.sleep(20)
                with Then("No new statefulsets should be created"):
                    assert kube_get_count(
                        "statefulset",
                        label="-l clickhouse.altinity.com/app=chop"
                    ) == 1, error()
Code example #7
def test_009(version_from="0.8.0", version_to=settings.operator_version):
    with Then("Test simple chi for operator upgrade"):
        test_operator_upgrade("configs/test-009-operator-upgrade.yaml",
                              version_from, version_to)
    with Then("Test advanced chi for operator upgrade"):
        test_operator_upgrade("configs/test-009-operator-upgrade-2.yaml",
                              version_from, version_to)
Code example #8
def test_015():
    kubectl.create_and_check(config="configs/test-015-host-network.yaml",
                             check={
                                 "pod_count": 2,
                                 "do_not_delete": 1,
                             })

    time.sleep(30)

    with Then("Query from one server to another one should work"):
        out = clickhouse.query(
            "test-015-host-network",
            host="chi-test-015-host-network-default-0-0",
            port="10000",
            sql=
            "SELECT * FROM remote('chi-test-015-host-network-default-0-1', system.one)"
        )
        print("remote out=")
        print(out)

    with Then("Distributed query should work"):
        out = clickhouse.query(
            "test-015-host-network",
            host="chi-test-015-host-network-default-0-0",
            port="10000",
            sql=
            "SELECT count() FROM cluster('all-sharded', system.one) settings receive_timeout=10"
        )
        print("cluster out=")
        print(out)
        assert out == "2"

    kubectl.delete_chi("test-015-host-network")
Code example #9
def test_006():
    with Then("Create initial position"):
        kubectl.create_and_check(config="configs/test-006-ch-upgrade-1.yaml",
                                 check={
                                     "pod_count": 2,
                                     "pod_image":
                                     "yandex/clickhouse-server:19.11",
                                     "do_not_delete": 1,
                                 })
    with Then(
            "Use different podTemplate and confirm that pod image is updated"):
        kubectl.create_and_check(config="configs/test-006-ch-upgrade-2.yaml",
                                 check={
                                     "pod_count": 2,
                                     "pod_image":
                                     "yandex/clickhouse-server:19.16",
                                     "do_not_delete": 1,
                                 })
    with Then(
            "Change image in podTemplate itself and confirm that pod image is updated"
    ):
        kubectl.create_and_check(config="configs/test-006-ch-upgrade-3.yaml",
                                 check={
                                     "pod_count": 2,
                                     "pod_image":
                                     "yandex/clickhouse-server:19.11",
                                 })
Code example #10
def test_read_only_replica():
    read_only_pod, read_only_svc, other_pod, other_svc = random_pod_choice_for_callbacks()
    chi_name = chi["metadata"]["name"]
    create_replicated_table_on_cluster()

    def restart_zookeeper():
        kubectl.kubectl(
            f"exec -n {kubectl.namespace} zookeeper-0 -- sh -c \"kill 1\"",
            ok_to_fail=True,
        )
        clickhouse.clickhouse_query_with_error(chi_name, "INSERT INTO default.test_repl VALUES(now(),rand())", host=read_only_svc)

    with Then("check ClickHouseReadonlyReplica firing"):
        fired = wait_alert_state("ClickHouseReadonlyReplica", "firing", True, labels={"hostname": read_only_svc},
                                 time_range='30s', sleep_time=5, callback=restart_zookeeper)
        assert fired, error("can't get ClickHouseReadonlyReplica alert in firing state")
    with Then("check ClickHouseReadonlyReplica gone away"):
        resolved = wait_alert_state("ClickHouseReadonlyReplica", "firing", False, labels={"hostname": read_only_svc})
        assert resolved, error("can't check ClickHouseReadonlyReplica alert is gone away")

    kubectl.kube_wait_pod_status("zookeeper-0", "Running", ns=kubectl.namespace)
    kubectl.kube_wait_jsonpath("pod", "zookeeper-0", "{.status.containerStatuses[0].ready}", "true",
                               ns=kubectl.namespace)

    clickhouse.clickhouse_query_with_error(
        chi_name, "SYSTEM RESTART REPLICAS; SYSTEM SYNC REPLICA default.test_repl",
        host=read_only_svc, timeout=240
    )
    clickhouse.clickhouse_query_with_error(
        chi_name, "SYSTEM RESTART REPLICAS; SYSTEM SYNC REPLICA default.test_repl",
        host=other_svc, timeout=240
    )

    drop_replicated_table_on_cluster()
Code example #11
def test_016():
    create_and_check(
        "configs/test-016-settings.yaml", {
            "apply_templates": {settings.clickhouse_template},
            "pod_count": 1,
            "do_not_delete": 1
        })

    with Then("dictGet() should work"):
        out = clickhouse_query(
            "test-016-settings",
            query="select dictGet('one', 'one', toUInt64(0))")
        assert out == "0"

    with Then("Custom macro 'layer' should be available"):
        out = clickhouse_query(
            "test-016-settings",
            query="select substitution from system.macros where macro='layer'")
        assert out == "01"

    with Then("query_log should be disabled"):
        clickhouse_query("test-016-settings", query="system flush logs")
        out = clickhouse_query_with_error(
            "test-016-settings", query="select count() from system.query_log")
        assert "doesn't exist" in out

    kube_delete_chi("test-016-settings")
Code example #12
def test_replicas_max_abosulute_delay():
    stop_replica_pod, stop_replica_svc, insert_pod, insert_svc = random_pod_choice_for_callbacks()
    create_replicated_table_on_cluster()
    prometheus_scrape_interval = 30

    def restart_clickhouse_and_insert_to_replicated_table():
        with When(f"stop replica fetches on {stop_replica_svc}"):
            sql = "SYSTEM STOP FETCHES default.test_repl"
            kubectl.kubectl(
                f"exec -n {kubectl.namespace} {stop_replica_pod} -c clickhouse -- clickhouse-client -q \"{sql}\"",
                ok_to_fail=True,
            )
            sql = "INSERT INTO default.test_repl SELECT now(), number FROM numbers(100000)"
            kubectl.kubectl(
                f"exec -n {kubectl.namespace} {insert_pod} -c clickhouse -- clickhouse-client -q \"{sql}\"",
            )

    with Then("check ReplicasMaxAbsoluteDelay firing"):
        fired = wait_alert_state("ReplicasMaxAbsoluteDelay", "firing", True, labels={"hostname": stop_replica_svc},
                                 time_range='60s', sleep_time=prometheus_scrape_interval*2,
                                 callback=restart_clickhouse_and_insert_to_replicated_table)
        assert fired, error("can't get ReadonlyReplica alert in firing state")

    clickhouse.clickhouse_query(
        chi["metadata"]["name"],
        "SYSTEM START FETCHES; SYSTEM RESTART REPLICAS; SYSTEM SYNC REPLICA default.test_repl",
        host=stop_replica_svc, timeout=240
    )
    with Then("check ReplicasMaxAbsoluteDelay gone away"):
        resolved = wait_alert_state("ReplicasMaxAbsoluteDelay", "firing", False, labels={"hostname": stop_replica_svc})
        assert resolved, error("can't check ReplicasMaxAbsoluteDelay alert is gone away")

    drop_replicated_table_on_cluster()
Code example #13
def test_ch_002(self):
    kubectl.create_and_check(
        "configs/test-ch-002-row-level.yaml", {
            "apply_templates": {"templates/tpl-clickhouse-20.3.yaml"},
            "do_not_delete": 1,
        })

    chi = "test-ch-002-row-level"
    create_table = """create table test (d Date default today(), team LowCardinality(String), user String) Engine = MergeTree() PARTITION BY d ORDER BY d;"""

    with When("Create test table"):
        clickhouse.query(chi, create_table)

    with And("Insert some data"):
        clickhouse.query(
            chi,
            "INSERT INTO test(team, user) values('team1', 'user1'),('team2', 'user2'),('team3', 'user3'),('team4', 'user4')"
        )

    with Then(
            "Make another query for different users. It should be restricted to corresponding team by row-level security"
    ):
        for user in ['user1', 'user2', 'user3', 'user4']:
            out = clickhouse.query(chi, "select user from test", user=user)
            assert out == user

    with Then(
            "Make a count() query for different users. It should be restricted to corresponding team by row-level security"
    ):
        for user in ['user1', 'user2', 'user3', 'user4']:
            out = clickhouse.query(chi, "select count() from test", user=user)
            assert out == "1"

    kubectl.delete_chi(chi)
Code example #14
def test_read_only_replica():
    read_only_pod, read_only_svc, other_pod, other_svc = random_pod_choice_for_callbacks()
    chi_name = chi["metadata"]["name"]
    create_table_on_cluster('all-replicated', 'default.test_repl',
                            '(event_time DateTime, test UInt64) ENGINE ReplicatedMergeTree(\'/clickhouse/tables/{installation}-{shard}/{uuid}/test_repl\', \'{replica}\') ORDER BY tuple()')

    def restart_zookeeper():
        kubectl.launch(
            f"exec -n {kubectl.namespace} zookeeper-0 -- sh -c \"kill 1\"",
            ok_to_fail=True,
        )
        clickhouse.query_with_error(chi_name, "INSERT INTO default.test_repl VALUES(now(),rand())", host=read_only_svc)

    with Then("check ClickHouseReadonlyReplica firing"):
        fired = wait_alert_state("ClickHouseReadonlyReplica", "firing", True, labels={"hostname": read_only_svc},
                                 time_range='30s', sleep_time=5, callback=restart_zookeeper)
        assert fired, error("can't get ClickHouseReadonlyReplica alert in firing state")
    with Then("check ClickHouseReadonlyReplica gone away"):
        resolved = wait_alert_state("ClickHouseReadonlyReplica", "firing", False, labels={"hostname": read_only_svc})
        assert resolved, error("can't check ClickHouseReadonlyReplica alert is gone away")

    kubectl.wait_pod_status("zookeeper-0", "Running", ns=kubectl.namespace)
    kubectl.wait_jsonpath("pod", "zookeeper-0", "{.status.containerStatuses[0].ready}", "true",
                          ns=kubectl.namespace)

    clickhouse.query_with_error(
        chi_name, "SYSTEM RESTART REPLICAS; SYSTEM SYNC REPLICA default.test_repl",
        host=read_only_svc, timeout=240
    )
    clickhouse.query_with_error(
        chi_name, "SYSTEM RESTART REPLICAS; SYSTEM SYNC REPLICA default.test_repl",
        host=other_svc, timeout=240
    )

    drop_table_on_cluster('all-replicated', 'default.test_repl')
Code example #15
def test_metrics_exporter_with_multiple_clickhouse_version():
    def check_monitoring_metrics(operator_namespace, operator_pod, expect_result, max_retries=10):
        with And(f"metrics-exporter /metrics enpoint result should match with {expect_result}"):
            for i in range(1, max_retries):
                out = kubectl.launch(
                    f"exec {operator_pod} -c metrics-exporter -- wget -O- -q http://127.0.0.1:8888/metrics",
                    ns=operator_namespace
                )
                all_strings_expected_done = True
                for string, exists in expect_result.items():
                    all_strings_expected_done = (exists == (string in out))
                    if not all_strings_expected_done:
                        break

                if all_strings_expected_done:
                    break
                with Then("Not ready. Wait for " + str(i * 5) + " seconds"):
                    time.sleep(i * 5)
            assert all_strings_expected_done, error()

    with Given("clickhouse-operator pod exists"):
        out = kubectl.launch("get pods -l app=clickhouse-operator", ns='kube-system').splitlines()[1]
        operator_pod = re.split(r'[\t\r\n\s]+', out)[0]
        operator_namespace = "kube-system"

        with Then("check empty /metrics"):
            kubectl.delete_ns(kubectl.namespace, ok_to_fail=True)
            kubectl.create_ns(kubectl.namespace)
            check_monitoring_metrics(operator_namespace, operator_pod, expect_result={
                'chi_clickhouse_metric_VersionInteger': False,
            })

        with Then("Install multiple clickhouse version"):
            config = util.get_full_path("configs/test-017-multi-version.yaml")
            kubectl.create_and_check(
                config=config,
                check={
                    "object_counts": {
                        "statefulset": 4,
                        "pod": 4,
                        "service": 5,
                    },
                    "do_not_delete": True,
                })
            with And("Check not empty /metrics"):
                check_monitoring_metrics(operator_namespace, operator_pod, expect_result={
                    '# HELP chi_clickhouse_metric_VersionInteger': True,
                    '# TYPE chi_clickhouse_metric_VersionInteger gauge': True,
                    'chi_clickhouse_metric_VersionInteger{chi="test-017-multi-version",hostname="chi-test-017-multi-version-default-0-0': True,
                    'chi_clickhouse_metric_VersionInteger{chi="test-017-multi-version",hostname="chi-test-017-multi-version-default-1-0': True,
                    'chi_clickhouse_metric_VersionInteger{chi="test-017-multi-version",hostname="chi-test-017-multi-version-default-2-0': True,
                    'chi_clickhouse_metric_VersionInteger{chi="test-017-multi-version",hostname="chi-test-017-multi-version-default-3-0': True,

                })

        with Then("check empty /metrics after delete namespace"):
            kubectl.delete_ns(kubectl.namespace)
            check_monitoring_metrics(operator_namespace, operator_pod, expect_result={
                'chi_clickhouse_metric_VersionInteger': False,
            })
Code example #16
def test_clickhouse_server_reboot():
    random_idx = random.randint(0, 1)
    clickhouse_pod = chi["status"]["pods"][random_idx]
    clickhouse_svc = chi["status"]["fqdns"][random_idx]

    def reboot_clickhouse_server():
        kubectl.launch(
            f"exec -n {kubectl.namespace} {clickhouse_pod} -c clickhouse -- kill 1",
            ok_to_fail=True,
        )

    with When("reboot clickhouse-server pod"):
        fired = wait_alert_state("ClickHouseServerDown", "firing", True,
                                 labels={"hostname": clickhouse_svc, "chi": chi["metadata"]["name"]},
                                 callback=reboot_clickhouse_server,
                                 sleep_time=5, time_range='30s', max_try=30,
                                 )
        assert fired, error("can't get ClickHouseServerDown alert in firing state")

    with Then("check ClickHouseServerDown gone away"):
        resolved = wait_alert_state("ClickHouseServerDown", "firing", False, labels={"hostname": clickhouse_svc}, time_range='5s',
                                    sleep_time=5)
        assert resolved, error("can't check ClickHouseServerDown alert is gone away")

    with Then("check ClickHouseServerRestartRecently firing and gone away"):
        fired = wait_alert_state("ClickHouseServerRestartRecently", "firing", True,
                                 labels={"hostname": clickhouse_svc, "chi": chi["metadata"]["name"]}, time_range="30s")
        assert fired, error("after ClickHouseServerDown gone away, ClickHouseServerRestartRecently shall firing")

        resolved = wait_alert_state("ClickHouseServerRestartRecently", "firing", False,
                                    labels={"hostname": clickhouse_svc})
        assert resolved, error("can't check ClickHouseServerRestartRecently alert is gone away")
Code example #17
def test_015():
    create_and_check("configs/test-015-host-network.yaml", {
        "pod_count": 2,
        "do_not_delete": 1
    })

    with Then("Query from one server to another one should work"):
        clickhouse_query(
            "test-015-host-network",
            host="chi-test-015-host-network-default-0-0",
            port="10000",
            query=
            "select * from remote('chi-test-015-host-network-default-0-1', system.one)"
        )

    with Then("Distributed query should work"):
        out = clickhouse_query(
            "test-015-host-network",
            host="chi-test-015-host-network-default-0-0",
            port="10000",
            query=
            "select count() from cluster('all-sharded', system.one) settings receive_timeout=10"
        )
        assert out == "2"

    kube_delete_chi("test-015-host-network")
Code example #18
def test_013():
    create_and_check(
        "configs/test-013-add-shards-1.yaml", {
            "apply_templates": {settings.clickhouse_template},
            "object_counts": [1, 1, 2],
            "do_not_delete": 1
        })

    with Then("Create local and distributed table"):
        clickhouse_query(
            "test-013-add-shards",
            "CREATE TABLE test_local Engine = Log as select * from system.one")
        clickhouse_query(
            "test-013-add-shards",
            "CREATE TABLE test_distr as test_local Engine = Distributed('default', default, test_local)"
        )

    with Then("Add one more shard"):
        create_and_check("configs/test-013-add-shards-2.yaml", {
            "object_counts": [2, 2, 3],
            "do_not_delete": 1
        })
    with And("Table should be created on a second shard"):
        out = clickhouse_query("test-013-add-shards",
                               "select count() from default.test_distr",
                               host="chi-test-013-add-shards-default-1-0")
        assert out == "1"

    with Then("Remove shard"):
        create_and_check("configs/test-013-add-shards-1.yaml",
                         {"object_counts": [1, 1, 2]})
Code example #19
def check_alert_state(alert_name,
                      prometheus_pod,
                      alert_state="firing",
                      labels=None,
                      time_range="10s"):
    with Then(
            f"check {alert_name} for state {alert_state} and {labels} labels in {time_range}"
    ):
        cmd = f"exec -n {settings.prometheus_namespace} {prometheus_pod} -c prometheus -- "
        cmd += "wget -qO- 'http://127.0.0.1:9090/api/v1/query?query=ALERTS{"
        if labels is None:
            labels = {}
        if not isinstance(labels, dict):
            fail(f"Invalid labels={labels}")
        labels.update({"alertname": alert_name, "alertstate": alert_state})
        cmd += ",".join(
            [f"{name}=\"{value}\"" for name, value in labels.items()])
        cmd += f"}}[{time_range}]' 2>/dev/null"
        out = kubectl.launch(cmd)
        out = json.loads(out)
        if not ("status" in out and out["status"] == "success"):
            fail("wrong response from prometheus query API")
        if len(out["data"]["result"]) == 0:
            with Then("not present, empty result"):
                return False
        result_labels = out["data"]["result"][0]["metric"].items()
        exists = all(item in result_labels for item in labels.items())
        with Then("got result and contains labels"
                  if exists else "got result, but doesn't contain labels"):
            return exists
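`check_alert_state` above performs a single Prometheus query; the `wait_alert_state(...)` calls in the other examples add polling on top of it. The following is a minimal sketch of how such a wrapper could be layered on `check_alert_state`. Only the parameter names come from the call sites; the polling defaults are assumptions, and the Prometheus pod is passed in explicitly here, whereas the real helper presumably locates it itself.

import time


def wait_alert_state_sketch(alert_name, alert_state, expected_state, prometheus_pod,
                            labels=None, time_range="10s", sleep_time=15,
                            callback=None, max_try=20):
    """Sketch: poll Prometheus until the alert reaches (or leaves) the given state."""
    if callback is not None:
        callback()  # run the workload that should trigger (or clear) the alert
    for _ in range(max_try):
        found = check_alert_state(alert_name, prometheus_pod,
                                  alert_state=alert_state, labels=labels,
                                  time_range=time_range)
        if found == expected_state:
            return True
        time.sleep(sleep_time)
    return False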
Code example #20
def test_zookeeper_hardware_exceptions():
    pod1, svc1, pod2, svc2 = random_pod_choice_for_callbacks()
    chi_name = chi["metadata"]["name"]

    def restart_zookeeper():
        kubectl.launch(
            f"exec -n {kubectl.namespace} zookeeper-0 -- sh -c \"kill 1\"",
            ok_to_fail=True,
        )
        clickhouse.query_with_error(chi_name, "SELECT name, path FROM system.zookeeper WHERE path='/'", host=svc1)
        clickhouse.query_with_error(chi_name, "SELECT name, path FROM system.zookeeper WHERE path='/'", host=svc2)

    with Then("check ClickHouseZooKeeperHardwareExceptions firing"):
        for svc in (svc1, svc2):
            fired = wait_alert_state("ClickHouseZooKeeperHardwareExceptions", "firing", True, labels={"hostname": svc},
                                     time_range='40s', sleep_time=5, callback=restart_zookeeper)
            assert fired, error("can't get ClickHouseZooKeeperHardwareExceptions alert in firing state")

    kubectl.wait_pod_status("zookeeper-0", "Running", ns=kubectl.namespace)
    kubectl.wait_jsonpath("pod", "zookeeper-0", "{.status.containerStatuses[0].ready}", "true",
                          ns=kubectl.namespace)

    with Then("check ClickHouseZooKeeperHardwareExceptions gone away"):
        for svc in (svc1, svc2):
            resolved = wait_alert_state("ClickHouseZooKeeperHardwareExceptions", "firing", False, labels={"hostname": svc})
            assert resolved, error("can't check ClickHouseZooKeeperHardwareExceptions alert is gone away")
Code example #21
def test_016():
    chi = "test-016-settings"
    create_and_check(
        "configs/test-016-settings.yaml", {
            "apply_templates": {settings.clickhouse_template},
            "pod_count": 1,
            "do_not_delete": 1
        })

    with Then("Custom macro 'layer' should be available"):
        out = clickhouse_query(
            chi,
            query="select substitution from system.macros where macro='layer'")
        assert out == "01"

    with And("Custom macro 'test' should be available"):
        out = clickhouse_query(
            chi,
            query="select substitution from system.macros where macro='test'")
        assert out == "test"

    with And("dictGet() should work"):
        out = clickhouse_query(
            chi, query="select dictGet('one', 'one', toUInt64(0))")
        assert out == "0"

    with And("query_log should be disabled"):
        clickhouse_query(chi, query="system flush logs")
        out = clickhouse_query_with_error(
            chi, query="select count() from system.query_log")
        assert "doesn't exist" in out

    with And("max_memory_usage should be 7000000000"):
        out = clickhouse_query(
            chi,
            query=
            "select value from system.settings where name='max_memory_usage'")
        assert out == "7000000000"

    with And("test_usersd user should be available"):
        clickhouse_query(chi, query="select version()", user="******")

    with When("Update usersd settings"):
        start_time = kube_get_field("pod", f"chi-{chi}-default-0-0-0",
                                    ".status.startTime")
        create_and_check("configs/test-016-settings-2.yaml",
                         {"do_not_delete": 1})
        with Then("Wait 60 seconds for configmap changes to apply"):
            time.sleep(60)
        with Then("test_norestart user should be available"):
            clickhouse_query(chi,
                             query="select version()",
                             user="******")
        with And("ClickHouse should not be restarted"):
            new_start_time = kube_get_field("pod", f"chi-{chi}-default-0-0-0",
                                            ".status.startTime")
            assert start_time == new_start_time

    kube_delete_chi("test-016-settings")
Code example #22
def test_zookeeper_alerts(self):
    zookeeper_spec = kubectl.get("endpoints", "zookeeper")
    zookeeper_pod = random.choice(
        zookeeper_spec["subsets"][0]["addresses"])["targetRef"]["name"]

    def restart_zookeeper():
        kubectl.launch(
            f"exec -n {kubectl.namespace} {zookeeper_pod} -- sh -c \"kill 1\"",
            ok_to_fail=True,
        )

    def wait_when_zookeeper_up():
        kubectl.wait_pod_status(zookeeper_pod, "Running", ns=kubectl.namespace)
        kubectl.wait_jsonpath("pod",
                              zookeeper_pod,
                              "{.status.containerStatuses[0].ready}",
                              "true",
                              ns=kubectl.namespace)

    with Then("check ZookeeperDown firing"):
        fired = alerts.wait_alert_state(
            "ZookeeperDown",
            "firing",
            True,
            labels={"pod_name": zookeeper_pod},
            time_range='1m',
            sleep_time=settings.prometheus_scrape_interval,
            callback=restart_zookeeper)
        assert fired, error("can't get ZookeeperDown alert in firing state")

    wait_when_zookeeper_up()

    with Then("check ZookeeperDown gone away"):
        resolved = alerts.wait_alert_state("ZookeeperDown",
                                           "firing",
                                           False,
                                           labels={"pod_name": zookeeper_pod})
        assert resolved, error("can't check ZookeeperDown alert is gone away")

    restart_zookeeper()

    with Then("check ZookeeperRestartRecently firing"):
        fired = alerts.wait_alert_state("ZookeeperRestartRecently",
                                        "firing",
                                        True,
                                        labels={"pod_name": zookeeper_pod},
                                        time_range='30s')
        assert fired, error(
            "can't get ZookeeperRestartRecently alert in firing state")

    wait_when_zookeeper_up()

    with Then("check ZookeeperRestartRecently gone away"):
        resolved = alerts.wait_alert_state("ZookeeperRestartRecently",
                                           "firing",
                                           False,
                                           labels={"pod_name": zookeeper_pod})
        assert resolved, error(
            "can't check ZookeeperRestartRecently alert is gone away")
Code example #23
def test_014():
    require_zookeeper()

    create_table = """
    create table t (a Int8) 
    Engine = ReplicatedMergeTree('/clickhouse/{installation}/{cluster}/tables/{shard}/{database}/{table}', '{replica}')
    partition by tuple() order by a""".replace('\r', '').replace('\n', '')

    create_and_check(
        "configs/test-014-replication.yaml", {
            "apply_templates": {settings.clickhouse_template},
            "object_counts": [2, 2, 3],
            "do_not_delete": 1
        })

    with Given("Table is created on a first replica and data is inserted"):
        clickhouse_query("test-014-replication",
                         create_table,
                         host="chi-test-014-replication-default-0-0")
        clickhouse_query("test-014-replication",
                         "insert into t values(1)",
                         host="chi-test-014-replication-default-0-0")
        with When("Table is created on the second replica"):
            clickhouse_query("test-014-replication",
                             create_table,
                             host="chi-test-014-replication-default-0-1")
            with Then("Data should be replicated"):
                out = clickhouse_query(
                    "test-014-replication",
                    "select a from t",
                    host="chi-test-014-replication-default-0-1")
                assert out == "1"

    with When("Add one more replica"):
        create_and_check("configs/test-014-replication-2.yaml", {
            "pod_count": 3,
            "do_not_delete": 1
        })
        # that also works:
        # kubectl patch chi test-014-replication -n test --type=json -p '[{"op":"add", "path": "/spec/configuration/clusters/0/layout/shards/0/replicasCount", "value": 3}]'
        with Then("Replicated table should be automatically created"):
            out = clickhouse_query("test-014-replication",
                                   "select a from t",
                                   host="chi-test-014-replication-default-0-2")
            assert out == "1"

    with When("Remove replica"):
        create_and_check("configs/test-014-replication.yaml", {
            "pod_count": 1,
            "do_not_delete": 1
        })
        with Then("Replica needs to be removed from the Zookeeper as well"):
            out = clickhouse_query(
                "test-014-replication",
                "select count() from system.replicas where table='t'")
            assert out == "1"

    kube_delete_chi("test-014-replication")
Code example #24
def kube_wait_objects(chi, objects, ns="test"):
    with Then(f"{objects[0]} statefulsets, {objects[1]} pods and {objects[2]} services should be created"):
        for i in range(1,max_retries):
            counts = kube_count_resources(label = f"-l clickhouse.altinity.com/chi={chi}", ns = ns)
            if counts == objects:
                break
            with Then("Not ready. Wait for " + str(i*5) + " seconds"):
                time.sleep(i*5)
        assert counts == objects, error()
Code example #25
def kube_wait_object(type, name, label="", count = 1, ns="test", retries = max_retries):
    with Then(f"{count} {type}(s) {name} should be created"):
        for i in range(1,retries):
            counts = kube_get_count(type, ns = ns, name = name, label = label)
            if counts >= count:
                break
            with Then("Not ready. Wait for " + str(i*5) + " seconds"):
                time.sleep(i*5)
        assert counts >= count, error()
Code example #26
def wait_command(command, result, count=1, ns=namespace, retries=max_retries):
    with Then(f"{command} should return {result}"):
        for i in range(1, retries):
            res = launch(command, ok_to_fail=True, ns=ns)
            if res == result:
                break
            with Then("Not ready. Wait for " + str(i * 5) + " seconds"):
                time.sleep(i * 5)
        assert res == result, error()
Code example #27
def wait_jsonpath(kind, name, field, value, ns=namespace, retries=max_retries):
    with Then(f"{kind} {name} -o jsonpath={field} should be {value}"):
        for i in range(1, retries):
            cur_value = get_jsonpath(kind, name, field, ns)
            if cur_value == value:
                break
            with Then("Not ready. Wait for " + str(i * 5) + " seconds"):
                time.sleep(i * 5)
        assert cur_value == value, error()
Code example #28
def wait_object(kind, name, label="", count=1, ns=namespace, retries=max_retries, backoff = 5):
    with Then(f"{count} {kind}(s) {name} should be created"):
        for i in range(1, retries):
            cur_count = get_count(kind, ns=ns, name=name, label=label)
            if cur_count >= count:
                break
            with Then("Not ready. Wait for " + str(i * backoff) + " seconds"):
                time.sleep(i * backoff)
        assert cur_count >= count, error()
Code example #29
def kube_wait_field(object, name, field, value, ns="test", retries = max_retries):
    with Then(f"{object} {name} {field} should be {value}"):
        for i in range(1,retries):
            obj_status = kubectl(f"get {object} {name} -o=custom-columns=field:{field}", ns=ns).splitlines()
            if obj_status[1] == value:
                break
            with Then("Not ready. Wait for " + str(i*5) + " seconds"):
                time.sleep(i*5)
        assert obj_status[1] == value, error()
Code example #30
def test_backup_not_run(self):
    not_run_pod, _, _, _ = alerts.random_pod_choice_for_callbacks(chi)
    apply_fake_backup("prepare fake backup for time metric")

    with Then(f"wait {not_run_pod} ready"):
        kubectl.wait_field("pod", not_run_pod, ".spec.containers[1].image",
                           "nginx:latest")
        kubectl.wait_field("pod", not_run_pod,
                           ".status.containerStatuses[1].ready", "true")

    with Then(f"setup {not_run_pod} backup create end time"):
        kubectl.launch(
            f'exec {not_run_pod} -c clickhouse-backup -- bash -xc \''
            'echo "# HELP clickhouse_backup_last_create_finish Last backup create finish timestamp" > /usr/share/nginx/html/metrics && '
            'echo "# TYPE clickhouse_backup_last_create_finish gauge" >> /usr/share/nginx/html/metrics && '
            f'echo "clickhouse_backup_last_create_finish {int((datetime.datetime.now() - datetime.timedelta(days=2)).timestamp())}" >> /usr/share/nginx/html/metrics '
            '\'')

        fired = alerts.wait_alert_state(
            "ClickhouseBackupDoesntRunTooLong",
            "firing",
            expected_state=True,
            sleep_time=settings.prometheus_scrape_interval,
            labels={"pod_name": not_run_pod},
            time_range='60s')
        assert fired, error(
            "can't get ClickhouseBackupDoesntRunTooLong alert in firing state")

    apply_normal_backup()

    backup_name = prepare_table_for_backup(not_run_pod)
    wait_backup_pod_ready_and_curl_installed(not_run_pod)

    with When('Backup is success'):
        exec_on_backup_container(
            not_run_pod,
            f'curl -X POST -sL "http://127.0.0.1:7171/backup/create?name={backup_name}"'
        )
        wait_backup_command_status(not_run_pod,
                                   f'create {backup_name}',
                                   expected_status='success')

        exec_on_backup_container(
            not_run_pod,
            f'curl -X POST -sL "http://127.0.0.1:7171/backup/upload/{backup_name}"'
        )
        wait_backup_command_status(not_run_pod,
                                   f'upload {backup_name}',
                                   expected_status='success')

    with Then("check ClickhouseBackupDoesntRunTooLong gone away"):
        resolved = alerts.wait_alert_state("ClickhouseBackupDoesntRunTooLong",
                                           "firing",
                                           expected_state=False,
                                           labels={"pod_name": not_run_pod})
        assert resolved, error(
            "can't get ClickhouseBackupDoesntRunTooLong alert is gone away")