Example #1
0
def get_prometheus_and_alertmanager_spec():
    """Look up pod specs of the prometheus-operator, alertmanager and prometheus.

    Returns a (prometheus_operator_spec, prometheus_spec, alertmanager_spec)
    tuple; fails the test step when the prometheus pod spec does not contain
    a first item with metadata (i.e. prometheus is not installed).
    """
    with Given("get information about prometheus installation"):
        def pods_with_label(label_selector):
            # The three lookups differ only by their label selector.
            return kubectl.get(
                "pod",
                ns=settings.prometheus_namespace,
                name="",
                label=label_selector,
            )

        prometheus_operator_spec = pods_with_label(
            "-l app.kubernetes.io/component=controller,app.kubernetes.io/name=prometheus-operator"
        )
        alertmanager_spec = pods_with_label(
            "-l app=alertmanager,alertmanager=alertmanager")
        prometheus_spec = pods_with_label(
            "-l app=prometheus,prometheus=prometheus")

        spec_is_valid = (
            "items" in prometheus_spec
            and len(prometheus_spec["items"]) > 0
            and "metadata" in prometheus_spec["items"][0]
        )
        if not spec_is_valid:
            fail("invalid prometheus_spec, please run create-prometheus.sh")
        return prometheus_operator_spec, prometheus_spec, alertmanager_spec
Example #2
0
def check_alert_state(alert_name,
                      prometheus_pod,
                      alert_state="firing",
                      labels=None,
                      time_range="10s"):
    """Check the prometheus query API for an alert in a given state.

    Runs ``wget`` against the local prometheus HTTP API via ``kubectl exec``
    and evaluates ``ALERTS{alertname=...,alertstate=...,<labels>}[time_range]``.

    :param alert_name: alert to look for (becomes the ``alertname`` label)
    :param prometheus_pod: name of the pod running the prometheus container
    :param alert_state: expected ``alertstate`` label value, e.g. "firing"
    :param labels: optional extra label filters; must be a dict if given
    :param time_range: prometheus range selector, e.g. "10s"
    :return: True when a result exists and its metric carries all requested
        labels, False otherwise; fails the test on bad labels or a
        non-success API response.
    """
    with Then(
            f"check {alert_name} for state {alert_state} and {labels} labels in {time_range}"
    ):
        cmd = f"exec -n {settings.prometheus_namespace} {prometheus_pod} -c prometheus -- "
        cmd += "wget -qO- 'http://127.0.0.1:9090/api/v1/query?query=ALERTS{"
        if labels is None:
            labels = {}
        if not isinstance(labels, dict):
            fail(f"Invalid labels={labels}")
        # Work on a copy: adding alertname/alertstate must not mutate the
        # caller's dict (previously the caller's labels were updated in place).
        labels = dict(labels)
        labels.update({"alertname": alert_name, "alertstate": alert_state})
        cmd += ",".join(
            [f"{name}=\"{value}\"" for name, value in labels.items()])
        cmd += f"}}[{time_range}]' 2>/dev/null"
        out = kubectl.launch(cmd)
        out = json.loads(out)
        if not ("status" in out and out["status"] == "success"):
            fail("wrong response from prometheus query API")
        if len(out["data"]["result"]) == 0:
            with Then("not present, empty result"):
                return False
        # All requested label pairs must appear in the first result's metric.
        result_labels = out["data"]["result"][0]["metric"].items()
        exists = all(item in result_labels for item in labels.items())
        with Then("got result and contains labels"
                  if exists else "got result, but doesn't contain labels"):
            return exists
Example #3
0
def set_operator_version(version, ns=settings.operator_namespace, timeout=60):
    """Roll the clickhouse-operator deployment to image tag *version*.

    Updates both containers of the deployment, waits for the rollout to
    finish (up to *timeout* seconds) and fails if no operator pod is left
    running afterwards.
    """
    deployment = "deployment.v1.apps/clickhouse-operator"
    container_images = {
        "clickhouse-operator": f"{settings.operator_docker_repo}:{version}",
        "metrics-exporter": f"{settings.metrics_exporter_docker_repo}:{version}",
    }
    for container, image in container_images.items():
        kubectl.launch(f"set image {deployment} {container}={image}", ns=ns)
    kubectl.launch(f"rollout status {deployment}", ns=ns, timeout=timeout)
    if kubectl.get_count("pod", ns=ns, label=operator_label) == 0:
        fail("invalid clickhouse-operator pod count")
Example #4
0
def is_expected_backup_status(command_name, command_is_done, st,
                              expected_status, err_status):
    """Evaluate one backup status record *st* for *command_name*.

    Returns a (matched, command_is_done) tuple: matched is True only when
    the record belongs to *command_name* and its status equals
    *expected_status* (command_is_done is then also True). A status equal
    to *err_status* fails the test; any other status sleeps 5 seconds so
    the caller can poll again.
    """
    # Ignore records for other commands (or malformed records).
    if 'command' not in st or st['command'] != command_name:
        return False, command_is_done
    status = st['status']
    if status == expected_status:
        return True, True
    if status == err_status:
        # Prefer the backend-provided error message when present.
        message = st['error'] if 'error' in st else f'unexpected status of {command_name} {st}'
        fail(message)
    else:
        with Then('Not ready, wait 5 sec'):
            time.sleep(5)
    return False, command_is_done
Example #5
0
 def query(self, q, params=(), fetch=True):
     """Execute query *q* on the ODBC connection.

     :param q: SQL text passed to ``cursor.execute``
     :param params: positional query parameters, unpacked into ``execute``
         (default changed from a mutable ``[]`` to an immutable ``()``;
         behavior is identical since the value is only unpacked)
     :param fetch: when True, fetch, log and return all result rows
     :return: list of rows when *fetch* is True, otherwise None
     """
     try:
         note(f"query: {q}")
         cursor = self.connection.cursor()
         cursor.execute(q, *params)
         if fetch:
             rows = cursor.fetchall()
             for row in rows:
                 note(row)
             return rows
     except pyodbc.Error as exc:
         # Record the traceback, then fail the test with the driver message.
         exception()
         fail(str(exc))
     finally:
         if self.logs and settings.debug:
             # sleep 0.5 sec to let messages to be written to the logs
             time.sleep(0.5)
             self.logs.read(timeout=0.1)
def test_read_only_replica(self):
    """Exercise the ClickHouseReadonlyReplica alert end to end.

    Creates a replicated test table, kills zookeeper (presumably driving a
    replica read-only — confirmed only indirectly via the alert), checks the
    alert fires and later resolves, waits for zookeeper to recover, then
    restarts/syncs the replicas and drops the test table.
    """
    # Pick one pod/service pair to drive read-only and another as its peer.
    read_only_pod, read_only_svc, other_pod, other_svc = alerts.random_pod_choice_for_callbacks(
        chi)
    chi_name = chi["metadata"]["name"]
    clickhouse.create_table_on_cluster(
        chi, 'all-replicated', 'default.test_repl',
        '(event_time DateTime, test UInt64) ' +
        'ENGINE ReplicatedMergeTree(\'/clickhouse/tables/{installation}-{shard}/test_repl\', \'{replica}\') ORDER BY tuple()'
    )

    def restart_zookeeper():
        # Kill PID 1 in zookeeper-0 (restarts the container), then attempt an
        # INSERT on the chosen replica; errors are expected and tolerated.
        kubectl.launch(
            f"exec -n {kubectl.namespace} zookeeper-0 -- sh -c \"kill 1\"",
            ok_to_fail=True,
        )
        clickhouse.query_with_error(
            chi_name,
            "INSERT INTO default.test_repl VALUES(now(),rand())",
            host=read_only_svc)

    with Then("check ClickHouseReadonlyReplica firing"):
        fired = alerts.wait_alert_state(
            "ClickHouseReadonlyReplica",
            "firing",
            True,
            labels={"hostname": read_only_svc},
            time_range='30s',
            sleep_time=settings.prometheus_scrape_interval,
            callback=restart_zookeeper)
        assert fired, error(
            "can't get ClickHouseReadonlyReplica alert in firing state")
    with Then("check ClickHouseReadonlyReplica gone away"):
        resolved = alerts.wait_alert_state("ClickHouseReadonlyReplica",
                                           "firing",
                                           False,
                                           labels={"hostname": read_only_svc})
        assert resolved, error(
            "can't check ClickHouseReadonlyReplica alert is gone away")

    # Wait until the zookeeper pod is Running and its container reports ready.
    kubectl.wait_pod_status("zookeeper-0", "Running", ns=kubectl.namespace)
    kubectl.wait_jsonpath("pod",
                          "zookeeper-0",
                          "{.status.containerStatuses[0].ready}",
                          "true",
                          ns=kubectl.namespace)

    # Poll zookeeper's 'ruok' four-letter command; give up after 10 retries
    # (2 seconds apart) if it never answers "imok".
    for i in range(11):
        zookeeper_status = kubectl.launch(
            f"exec -n {kubectl.namespace} zookeeper-0 -- sh -c \"echo ruok | nc 127.0.0.1 2181\"",
            ok_to_fail=True)
        if "imok" in zookeeper_status:
            break
        elif i == 10:
            fail(f"invalid zookeeper status after {i} retries")
        with Then("zookeper is not ready, wait 2 seconds"):
            time.sleep(2)

    # Restart and sync replicas on both services so the table is healthy
    # again before cleanup.
    clickhouse.query_with_error(
        chi_name,
        "SYSTEM RESTART REPLICAS; SYSTEM SYNC REPLICA default.test_repl",
        host=read_only_svc,
        timeout=240)
    clickhouse.query_with_error(
        chi_name,
        "SYSTEM RESTART REPLICAS; SYSTEM SYNC REPLICA default.test_repl",
        host=other_svc,
        timeout=240)

    clickhouse.drop_table_on_cluster(chi, 'all-replicated',
                                     'default.test_repl')