Example 1
def update_retention_check_interval(k8s_client,
                                    kafka_name,
                                    namespace="default",
                                    interval=1000):
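    # Patch log.retention.check.interval.ms on the Strimzi Kafka resource,
    # then wait for the Kafka StatefulSet to become ready again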
    custom_object_client = kubernetes.client.CustomObjectsApi(k8s_client)
    body = {
        "spec": {
            "kafka": {
                "config": {
                    "log.retention.check.interval.ms": interval
                }
            }
        }
    }
    custom_object_client.patch_namespaced_custom_object(
        namespace=namespace,
        group='kafka.strimzi.io',
        version='v1beta1',
        plural='kafkas',
        name=kafka_name,
        body=body)

    waiter.wait_for_predicate(lambda: kubectl.is_stateful_set_ready(
        k8s_client, f"{kafka_name}-kafka", namespace=namespace),
                              timeout=60)
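Every example in this listing blocks on the same polling helper. Its implementation is not part of the listing; a minimal sketch of such a helper, written against what the calls below assume (a callable predicate, timeout/interval keyword arguments with defaults, and the TimeoutError caught in Example 11), could look like this:

import time

def wait_for_predicate(predicate, timeout=60, interval=1):
    # Poll until the predicate returns a truthy value, or raise once the deadline expires
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        if predicate():
            return
        time.sleep(interval)
    raise TimeoutError(f"predicate was not satisfied within {timeout}s")

wait_nothrow (Example 9) would presumably be the same loop with the predicate call wrapped in try/except, so that a raising predicate counts as "not yet ready" instead of aborting the wait.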
Example 2
    def _expose(self):
        # Checks if kafka is already exposed
        if self._is_exposed:
            return

        logging.debug("Exposing kafka cluster")
        custom_object_client = kubernetes.client.CustomObjectsApi(self._cluster.Kubectl.client())
        kafka_spec = custom_object_client.get_namespaced_custom_object(namespace=self._namespace,
                                                                       group='kafka.strimzi.io',
                                                                       version='v1beta1',
                                                                       plural='kafkas',
                                                                       name=self._name)['spec']
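        # Advertise every broker on the master node's IP so clients outside the cluster can reach them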
        advertised_brokers = {'brokers': []}
        for i in range(0, kafka_spec['kafka']['replicas']):
            advertised_brokers['brokers'].append({'broker': i, 'advertisedHost': self._master.ip})

        kafka_spec['kafka']['listeners']['external'] = {'type': 'nodeport', 'tls': False, 'overrides': advertised_brokers}

        pods_timestamps = self._brokers_state()
        custom_object_client.patch_namespaced_custom_object(namespace=self._namespace,
                                                            group='kafka.strimzi.io',
                                                            version='v1beta1',
                                                            plural='kafkas',
                                                            name=self._name,
                                                            body={'spec': kafka_spec})

        logging.debug("Waiting for kafka brokers to restart")
        waiter.wait_for_predicate(lambda: self._kafka_brokers_restarted(pods_timestamps), timeout=30)
        waiter.wait_for_predicate(lambda: self._is_running, timeout=30)
Example 3
def delete_stateful_set_data(client,
                             name,
                             namespace='default',
                             clear_data=False,
                             timeout=60):
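    # Scale the StatefulSet down to zero, delete the PVCs created from its volume claim
    # templates, then scale back up and wait for it to become ready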
    v1_app = kubernetes.client.AppsV1Api(client)
    sts_spec = v1_app.read_namespaced_stateful_set(name=name,
                                                   namespace=namespace).spec
    replicas = sts_spec.replicas

    scale_stateful_set(client, 0, name, namespace)

    claim_templates = [
        volume.metadata.name for volume in sts_spec.volume_claim_templates
    ]
    pvcs_to_delete = []

    for template in claim_templates:
        for i in range(0, replicas):
            pvcs_to_delete.append(f"{template}-{name}-{i}")

    for pvc in pvcs_to_delete:
        delete_pvc(client, pvc, namespace, clear_data)

    scale_stateful_set(client, replicas, name, namespace, timeout=timeout)
    waiter.wait_for_predicate(lambda: is_stateful_set_ready(
        client, name, namespace=namespace),
                              timeout=timeout)
Example 4
    def run(self):
        self.kill()
        self.check_for_legacy_containers()
        ssh_direct = self._ssh_direct

        self.docker.login()

        logging.debug("running docker")
        run_cmd = f'{self._docker_bin_path} run -d --rm ' \
                  f'--volume=/tmp/automation_infra/:/tmp/automation_infra ' \
                  f'--volume=/etc/hosts:/etc/hosts ' \
                  f'--volume=/var/log/journal:/var/log/journal ' \
                  f'--volume=/storage/logs:/storage/logs ' \
                  f'--privileged ' \
                  f'--network=host ' \
                  f'--name=automation_proxy gcr.io/anyvision-training/automation-proxy:{self._automation_proxy_version()}'
        try:
            ssh_direct.execute(run_cmd)
            waiter.wait_for_predicate(lambda: self.running)
        except SSHCalledProcessError as e:
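            # A stale endpoint left attached to the host network blocks the new container;
            # disconnect it and retry the run command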
            if "endpoint with name automation_proxy already exists in network host" in e.stderr:
                ssh_direct.execute(
                    f"{self._docker_bin_path} network disconnect --force host automation_proxy"
                )
                ssh_direct.execute(run_cmd)
            if f"manifest for gcr.io/anyvision-training/automation-proxy:{self._automation_proxy_version()} not found" in e.stderr:
                logging.error(
                    f"tag {self._automation_proxy_version()} was not pushed to gcr, "
                    f"please run make push-automation-proxy from devops-infra repo"
                )
                raise e
            else:
                raise e
        logging.debug("docker is running")
Example 5
    def kill(self):
        if not self.running:
            logging.debug("nothing to remove")
            return
        logging.debug("trying to remove docker container")
        self._ssh_direct.execute(
            f"{self._docker_bin_path} kill automation_proxy")
        waiter.wait_for_predicate(lambda: not self.running)
        logging.debug("removed successfully!")
Example 6
def scale_deployment(client, replicas, name, namespace='default'):
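    # Patch the replica count, then wait until the deployment's status reports that many replicas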
    v1 = kubernetes.client.AppsV1Api(client)
    v1.patch_namespaced_deployment_scale(name=name,
                                         namespace=namespace,
                                         body={'spec': {
                                             'replicas': replicas
                                         }})
    waiter.wait_for_predicate(lambda: v1.read_namespaced_deployment_scale(
        name=name, namespace=namespace).status.replicas == replicas,
                              timeout=30)
Example 7
def clear_topic(admin, consumer, name):
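    # Temporarily drop retention.ms so the broker purges the topic's records, wait until
    # the beginning and end offsets meet, then restore the original retention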
    retention = get_topic_config_value(admin, name, 'retention.ms')
    update_topic_config(admin, name, {"retention.ms": 1000})
    consumer.subscribe(name)
    # Calling topics() forces the consumer's lazy topic/partition assignment to initialize
    consumer.topics()
    waiter.wait_for_predicate(lambda: consumer.beginning_offsets(
        consumer.assignment()) == consumer.end_offsets(consumer.assignment()))

    update_topic_config(admin, name, {"retention.ms": retention})
    consumer.close()
Example 8
    def kill(self):
        if not self.running:
            logging.debug("nothing to remove")
            return
        logging.debug("trying to remove automation-proxy daemonset")
        try:
            self._k8s_v1_client.delete_namespaced_daemon_set(
                name=self.daemon_set_name, namespace='default')
        except ApiException as e:
            logging.exception(
                "Exception when calling AppsV1Api->delete_namespaced_daemon_set: %s\n"
                % e)
        waiter.wait_for_predicate(lambda: not self.running)
        for host in self._cluster.hosts.values():
            # Drop the tunnels that pointed at the removed proxy pods
            host.TunnelManager.clear()
        logging.debug("removed successfully!")
Example 9
def setup_cluster(cluster, request):
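    # Install k3s on the main master, join the remaining masters and agents concurrently,
    # then wait until every node is registered and the cluster reports Ready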
    for host_name, config in request.function.__hardware_reqs.items():
        host = dict(cluster.hosts.items())[host_name]
        host.k3s_config = config['k3s_config']
        host.internal_ip = host.SshDirect.execute("hostname -I | awk '{print $1}'").strip()

    logging.info("Setting up k3s cluster")
    hosts = list(cluster.hosts.values())
    masters = [host for host in hosts if host.k3s_config["role"] == "master"]

    if not masters:
        raise Exception("Couldn't find any master node")
    main_master = next(iter(masters))
    main_master.k8s_name = "k3s-master"

    main_master.SshDirect.execute(
        "curl -sfL https://get.k3s.io | sh -s - --cluster-init --cluster-reset --cluster-reset-restore-path=/root/k3s-infra-1174-snapshot")
    waiter.wait_nothrow(lambda: main_master.SshDirect.execute("journalctl --since='1 min ago' | grep 'restart without'"))
    main_master.SshDirect.execute(
        "curl -sfL https://get.k3s.io | sh -s - --node-name=k3s-master --disable='servicelb,traefik,local-storage,metrics-server'")

    main_master.SshDirect.execute("sudo chmod o+r /etc/rancher/k3s/k3s.yaml")
    cluster_token = main_master.SshDirect.execute("sudo cat /var/lib/rancher/k3s/server/token").strip()
    cluster_ip = main_master.SshDirect.execute("hostname -I").strip().split()[0]
    waiter.wait_nothrow(lambda: main_master.SshDirect.execute("kubectl get nodes"))

    nodes = [host for host in hosts if host.k3s_config['role'] == "node"]
    masters.remove(main_master)

    jobs = {}
    nodes_jobs = {f"{host.alias}": partial(_join_agent, host, cluster_ip, cluster_token) for host in nodes}
    masters_jobs = {f"{master.alias}": partial(_join_master, master, cluster_ip, cluster_token) for master in masters}
    jobs.update(nodes_jobs)
    jobs.update(masters_jobs)
    if jobs:
        concurrently.run(jobs)

    logging.info("Waiting for cluster to be Ready...")
    k8s_client = cluster.Kubectl.client()
    v1 = kubernetes.client.CoreV1Api(k8s_client)
    waiter.wait_for_predicate(lambda: len(v1.list_node().items) == len(hosts), timeout=30)
    logging.info(f"Number of nodes in cluster: {len(v1.list_node().items)}")
    waiter.wait_for_predicate(lambda: kubectl.is_cluster_ready(k8s_client), timeout=60)

    logging.info("Adding node labels and taints")
    _label_and_taint_nodes(k8s_client, hosts)
Example 10
    def _create_service_account(self):
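        # Create a cluster-admin service account once and cache its API token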
        if self._api_token:
            return
        ssh = self._master.SshDirect
        try:
            ssh.execute("sudo kubectl create sa automation-admin")
            ssh.execute(
                "sudo kubectl create clusterrolebinding automation-admin --serviceaccount=default:automation-admin --clusterrole=cluster-admin"
            )
        except SSHCalledProcessError:
            # The service account and binding may already exist from a previous run
            pass

        get_sa_token = lambda: ssh.execute(
            r'''sudo kubectl get secrets -n default -o jsonpath="{.items[?(@.metadata.annotations['kubernetes\.io/service-account\.name']=='automation-admin')].data.token}"|base64 --decode'''
        ).strip()
        waiter.wait_for_predicate(get_sa_token, timeout=30)
        self._api_token = get_sa_token()
Example 11
def recycle_pvc(client, pvc_name, namespace='default', timeout=60):
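    # Wipe a PVC's contents by mounting it into a short-lived busybox pod that removes
    # everything under /scrub, then wait for the pod to succeed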
    k8s_client = kubernetes.client
    v1 = k8s_client.CoreV1Api(client)
    try:
        v1.read_namespaced_persistent_volume_claim(name=pvc_name,
                                                   namespace=namespace)
    except ApiException as e:
        if e.status == 404:
            raise ApiException(
                status=404,
                reason=f"Couldn't find pvc {pvc_name} in namespace {namespace}")
        raise

    container = kubernetes.client.V1Container(
        name="pv-cleaner",
        command=["/bin/sh", "-c", "rm -rf /scrub/*"],
        image="k8s.gcr.io/busybox",
        volume_mounts=[
            k8s_client.V1VolumeMount(name="pvc-volume", mount_path="/scrub")
        ])

    volume = k8s_client.V1Volume(
        name="pvc-volume",
        persistent_volume_claim=k8s_client.V1PersistentVolumeClaimVolumeSource(
            claim_name=pvc_name))
    pod_spec = k8s_client.V1PodSpec(volumes=[volume],
                                    containers=[container],
                                    restart_policy="Never")
    pod_name = f"pv-cleaner-{str(uuid.uuid4())[:6]}"
    pod = k8s_client.V1Pod(metadata=k8s_client.V1ObjectMeta(name=pod_name),
                           spec=pod_spec)

    v1.create_namespaced_pod(namespace=namespace, body=pod)

    try:
        waiter.wait_for_predicate(lambda: v1.read_namespaced_pod(
            name=pod_name, namespace=namespace).status.phase == "Succeeded",
                                  timeout=timeout)

    except TimeoutError as e:
        logging.debug(
            v1.read_namespaced_pod(name=pod_name, namespace=namespace).status)
        raise e

    v1.delete_namespaced_pod(name=pod_name, namespace=namespace)
Example 12
    def run(self):
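        # Redeploy the automation-proxy DaemonSet and wait until one pod is ready per host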
        self.kill()
        logging.debug("Deploying automation-proxy DaemonSet")
        kubectl.create_image_pull_secret(self._cluster.Kubectl.client())
        with open(
                os.path.join(os.path.dirname(os.path.abspath(__file__)),
                             "../../proxy_container/daemonset.yaml")) as f:
            ds_yaml = yaml.safe_load(f)
        ds_yaml['spec']['template']['spec']['containers'][0][
            'image'] = f'gcr.io/anyvision-training/automation-proxy:{self._automation_proxy_version()}'
        try:
            res = self._k8s_v1_client.create_namespaced_daemon_set(
                namespace="default", body=ds_yaml)
        except ApiException as e:
            logging.exception(
                "Exception when calling AppsV1Api->create_namespaced_daemon_set: %s\n"
                % e)
            raise

        waiter.wait_for_predicate(
            lambda: self._num_ready_pods() == len(self._cluster.hosts),
            timeout=120)
        logging.debug(f"Deployment created. status={res.metadata.name}")
Example 13
    def delete_app_data(self,
                        name,
                        label_value=None,
                        label_name="app",
                        resource_type="statefulset"):
        label_value = label_value or name
        logging.debug(f"get {name} {resource_type} pods")
        pod_list = self.get_pods_using_selector_labels(
            label_name=label_name, label_value=label_value)['items']
        num_of_pods = len(pod_list)
        if num_of_pods == 0:
            raise Exception(f"unable to find {name} {resource_type} pods")
        pvc_list = []
        pv_list = []
        for pod in pod_list:
            pod_name = pod["metadata"]["name"]
            logging.debug(f"get pvc name from {name} pod")
            pvc_name = self.get_pvc_by_pod_name(pod_name)
            pvc_list.append(pvc_name)
            logging.debug(f"get pv name from {pvc_name} pvc")
            pv_name = self.get_pv_by_pvc_name(pvc_name)
            pv_list.append(pv_name)
        for pv in pv_list:
            logging.debug(f"set reclaim policy \"Delete\" to {pv} pv")
            self.set_pv_reclaim_policy(pv, "Delete")
        logging.debug(f"scale down {resource_type}: {name}")
        self.scale(name, resource_type, replicas=0)
        self.delete_pod_by_label(label_value, label_name, "true", 0)
        wait_for_predicate(
            lambda: self.num_of_pod_replicas(name, resource_type) == 0, 120)
        for pvc in pvc_list:
            logging.debug(f"delete {pvc} pvc")
            self.delete_pvc(pvc)
        logging.debug(f"scale up {resource_type} {name}")
        self.scale(name, resource_type, replicas=num_of_pods)
        wait_for_predicate_nothrow(
            lambda: self.num_of_ready_pod_replicas(name, resource_type) ==
            num_of_pods, 180)
Example 14
def test_cluster_network_master_restart(
        base_config,
        clean_up_all_deployments_and_svcs,
        amount_of_replicas=100,
        docker_image_name='gcr.io/hello-minikube-zero-install/hello-node',
        deployment_name="test"):
    # The clean_up_all_deployments_and_svcs fixture cleans up before the test starts
    base_config.hosts.host1.SshDirect.connect(timeout=60)
    create_deployment_with_replicas(base_config.hosts.host1, deployment_name,
                                    docker_image_name, amount_of_replicas)
    base_config.hosts.host1.Power.reboot()
    # Check host has started again
    wait_for_predicate_nothrow(
        lambda: host_is_active(base_config.hosts.host1.ip), timeout=60)
    base_config.hosts.host1.SshDirect.connect(timeout=60)
    wait_for_predicate(
        lambda: base_config.hosts.host1.Gravity.is_cluster_healthy(),
        timeout=120,
        interval=5)
    wait_for_predicate_nothrow(lambda: all_deployments_pods_alive(
        base_config.hosts.host1, deployment_name),
                               timeout=300,
                               interval=10)
Example 15
    def wait_container_health_status(self, name_regex, status, timeout=100):
        # Poll the docker health check of the matching container until it reports the desired status
        waiter.wait_for_predicate(
            lambda: self.get_container_health_status(name_regex) == status,
            timeout=timeout)
Example 16
def wait_for_job_to_succeed(client, job_name, namespace='default', timeout=60):
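    # A Job counts as done once its status reports one successful completion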
    waiter.wait_for_predicate(lambda: get_job_status(
        client, namespace=namespace, job_name=job_name).succeeded == 1,
                              timeout=timeout)
Example 17
def create_deployment_with_replicas(host, name, docker_image,
                                    amount_of_replicas):
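    # wait_for_predicate retries create_deployment until it returns truthy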
    wait_for_predicate(lambda: host.K8s.create_deployment(name, docker_image),
                       timeout=120)
    host.K8s.scale_deployment(name, int(amount_of_replicas))
    host.K8s.expose_deployment(name)
Example 18
 def reboot(self, options=""):
     # Reboots the host and verifies using a ping
     host = self._host
     host.SshDirect.execute(
         f"sudo /sbin/reboot {options} > /dev/null 2>&1 &", timeout=0.1)
     wait_for_predicate(lambda: not host_is_active(host.ip), timeout=20)
Example 19
    def wait_for_redis_to_be_up(self):
        waiter.wait_for_predicate(lambda: self.ping(), timeout=30)
Example 20
    def restart_pod_by_service_name(self, service_name):
        # Deleting the pod makes the deployment recreate it; wait until the replacement is ready
        self.delete_pod_by_service_name(service_name)
        wait_for_predicate(
            lambda: self.number_ready_pods_in_deployment(service_name) == 1)