Ejemplo n.º 1
0
    def test_install_amq_cephfs(self, test_fixture_amq):
        """
        Basic AMQ installation check.

        Sets up AMQ through the fixture and then verifies, one pod pattern
        after another, that the cluster-operator, zookeeper, connect and
        bridge pods are reported as running.
        """
        amq = test_fixture_amq.setup_amq()

        # (pod pattern, message logged on success, message raised on failure).
        # The checks run in this exact order and the first failure aborts.
        pod_checks = (
            (
                "cluster-operator",
                "strimzi-cluster-operator pod is in running state",
                "strimzi-cluster-operator pod is not getting to running state",
            ),
            (
                "zookeeper",
                "my-cluster-zookeeper Pod is in running state",
                "my-cluster-zookeeper Pod is not getting to running state",
            ),
            (
                "my-connect-cluster-connect",
                "my-connect-cluster-connect Pod is in running state",
                "my-connect-cluster-connect pod is not getting to running state",
            ),
            (
                "my-bridge-bridge",
                "my-bridge-bridge Pod is in running state",
                "my-bridge-bridge is not getting to running state",
            ),
        )
        for pattern, running_msg, failure_msg in pod_checks:
            if not amq.is_amq_pod_running(pod_pattern=pattern):
                raise ResourceWrongStatusException(failure_msg)
            log.info(running_msg)
Ejemplo n.º 2
0
    def wait_for_phase(self, phase, timeout=300, sleep=5):
        """
        Block until the resource reports the requested phase.

        Args:
            phase (str): Phase the resource is expected to reach
            timeout (int): Maximum number of seconds to wait
            sleep (int): Number of seconds between consecutive checks

        Raises:
            ResourceWrongStatusException: If the sampler did not report the
                expected phase.
            NotSupportedFunctionError: If the resource doesn't have a phase
            ResourceNameNotSpecifiedException: If the resource name is not
                specified.

        """
        # Preconditions: the resource kind must expose a phase and the
        # resource must be named.
        self.check_function_supported(self._has_phase)
        self.check_name_is_specified()

        phase_sampler = TimeoutSampler(
            timeout, sleep, func=self.check_phase, phase=phase
        )
        phase_reached = phase_sampler.wait_for_func_status(True)
        if phase_reached:
            return
        raise ResourceWrongStatusException(
            f"Resource: {self.resource_name} is not in expected phase: {phase}"
        )
Ejemplo n.º 3
0
    def setup_amq_kafka_connect(self):
        """
        Create the KafkaConnect resource and verify its pod is running.

        The resource definition is loaded from the KafkaConnect yaml located
        under the AMQ working directory.

        Returns: kafka_connect object

        Raises:
            CommandFailed, CalledProcessError: Re-raised (after logging) when
                loading or creating the resource fails
            ResourceWrongStatusException: If the connect pod is not reported
                as running
        """
        connect_yaml_path = os.path.join(self.dir, self.amq_kafka_connect_yaml)
        try:
            connect_data = templating.load_yaml(connect_yaml_path)
            self.kafka_connect = OCS(**connect_data)
            self.kafka_connect.create()
        except (CommandFailed, CalledProcessError):
            log.error("Failed during setup of AMQ KafkaConnect")
            raise

        connect_running = self.is_amq_pod_running(
            pod_pattern="my-connect-cluster-connect", expected_pods=1
        )
        if not connect_running:
            raise ResourceWrongStatusException(
                "my-connect-cluster-connect pod is not getting to running state"
            )
        return self.kafka_connect
Ejemplo n.º 4
0
    def create_kafka_topic(self, name="my-topic", partitions=1, replicas=1):
        """
        Create a KafkaTopic resource, tolerating a topic that already exists.

        Args:
            name (str): Name of the kafka topic
            partitions (int): Number of partitions
            replicas (int): Number of replicas

        Return: kafka_topic object

        Raises:
            CommandFailed, CalledProcessError: Re-raised when creation fails
                for any reason other than the topic already existing
            ResourceWrongStatusException: If the topic cannot be fetched
                after creation
        """
        already_exists_msg = f'kafkatopics.kafka.strimzi.io "{name}" already exists'
        try:
            topic_data = templating.load_yaml(
                os.path.join(self.dir, self.kafka_topic_yaml)
            )
            topic_data["metadata"]["name"] = name
            topic_spec = topic_data["spec"]
            topic_spec["partitions"] = partitions
            topic_spec["replicas"] = replicas
            self.kafka_topic = OCS(**topic_data)
            self.kafka_topic.create()
        except (CommandFailed, CalledProcessError) as cf:
            # An "already exists" failure is ignored on purpose
            if already_exists_msg not in str(cf):
                log.error("Failed during creating of Kafka topic")
                raise

        # Making sure kafka topic created
        if not self.kafka_topic_obj.get(resource_name=name):
            raise ResourceWrongStatusException("kafka topic is not created")
        return self.kafka_topic
Ejemplo n.º 5
0
    def create_kafka_user(self, name="my-user"):
        """
        Create a KafkaUser resource and confirm it can be fetched.

        Args:
             name (str): Name of the kafka user

        Return: kafka_user object

        Raises:
            CommandFailed, CalledProcessError: Re-raised (after logging) when
                loading or creating the resource fails
            ResourceWrongStatusException: If the user cannot be fetched after
                creation

        """
        user_yaml_path = os.path.join(self.dir, self.kafka_user_yaml)
        try:
            user_data = templating.load_yaml(user_yaml_path)
            user_data["metadata"]["name"] = name
            self.kafka_user = OCS(**user_data)
            self.kafka_user.create()
        except (CommandFailed, CalledProcessError):
            log.error("Failed during creating of Kafka user")
            raise

        # Making sure kafka user created
        if not self.kafka_user_obj.get(resource_name=name):
            raise ResourceWrongStatusException("kafka user is not created")
        return self.kafka_user
Ejemplo n.º 6
0
    def create_consumer_pod(self, num_of_pods=1, value="10000"):
        """
        Create the hello-world consumer pods and verify they are running.

        Args:
            num_of_pods (int): Number of consumer pods to be created
            value (str): Number of messages to be received

        Returns: consumer pod object

        Raises:
            CommandFailed, CalledProcessError: Re-raised (after logging) when
                loading or creating the resource fails
            ResourceWrongStatusException: If the consumer pods are not
                reported as running

        """
        try:
            consumer_data = templating.load_yaml(constants.HELLO_WORLD_CONSUMER_YAML)
            consumer_spec = consumer_data["spec"]
            consumer_spec["replicas"] = num_of_pods
            # The fifth env entry of the first container carries the value
            first_container = consumer_spec["template"]["spec"]["containers"][0]
            first_container["env"][4]["value"] = value
            self.consumer_pod = OCS(**consumer_data)
            self.consumer_pod.create()
        except (CommandFailed, CalledProcessError):
            log.error("Failed during creation of consumer pod")
            raise

        # Making sure the consumer pods are running
        consumers_running = self.is_amq_pod_running(
            pod_pattern="hello-world-consumer", expected_pods=num_of_pods
        )
        if not consumers_running:
            raise ResourceWrongStatusException(
                "consumer pod is not getting to running state"
            )
        return self.consumer_pod
Ejemplo n.º 7
0
def wait_for_new_node_to_be_ready(machine_set, timeout=300):
    """
    Wait until the ready replica count of a machine set matches its
    replica count.

    Args:
        machine_set (str): Name of the machine set
        timeout (int): Time in seconds to keep sampling the ready replica
            count (sampled every 15 seconds)

    Raises:
        ResourceWrongStatusException: In case the new spun machine fails
            to reach Ready state or replica count didn't match

    """
    # The target is read once, before sampling starts
    replica_count = get_replica_count(machine_set)
    try:
        for ready_replicas in TimeoutSampler(
            timeout, 15, get_ready_replica_count, machine_set=machine_set
        ):
            if replica_count != ready_replicas:
                continue
            log.info("New spun node reached Ready state")
            break
    except TimeoutExpiredError:
        log.error(
            "New spun node failed to reach ready state OR "
            "Replica count didn't match ready replica count"
        )
        machine_descriptions = [
            m.describe() for m in get_machineset_objs(machine_set)
        ]
        raise ResourceWrongStatusException(machine_set, machine_descriptions)
Ejemplo n.º 8
0
    def setup_amq_cluster_operator(self, namespace=constants.AMQ_NAMESPACE):
        """
        Set up the strimzi cluster operator.

        Creates the namespace (an already existing project is tolerated),
        creates one resource per file found in the operator install
        directory and verifies the cluster-operator pod is running.

        Args:
            namespace (str): Namespace for AMQ pods

        Raises:
            CommandFailed: If namespace creation fails for any reason other
                than the project already existing
            ResourceWrongStatusException: If the cluster-operator pod is not
                reported as running

        """

        # Namespace for amq
        try:
            self.create_namespace(namespace)
        except CommandFailed as ef:
            if f'project.project.openshift.io "{namespace}" already exists' not in str(
                ef
            ):
                raise

        # Create strimzi-cluster-operator pod
        # NOTE(review): sed is invoked without -i, so this command appears to
        # only print the substituted content and leave the files untouched;
        # kept as-is to preserve behaviour - confirm the intent.
        run(
            f"for i in `(ls strimzi-kafka-operator/packaging/install/cluster-operator/)`;"
            f"do sed 's/{namespace}/myproject/g' "
            f"strimzi-kafka-operator/packaging/install/cluster-operator/$i;done",
            shell=True,
            check=True,
            cwd=self.dir,
        )
        self.strimzi_kafka_operator = os.path.join(self.dir, self.amq_dir)
        self.crd_objects = []
        for adm_yaml in os.listdir(self.strimzi_kafka_operator):
            try:
                # os.path.join keeps the path valid whether or not the
                # directory value ends with a path separator
                adm_data = templating.load_yaml(
                    os.path.join(self.strimzi_kafka_operator, adm_yaml)
                )
                adm_obj = OCS(**adm_data)
                adm_obj.create()
                self.crd_objects.append(adm_obj)
            except (CommandFailed, CalledProcessError) as cfe:
                if "Error is Error from server (AlreadyExists):" in str(cfe):
                    log.warning(
                        "Some amq leftovers are present, please cleanup the cluster"
                    )
                    pytest.skip(
                        "AMQ leftovers are present needs to cleanup the cluster"
                    )
                # Other failures stay non-fatal (as before) but are no longer
                # silently dropped.
                log.error(f"Failed to create resource from {adm_yaml}: {cfe}")
        time.sleep(30)
        #  Check strimzi-cluster-operator pod created
        if self.is_amq_pod_running(pod_pattern="cluster-operator", expected_pods=1):
            log.info("strimzi-cluster-operator pod is in running state")
        else:
            raise ResourceWrongStatusException(
                "strimzi-cluster-operator pod is not getting to running state"
            )
Ejemplo n.º 9
0
    def setup_amq_cluster_operator(self):
        """
        Set up the strimzi cluster operator.

        Applies the operator manifests, checks that the cluster-operator pod
        is running, applies the example manifests and checks the pod again.

        Raises:
            ResourceWrongStatusException: If the cluster-operator pod is not
                reported as running after either apply step
        """
        running_msg = "strimzi-cluster-operator pod is in running state"
        not_running_msg = "strimzi-cluster-operator pod is not getting to running state"

        run(f'oc apply -f {self.amq_dir} -n {self.namespace}', shell=True, check=True, cwd=self.dir)
        time.sleep(5)
        # Wait for strimzi-cluster-operator pod to be created
        if not self.is_amq_pod_running(pod_pattern="cluster-operator"):
            raise ResourceWrongStatusException(not_running_msg)
        log.info(running_msg)

        run(f'oc apply -f {self.amq_dir_examples} -n {self.namespace}', shell=True, check=True, cwd=self.dir)
        # The operator pod is verified once more after the examples are applied
        if not self.is_amq_pod_running(pod_pattern="cluster-operator"):
            raise ResourceWrongStatusException(not_running_msg)
        log.info(running_msg)
Ejemplo n.º 10
0
    def setup_amq_kafka_persistent(self, sc_name, size=100, replicas=3):
        """
        Create the persistent Kafka resource and verify its pods are running.

        The kafka and zookeeper sections of the loaded yaml get the replica
        count, storage class and storage size before the resource is created.

        Args:
            sc_name (str): Name of sc (replaced by the default external mode
                RBD storage class when storagecluster_independent_check()
                is true)
            size (int): Size of the storage in Gi
            replicas (int): Number of kafka and zookeeper pods to be created

        return : kafka_persistent

        Raises:
            CommandFailed, CalledProcessError: Re-raised (after logging) when
                loading or creating the resource fails
            ResourceWrongStatusException: If the expected number of
                "my-cluster" pods is not reported as running

        """
        if storagecluster_independent_check():
            sc_name = constants.DEFAULT_EXTERNAL_MODE_STORAGECLASS_RBD
        storage_size = f"{size}Gi"
        try:
            kafka_data = templating.load_yaml(
                os.path.join(self.dir, self.amq_kafka_pers_yaml)
            )
            kafka_spec = kafka_data["spec"]["kafka"]
            kafka_spec["replicas"] = replicas
            kafka_volume = kafka_spec["storage"]["volumes"][0]
            kafka_volume["class"] = sc_name
            kafka_volume["size"] = storage_size

            zookeeper_spec = kafka_data["spec"]["zookeeper"]
            zookeeper_spec["replicas"] = replicas
            zookeeper_storage = zookeeper_spec["storage"]
            zookeeper_storage["class"] = sc_name
            zookeeper_storage["size"] = storage_size
            self.kafka_persistent = OCS(**kafka_data)
            self.kafka_persistent.create()

        except (CommandFailed, CalledProcessError):
            log.error("Failed during setup of AMQ Kafka-persistent")
            raise
        time.sleep(40)

        # kafka + zookeeper replicas plus one more "my-cluster" pod
        # (presumably the entity operator - confirm)
        expected_pod_count = (replicas * 2) + 1
        if not self.is_amq_pod_running(
            pod_pattern="my-cluster", expected_pods=expected_pod_count
        ):
            raise ResourceWrongStatusException(
                "my-cluster-kafka and my-cluster-zookeeper "
                "Pod is not getting to running state"
            )
        return self.kafka_persistent
Ejemplo n.º 11
0
    def setup_amq_cluster_operator(self, namespace=constants.AMQ_NAMESPACE):
        """
        Set up the strimzi cluster operator with a single `oc create`.

        Creates the namespace (an already existing project is tolerated),
        creates everything found in the operator install directory and
        verifies the cluster-operator pod is running.

        Args:
            namespace (str): Namespace for AMQ pods

        Raises:
            CommandFailed: If namespace creation fails for any reason other
                than the project already existing
            ResourceWrongStatusException: If the cluster-operator pod is not
                reported as running

        """
        # Namespace for amq
        project_exists_msg = f'project.project.openshift.io "{namespace}" already exists'
        try:
            self.create_namespace(namespace)
        except CommandFailed as ef:
            if project_exists_msg not in str(ef):
                raise

        # Create strimzi-cluster-operator pod
        operator_install_dir = "strimzi-kafka-operator/packaging/install/cluster-operator/"
        sed_loop_cmd = (
            f"for i in `(ls {operator_install_dir})`;"
            f"do sed 's/{namespace}/myproject/g' "
            f"{operator_install_dir}$i;done"
        )
        run(sed_loop_cmd, shell=True, check=True, cwd=self.dir)

        self.strimzi_kafka_operator = os.path.join(self.dir, self.amq_dir)
        cmd = f"oc create -f {self.strimzi_kafka_operator}"
        log.info(f"Executing cmd: {cmd}")
        run(cmd, shell=True, check=True, cwd=self.dir)
        time.sleep(10)

        #  Check strimzi-cluster-operator pod created
        operator_running = self.is_amq_pod_running(
            pod_pattern="cluster-operator", expected_pods=1
        )
        if not operator_running:
            raise ResourceWrongStatusException(
                "strimzi-cluster-operator pod is not getting to running state"
            )
        log.info("strimzi-cluster-operator pod is in running state")
Ejemplo n.º 12
0
    def setup_amq_kafka_bridge(self):
        """
        Create the KafkaBridge resource and verify its pod is running.

        The resource definition is loaded from the KafkaBridge yaml located
        under the AMQ working directory.

        Return: kafka_bridge object

        Raises:
            CommandFailed, CalledProcessError: Re-raised (after logging) when
                loading or creating the resource fails
            ResourceWrongStatusException: If the bridge pod is not reported
                as running
        """
        try:
            kafka_bridge = templating.load_yaml(os.path.join(self.dir, self.amq_kafka_bridge_yaml))
            self.kafka_bridge = OCS(**kafka_bridge)
            self.kafka_bridge.create()
        except (CommandFailed, CalledProcessError):
            # The message names KafkaBridge; it previously said KafkaConnect,
            # which pointed at the wrong setup step.
            log.error('Failed during setup of AMQ KafkaBridge')
            raise
        # Making sure the kafka_bridge is running
        if self.is_amq_pod_running(pod_pattern="my-bridge-bridge"):
            return self.kafka_bridge
        else:
            raise ResourceWrongStatusException("kafka_bridge_pod pod is not getting to running state")
Ejemplo n.º 13
0
    def setup_amq_kafka_persistent(self, sc_name, size=100, replicas=3):
        """
        Create the persistent Kafka resource and verify its pods are running.

        The kafka and zookeeper sections of the loaded yaml get the replica
        count, storage class and storage size before the resource is created.

        Args:
            sc_name (str): Name of sc
            size (int): Size of the storage in Gi
            replicas (int): Number of kafka and zookeeper pods to be created

        return : kafka_persistent

        Raises:
            CommandFailed, CalledProcessError: Re-raised (after logging) when
                loading or creating the resource fails
            ResourceWrongStatusException: If the zookeeper or kafka pods are
                not reported as running

        """
        storage_size = f"{size}Gi"
        try:
            kafka_data = templating.load_yaml(
                os.path.join(self.dir, self.amq_kafka_pers_yaml)
            )
            kafka_spec = kafka_data['spec']['kafka']
            kafka_spec['replicas'] = replicas
            kafka_volume = kafka_spec['storage']['volumes'][0]
            kafka_volume['class'] = sc_name
            kafka_volume['size'] = storage_size

            zookeeper_spec = kafka_data['spec']['zookeeper']
            zookeeper_spec['replicas'] = replicas
            zookeeper_storage = zookeeper_spec['storage']
            zookeeper_storage['class'] = sc_name
            zookeeper_storage['size'] = storage_size
            self.kafka_persistent = OCS(**kafka_data)
            self.kafka_persistent.create()

        except (CommandFailed, CalledProcessError):
            log.error('Failed during setup of AMQ Kafka-persistent')
            raise
        time.sleep(40)

        # Zookeeper is checked first; kafka is only checked when zookeeper
        # passed (all() stops at the first failing pattern).
        pods_running = all(
            self.is_amq_pod_running(pod_pattern=pattern, expected_pods=replicas)
            for pattern in ("my-cluster-zookeeper", "my-cluster-kafka")
        )
        if not pods_running:
            raise ResourceWrongStatusException(
                "my-cluster-kafka and my-cluster-zookeeper "
                "Pod is not getting to running state"
            )
        return self.kafka_persistent
Ejemplo n.º 14
0
 def verify_uls_state(self, uls_name, is_available):
     """
     Verify that an Underlying Storage exists (or no longer exists).

     Samples self.verify_uls_exists every 15 seconds for up to 180 seconds
     until it returns the value of is_available.

     Args:
         uls_name (str): Name of the Underlying Storage to check
         is_available (bool): True to wait for the storage to exist,
             False to wait for it to be gone

     Raises:
         ResourceWrongStatusException: If the storage was expected to exist
             but was not found in time. A storage that is still present when
             it should be gone only produces a warning.

     """
     check_type = "Create" if is_available else "Delete"
     uls_sampler = TimeoutSampler(
         timeout=180, sleep=15, func=self.verify_uls_exists, uls_name=uls_name
     )
     if uls_sampler.wait_for_func_status(result=is_available):
         logger.info(
             f"Underlying Storage {uls_name} {check_type.lower()}d successfully."
         )
         return

     if is_available:
         raise ResourceWrongStatusException(
             f"{check_type[:-1]}ion of Underlying Storage {uls_name} timed out. "
             f"Unable to {check_type.lower()} {uls_name}"
         )
     # A failed deletion is reported but is not treated as an error
     logger.warning(
         f"{uls_name} still found after 3 minutes, and might require manual removal."
     )
Ejemplo n.º 15
0
    def setup_amq_kafka_persistent(self):
        """
        Create the persistent Kafka resource and verify zookeeper is running.

        The resource definition is loaded unmodified from the Kafka
        persistent yaml located under the AMQ working directory.

        Returns: kafka_persistent

        Raises:
            CommandFailed, CalledProcessError: Re-raised (after logging) when
                loading or creating the resource fails
            ResourceWrongStatusException: If the zookeeper pod is not
                reported as running
        """
        persistent_yaml_path = os.path.join(self.dir, self.amq_kafka_pers_yaml)
        try:
            persistent_data = templating.load_yaml(persistent_yaml_path)
            self.kafka_persistent = OCS(**persistent_data)
            self.kafka_persistent.create()
        except (CommandFailed, CalledProcessError):
            log.error('Failed during setup of AMQ Kafka-persistent')
            raise

        time.sleep(5)
        if not self.is_amq_pod_running(pod_pattern="zookeeper"):
            raise ResourceWrongStatusException("my-cluster-zookeeper Pod is not getting to running state")
        return self.kafka_persistent
Ejemplo n.º 16
0
    def wait_for_state(self, state, timeout=480, sleep=5):
        """
        Block until the catalog source reports the requested state.

        Args:
            state (str): State the catalog source is expected to reach
            timeout (int): Maximum number of seconds to wait
            sleep (int): Number of seconds between consecutive checks

        Raises:
            ResourceWrongStatusException: If the sampler did not report the
                expected state.

        """
        self.check_name_is_specified()

        state_sampler = TimeoutSampler(timeout, sleep, self.check_state, state=state)
        state_reached = state_sampler.wait_for_func_status(True)
        if state_reached:
            return
        raise ResourceWrongStatusException(
            f"Catalog source: {self.resource_name} is not in expected state: {state}"
        )
Ejemplo n.º 17
0
def wait_for_resource_state(resource, state, timeout=60):
    """
    Wait until a resource reaches the given state.

    Args:
        resource (OCS obj): The resource object
        state (str): The status to wait for
        timeout (int): Time in seconds to wait

    Raises:
        ResourceWrongStatusException: If waiting timed out before the
            resource reached the desired state. The resource is reloaded
            first and its description is attached to the exception.

    """
    try:
        resource.ocp.wait_for_resource(
            condition=state, resource_name=resource.name, timeout=timeout
        )
    except TimeoutExpiredError:
        logger.error(f"{resource.kind} {resource.name} failed to reach {state}")
        # Refresh the object so the attached description is current
        resource.reload()
        raise ResourceWrongStatusException(resource.name, resource.describe())
    else:
        logger.info(f"{resource.kind} {resource.name} reached state {state}")
Ejemplo n.º 18
0
    def run_amq_benchmark(
        self,
        benchmark_pod_name="benchmark",
        kafka_namespace=constants.AMQ_NAMESPACE,
        tiller_namespace=AMQ_BENCHMARK_NAMESPACE,
        num_of_clients=8,
        worker=None,
        timeout=1800,
        amq_workload_yaml=None,
        run_in_bg=False,
    ):
        """
        Run benchmark pod and get the results

        The steps are, in order: create the tiller namespace and its RBAC
        objects, download helm and initialise tiller, install the benchmark
        chart, copy the kafka driver config and the workload file into the
        "<benchmark_pod_name>-driver" pod and finally run the benchmark
        command inside that pod (synchronously or in a background thread).

        Args:
            benchmark_pod_name (str): Name of the benchmark pod
            kafka_namespace (str): Namespace where kafka cluster created
            tiller_namespace (str): Namespace where tiller pod needs to be created
            num_of_clients (int): Number of clients to be created
            worker (str) : Loads to create on workloads separated with commas
                e.g http://benchmark-worker-0.benchmark-worker:8080,
                http://benchmark-worker-1.benchmark-worker:8080
            timeout (int): Time to complete the run
            amq_workload_yaml (dict): Contains amq workloads information keys and values
                :name (str): Name of the workloads
                :topics (int): Number of topics created
                :partitions_per_topic (int): Number of partitions per topic
                :message_size (int): Message size
                :payload_file (str): Load to run on workload
                :subscriptions_per_topic (int): Number of subscriptions per topic
                :consumer_per_subscription (int): Number of consumers per subscription
                :producers_per_topic (int): Number of producers per topic
                :producer_rate (int): Producer rate
                :consumer_backlog_sizegb (int): Size of block in gb
                :test_duration_minutes (int): Time to run the workloads
                When not given, the default workload yaml is loaded instead.
            run_in_bg (bool): On true the workload will run in background

        Return:
            result (str/Thread obj): Returns benchmark run information if run_in_bg is False.
                Otherwise the object returned by ThreadPoolExecutor.submit for
                the amq workload execution

        Raises:
            CommandFailed: If namespace creation fails for a reason other than
                the project already existing, or if RBAC creation fails
            CalledProcessError: If RBAC creation fails
            ResourceWrongStatusException: If the tiller pod or the benchmark
                pods are not reported as running

        """

        # Namespace for helm/tiller; an already existing project is tolerated
        try:
            self.create_namespace(tiller_namespace)
        except CommandFailed as ef:
            if (
                f'project.project.openshift.io "{tiller_namespace}" already exists'
                not in str(ef)
            ):
                raise ef

        # Create rbac file
        # The RBAC yaml is read as two documents: index 0 gets its metadata
        # namespace set, index 1 gets the namespace of its first subject set.
        try:
            sa_tiller = list(
                templating.load_yaml(constants.AMQ_RBAC_YAML, multi_document=True)
            )
            sa_tiller[0]["metadata"]["namespace"] = tiller_namespace
            sa_tiller[1]["subjects"][0]["namespace"] = tiller_namespace
            self.sa_tiller = OCS(**sa_tiller[0])
            self.crb_tiller = OCS(**sa_tiller[1])
            self.sa_tiller.create()
            self.crb_tiller.create()
        except (CommandFailed, CalledProcessError) as cf:
            log.error("Failed during creation of service account tiller")
            raise cf

        # Install helm cli (a helm v2 release, as we need the tiller component)
        # And create tiller pods
        # NOTE(review): the untar command hard-codes the v2.16.1 tarball name,
        # so URL (defined outside this block) must point at that file - confirm.
        # NOTE(review): --service-account is given the namespace name; this
        # assumes the service account is named like the namespace - confirm.
        wget_cmd = f"wget -c --read-timeout=5 --tries=0 {URL}"
        untar_cmd = "tar -zxvf helm-v2.16.1-linux-amd64.tar.gz"
        tiller_cmd = (
            f"linux-amd64/helm init --tiller-namespace {tiller_namespace}"
            f" --service-account {tiller_namespace}"
        )
        exec_cmd(cmd=wget_cmd, cwd=self.dir)
        exec_cmd(cmd=untar_cmd, cwd=self.dir)
        exec_cmd(cmd=tiller_cmd, cwd=self.dir)

        # Validate tiller pod is running
        log.info("Waiting for 30s for tiller pod to come up")
        time.sleep(30)
        if self.is_amq_pod_running(
            pod_pattern="tiller", expected_pods=1, namespace=tiller_namespace
        ):
            log.info("Tiller pod is running")
        else:
            raise ResourceWrongStatusException("Tiller pod is not in running state")

        # Create benchmark pods
        # NOTE(review): `values` is loaded and updated with numWorkers but is
        # not passed to the helm install command below, so within this block
        # num_of_clients only affects the expected pod count - confirm intent.
        log.info("Create benchmark pods")
        values = templating.load_yaml(constants.AMQ_BENCHMARK_VALUE_YAML)
        values["numWorkers"] = num_of_clients
        benchmark_cmd = (
            f"linux-amd64/helm install {constants.AMQ_BENCHMARK_POD_YAML}"
            f" --name {benchmark_pod_name} --tiller-namespace {tiller_namespace}"
        )
        exec_cmd(cmd=benchmark_cmd, cwd=self.dir)

        # Making sure the benchmark pod and clients are running
        # (one driver pod plus num_of_clients pods matching "benchmark")
        if self.is_amq_pod_running(
            pod_pattern="benchmark",
            expected_pods=(1 + num_of_clients),
            namespace=tiller_namespace,
        ):
            log.info("All benchmark pod is up and running")
        else:
            raise ResourceWrongStatusException(
                "Benchmark pod is not getting to running state"
            )

        # Update commonConfig with kafka-bootstrap server details
        # and copy the resulting file to the root of the driver pod
        driver_kafka = templating.load_yaml(constants.AMQ_DRIVER_KAFKA_YAML)
        driver_kafka[
            "commonConfig"
        ] = f"bootstrap.servers=my-cluster-kafka-bootstrap.{kafka_namespace}.svc.cluster.local:9092"
        json_file = f"{self.dir}/driver_kafka"
        templating.dump_data_to_json(driver_kafka, json_file)
        cmd = f"cp {json_file} {benchmark_pod_name}-driver:/"
        self.pod_obj.exec_oc_cmd(cmd)

        # Update the workload yaml (fall back to the default workload when
        # the caller did not supply one) and copy it to the driver pod
        if not amq_workload_yaml:
            amq_workload_yaml = templating.load_yaml(constants.AMQ_WORKLOAD_YAML)
        yaml_file = f"{self.dir}/amq_workload.yaml"
        templating.dump_data_to_temp_yaml(amq_workload_yaml, yaml_file)
        cmd = f"cp {yaml_file} {benchmark_pod_name}-driver:/"
        self.pod_obj.exec_oc_cmd(cmd)

        # Record that the benchmark has been deployed
        # (presumably consulted by cleanup code elsewhere - confirm)
        self.benchmark = True

        # Run the benchmark
        if worker:
            cmd = f"bin/benchmark --drivers /driver_kafka --workers {worker} /amq_workload.yaml"
        else:
            cmd = "bin/benchmark --drivers /driver_kafka /amq_workload.yaml"
        log.info(f"Run benchmark and running command {cmd} inside the benchmark pod ")

        # Background mode: hand the run to a single-worker executor and return
        # whatever submit() gives back without waiting for the run to finish
        if run_in_bg:
            executor = ThreadPoolExecutor(1)
            result = executor.submit(
                self.run_amq_workload,
                cmd,
                benchmark_pod_name,
                tiller_namespace,
                timeout,
            )
            return result

        # Foreground mode: execute the command in the driver pod and return
        # its output
        pod_obj = get_pod_obj(
            name=f"{benchmark_pod_name}-driver", namespace=tiller_namespace
        )
        result = pod_obj.exec_cmd_on_pod(
            command=cmd, out_yaml_format=False, timeout=timeout
        )

        return result
Ejemplo n.º 19
0
    def test_all_worker_nodes_short_network_failure(
        self, nodes, setup, node_restart_teardown
    ):
        """
        OCS-1432/OCS-1433:
        - Start DeploymentConfig based app pods
        - Make all the worker nodes unresponsive by doing abrupt network failure
        - Reboot the unresponsive node after short duration
          (self.short_nw_fail_time seconds)
        - When unresponsive node recovers, app pods and ceph cluster should recover
        - Again run IOs from app pods

        Args:
            nodes: Fixture object used to stop/start and restart the nodes
            setup: Fixture providing the list of app pod objects
            node_restart_teardown: Fixture that is not referenced in the test
                body (presumably requested for its teardown - confirm)
        """
        pod_objs = setup
        worker_nodes = node.get_worker_nodes()

        # Run IO on pods
        # NOTE(review): the objects returned by submit() are discarded, so an
        # exception raised inside run_io would not surface here - confirm this
        # is acceptable. Assumes ThreadPoolExecutor is concurrent.futures',
        # in which case leaving the with-block waits for the submitted calls.
        logger.info(f"Starting IO on {len(pod_objs)} app pods")
        with ThreadPoolExecutor() as executor:
            for pod_obj in pod_objs:
                logger.info(f"Starting IO on pod {pod_obj.name}")
                # NOTE(review): get_pvc_vol_mode is compared without being
                # called; this only works if it is a property - confirm.
                storage_type = (
                    "block" if pod_obj.pvc.get_pvc_vol_mode == "Block" else "fs"
                )
                executor.submit(
                    pod_obj.run_io,
                    storage_type=storage_type,
                    size="2G",
                    runtime=30,
                    fio_filename=f"{pod_obj.name}_io_f1",
                )

        logger.info(f"IO started on all {len(pod_objs)} app pods")

        # Wait for IO results
        for pod_obj in pod_objs:
            pod.get_fio_rw_iops(pod_obj)

        # Induce network failure on all worker nodes
        with ThreadPoolExecutor() as executor:
            for node_name in worker_nodes:
                executor.submit(node.node_network_failure, node_name, False)

        node.wait_for_nodes_status(
            node_names=worker_nodes, status=constants.NODE_NOT_READY
        )

        # Keep the nodes in the failed state for the configured duration
        logger.info(f"Waiting for {self.short_nw_fail_time} seconds")
        sleep(self.short_nw_fail_time)

        # Reboot the worker nodes
        logger.info(f"Stop and start the worker nodes: {worker_nodes}")
        nodes.restart_nodes_by_stop_and_start(node.get_node_objs(worker_nodes))

        # If the nodes or the OCS pods do not recover, the failure is not
        # fatal: the worker nodes are restarted once more and the test
        # continues with the ceph health check.
        try:
            node.wait_for_nodes_status(
                node_names=worker_nodes, status=constants.NODE_READY
            )
            logger.info("Wait for OCS pods to be in running state")
            if not pod.wait_for_pods_to_be_running(timeout=720):
                raise ResourceWrongStatusException("Pods are not in running state")
        except ResourceWrongStatusException:
            # Restart nodes
            nodes.restart_nodes(node.get_node_objs(worker_nodes))

        ceph_health_check(tries=80)

        # Get current info of app pods
        new_pod_objs = list()
        for pod_obj in pod_objs:
            pod_label = pod_obj.labels.get("deploymentconfig")
            pods_data = pod.get_pods_having_label(
                f"deploymentconfig={pod_label}", pod_obj.namespace
            )
            # Pods whose name contains "-deploy" are skipped
            current_pods = [
                pod_data.get("metadata").get("name")
                for pod_data in pods_data
                if "-deploy" not in pod_data.get("metadata").get("name")
            ]
            logger.info(f"Pods with label {pod_label}: {current_pods}")

            # Remove the older pod from the list if pod is rescheduled
            if len(current_pods) > 1:
                current_pods.remove(pod_obj.name)

            # The last remaining name is used as the current pod; it reuses
            # the PVC object of the original pod
            new_pod_obj = pod.get_pod_obj(current_pods.pop(), pod_obj.namespace)
            new_pod_obj.pvc = pod_obj.pvc
            new_pod_objs.append(new_pod_obj)

        logger.info("Wait for app pods are in running state")
        for pod_obj in new_pod_objs:
            pod_obj.ocp.wait_for_resource(
                condition=constants.STATUS_RUNNING,
                resource_name=pod_obj.name,
                timeout=720,
                sleep=20,
            )
        logger.info("All the app pods reached running state")

        # Run more IOs on app pods
        with ThreadPoolExecutor() as executor:
            for pod_obj in new_pod_objs:
                logger.info(f"Starting IO on pod {pod_obj.name}")
                # Reset the flag before run_io (presumably so the workload
                # setup is redone on the new pod - confirm)
                pod_obj.wl_setup_done = False
                storage_type = (
                    "block" if pod_obj.pvc.get_pvc_vol_mode == "Block" else "fs"
                )
                executor.submit(
                    pod_obj.run_io,
                    storage_type=storage_type,
                    size="1G",
                    runtime=30,
                    fio_filename=f"{pod_obj.name}_io_f2",
                )

        # Collect the results of the second IO round
        for pod_obj in new_pod_objs:
            pod.get_fio_rw_iops(pod_obj)
Ejemplo n.º 20
0
    def wait_for_resource(
        self,
        condition,
        resource_name="",
        column="STATUS",
        selector=None,
        resource_count=0,
        timeout=60,
        sleep=3,
        dont_allow_other_resources=False,
        error_condition=None,
    ):
        """
        Wait for a resource (or a set of resources) to reach a desired condition

        The resource(s) are sampled repeatedly via ``self.get`` until the value
        in ``column`` equals ``condition``. When ``resource_name`` (or the
        instance default ``self.resource_name``) is set, a single resource is
        checked; otherwise, when the sample is of kind ``List``, every item of
        the list is checked.

        Args:
            condition (str): The desired state the resource that is sampled
                from 'oc get <kind> <resource_name>' command
            resource_name (str): The name of the resource to wait
                for (e.g.my-pv1). Falls back to ``self.resource_name`` when
                empty.
            column (str): The name of the column to compare with
            selector (str): The resource selector to search with.
                Example: 'app=rook-ceph-mds'. Falls back to ``self.selector``
                when not given.
            resource_count (int): How many resources are expected to be in
                the desired condition. With 0 (default), all the listed
                resources have to reach the condition.
            timeout (int): Time in seconds to wait
            sleep (int): Sampling time in seconds
            dont_allow_other_resources (bool): If True it will not allow other
                resources in different state. For example you are waiting for 2
                resources and there are currently 3 (2 in running state,
                1 in ContainerCreating) the function will continue to next
                iteration to wait for only 2 resources in running state and no
                other exists.
            error_condition (str): State of the resource that is sampled
                from 'oc get <kind> <resource_name>' command, which makes this
                method to fail immediately without waiting for a timeout. This
                is optional and makes sense only when there is a well defined
                unrecoverable state of the resource(s) which is not expected to
                be part of a workflow under test, and at the same time, the
                timeout itself is large.

        Returns:
            bool: True in case the resource(s) reached the desired condition.
                False only if the sampling loop ends without reaching the
                condition and without raising.

        Raises:
            ValueError: If ``condition`` and ``error_condition`` are the same.
            TimeoutExpiredError: Re-raised when the sampling loop reports a
                timeout, after the description of the resource(s) is logged.
            ResourceWrongStatusException: If a resource is found in
                ``error_condition`` state.

        """
        if condition == error_condition:
            # when this fails, this method is used in a wrong way
            raise ValueError(
                f"Condition '{condition}' we are waiting for must be different"
                f" from error condition '{error_condition}'"
                " which describes unexpected error state.")

        # Resolve the instance defaults before logging, so that the message
        # below reports the name and selector which are actually used.
        resource_name = resource_name if resource_name else self.resource_name
        selector = selector if selector else self.selector
        log.info((f"Waiting for a resource(s) of kind {self._kind}"
                  f" identified by name '{resource_name}'"
                  f" using selector {selector}"
                  f" at column name {column}"
                  f" to reach desired condition {condition}"))

        # actual status of the resource we are waiting for, setting it to None
        # now prevents UnboundLocalError raised when waiting timeouts
        actual_status = None

        try:
            for sample in TimeoutSampler(timeout, sleep, self.get,
                                         resource_name, True, selector):

                # Only 1 resource expected to be returned
                if resource_name:
                    # a zero sleep would divide by zero, treat it as 1 second
                    retry = int(timeout / (sleep or 1))
                    status = self.get_resource(
                        resource_name,
                        column,
                        retry=retry,
                        wait=sleep,
                    )
                    if status == condition:
                        log.info(f"status of {resource_name} at {column}"
                                 " reached condition!")
                        return True
                    log.info((
                        f"status of {resource_name} at column {column} was {status},"
                        f" but we were waiting for {condition}"))
                    actual_status = status
                    if error_condition is not None and status == error_condition:
                        raise ResourceWrongStatusException(
                            resource_name,
                            column=column,
                            expected=condition,
                            got=status,
                        )
                # More than 1 resources returned
                elif sample.get("kind") == "List":
                    in_condition = []
                    in_condition_len = 0
                    actual_status = []
                    sample = sample["items"]
                    sample_len = len(sample)
                    for item in sample:
                        try:
                            item_name = item.get("metadata").get("name")
                            status = self.get_resource(item_name, column)
                            actual_status.append(status)
                            if status == condition:
                                in_condition.append(item)
                                in_condition_len = len(in_condition)
                            if (error_condition is not None
                                    and status == error_condition):
                                raise ResourceWrongStatusException(
                                    item_name,
                                    column=column,
                                    expected=condition,
                                    got=status,
                                )
                        except CommandFailed as ex:
                            # best effort: an item which can't be queried is
                            # just not counted as being in the condition
                            log.info(
                                f"Failed to get status of resource: {item_name} at column {column}, "
                                f"Error: {ex}")
                        # The checks below run after every item, so the method
                        # returns as soon as enough items are in the condition.
                        if resource_count:
                            if in_condition_len == resource_count:
                                log.info(
                                    f"{in_condition_len} resources already "
                                    f"reached condition!")
                                if (dont_allow_other_resources
                                        and sample_len != in_condition_len):
                                    log.info(
                                        f"There are {sample_len} resources in "
                                        f"total. Continue to waiting as "
                                        f"you don't allow other resources!")
                                    # NOTE: this moves on to the next item of
                                    # the current sample (inner loop), a new
                                    # sample is taken once all items are done.
                                    continue
                                return True
                        elif len(sample) == len(in_condition):
                            return True
                    # preparing logging message with expected number of
                    # resource items we are waiting for
                    if resource_count > 0:
                        exp_num_str = f"all {resource_count}"
                    else:
                        exp_num_str = "all"
                    log.info((
                        f"status of {resource_name} at column {column} - item(s) were {actual_status},"
                        f" but we were waiting"
                        f" for {exp_num_str} of them to be {condition}"))
        except TimeoutExpiredError as ex:
            log.error(f"timeout expired: {ex}")
            # run `oc describe` on the resources we were waiting for to provide
            # evidence so that we can understand what was wrong
            output = self.describe(resource_name, selector=selector)
            log.warning(
                "Description of the resource(s) we were waiting for:\n%s",
                output)
            log.error((
                f"Wait for {self._kind} resource {resource_name} at column {column}"
                f" to reach desired condition {condition} failed,"
                f" last actual status was {actual_status}"))
            # bare raise re-raises the caught timeout error as it is
            raise
        except ResourceWrongStatusException:
            # same evidence as for the timeout: describe what we waited for
            output = self.describe(resource_name, selector=selector)
            log.warning(
                "Description of the resource(s) we were waiting for:\n%s",
                output)
            log.error(
                ("Waiting for %s resource %s at column %s"
                 " to reach desired condition %s was aborted"
                 " because at least one is in unexpected %s state."),
                self._kind,
                resource_name,
                column,
                condition,
                error_condition,
            )
            raise

        return False