Code Example #1
        def finalizer():

            # Validate all nodes are schedulable
            scheduling_disabled_nodes = [
                n.name for n in get_node_objs() if n.ocp.get_resource_status(
                    n.name) == constants.NODE_READY_SCHEDULING_DISABLED
            ]
            if scheduling_disabled_nodes:
                schedule_nodes(scheduling_disabled_nodes)

            # Validate all nodes are in READY state
            not_ready_nodes = [
                n for n in get_node_objs() if n.ocp.get_resource_status(n.name)
                == constants.NODE_NOT_READY
            ]
            log.warning(
                f"Nodes in NotReady status found: {[n.name for n in not_ready_nodes]}"
            )
            if not_ready_nodes:
                nodes.restart_nodes_by_stop_and_start(not_ready_nodes)
                wait_for_nodes_status()

            log.info("All nodes are in Ready status")

            assert prometheus_health_check(), "Prometheus health is degraded"
Code Example #2
        def finalizer():
            not_ready_nodes = [
                n for n in node.get_node_objs() if n.ocp.get_resource_status(
                    n.name) == constants.NODE_NOT_READY
            ]
            logger.warning(
                f"Nodes in NotReady status found: {[n.name for n in not_ready_nodes]}"
            )
            if not_ready_nodes:
                nodes.restart_nodes(not_ready_nodes)
                node.wait_for_nodes_status()

            # Restart node if the osd stays at CLBO state
            osd_pods_obj_list = get_osd_pods()
            for pod in osd_pods_obj_list:
                if (pod.get().get("status").get("containerStatuses")[0].get(
                        "state") == constants.STATUS_CLBO):
                    node_obj = get_pod_node(pod)
                    nodes.restart_nodes([node_obj])
                    node.wait_for_nodes_status([node_obj.name])

            # Verify OSD encrypted
            if config.ENV_DATA.get("encryption_at_rest"):
                osd_encryption_verification()

            logger.info("Clear crash warnings and osd removal leftovers")
            clear_crash_warning_and_osd_removal_leftovers()
Code Example #3
    def get_node_by_attached_volume(self, volume):
        """
        Get the OCS node object of the EC2 instance that the volume is attached to

        Args:
            volume (Volume): The volume to get the EC2 instance for

        Returns:
            OCS: The OCS object of the EC2 instance

        """
        instance_ids = [
            at.get('InstanceId') for at in volume.attachments
        ]
        assert instance_ids, (
            f"EBS Volume {volume.id} is not attached to any EC2 instance"
        )
        instance_id = instance_ids[0]
        all_nodes = get_node_objs()
        nodes = [
            n for n in all_nodes if instance_id in n.get()
            .get('spec').get('providerID')
        ]
        assert nodes, (
            f"Failed to find the OCS object for EC2 instance {instance_id}"
        )
        return nodes[0]
Code Example #4
 def finalizer():
     scheduling_disabled_nodes = [
         n.name for n in get_node_objs() if n.ocp.get_resource_status(
             n.name) == constants.NODE_READY_SCHEDULING_DISABLED
     ]
     if scheduling_disabled_nodes:
         schedule_nodes(scheduling_disabled_nodes)
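The standalone finalizers above only take effect once they are registered with pytest; Code Example #20 shows the full pattern with `request.addfinalizer`. Below is a minimal sketch of that wiring. The fixture name `schedulable_nodes_teardown` and the import paths (`ocs_ci.ocs.constants`, `ocs_ci.ocs.node`) are assumptions based on the ocs-ci layout suggested by the surrounding examples:

import pytest

# Assumed imports, mirroring the modules referenced throughout these examples.
from ocs_ci.ocs import constants
from ocs_ci.ocs.node import get_node_objs, schedule_nodes


@pytest.fixture()
def schedulable_nodes_teardown(request):
    """
    Re-enable scheduling on any nodes left cordoned by the test.
    """

    def finalizer():
        # Find nodes stuck in Ready,SchedulingDisabled and uncordon them
        scheduling_disabled_nodes = [
            n.name for n in get_node_objs()
            if n.ocp.get_resource_status(n.name)
            == constants.NODE_READY_SCHEDULING_DISABLED
        ]
        if scheduling_disabled_nodes:
            schedule_nodes(scheduling_disabled_nodes)

    # Register the cleanup so it runs after the test, pass or fail
    request.addfinalizer(finalizer)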
Code Example #5
def noobaa_running_node_restart(pod_name):
    """
    Function to restart the node on which the given noobaa pod is running

    Args:
        pod_name (str): Name of noobaa pod

    """

    nb_pod_obj = pod.get_pod_obj(
        (get_pod_name_by_pattern(
            pattern=pod_name,
            namespace=constants.OPENSHIFT_STORAGE_NAMESPACE))[0],
        namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
    )
    nb_node_name = pod.get_pod_node(nb_pod_obj).name
    factory = platform_nodes.PlatformNodesFactory()
    nodes = factory.get_nodes_platform()
    nb_nodes = get_node_objs(node_names=nb_node_name)
    log.info(f"{pod_name} is running on {nb_node_name}")
    log.info(f"Restating node: {nb_node_name}....")
    nodes.restart_nodes_by_stop_and_start(nodes=nb_nodes, force=True)

    # Validate nodes are up and running
    wait_for_nodes_status()
    ceph_health_check(tries=30, delay=60)
    helpers.wait_for_resource_state(nb_pod_obj,
                                    constants.STATUS_RUNNING,
                                    timeout=180)
Code Example #6
    def test_run_couchbase_node_reboot(self, cb_setup, nodes,
                                       pod_name_of_node):
        """
        Test couchbase workload with node reboot
        """
        if pod_name_of_node == 'couchbase':
            node_list = self.cb.get_couchbase_nodes()
        elif pod_name_of_node == 'osd':
            node_list = get_osd_running_nodes()
        elif pod_name_of_node == 'master':
            node_list = get_master_nodes()

        node_1 = get_node_objs(node_list[random.randint(0,
                                                        len(node_list) - 1)])

        # Check worker node utilization (adm_top)
        get_node_resource_utilization_from_adm_top(node_type='worker',
                                                   print_table=True)
        get_node_resource_utilization_from_adm_top(node_type='master',
                                                   print_table=True)
        # Restart relevant node
        nodes.restart_nodes(node_1)
        for sample in TimeoutSampler(300, 5, self.cb.result.done):
            if sample:
                break
            else:
                logging.info(
                    "#### ....Waiting for couchbase threads to complete...")
        self.sanity_helpers.health_check()
Code Example #7
    def test_rgw_host_node_failure(
        self, nodes, node_restart_teardown, mcg_obj, bucket_factory
    ):
        """
        Test case to fail the node hosting the RGW and noobaa-db-0 pods
        and verify that a new pod spins up on a healthy node

        """
        # Get rgw pods
        rgw_pod_obj = get_rgw_pods()

        # Get noobaa pods
        noobaa_pod_obj = get_noobaa_pods()

        # Get the node where noobaa-db hosted
        for noobaa_pod in noobaa_pod_obj:
            if noobaa_pod.name == "noobaa-db-0":
                noobaa_pod_node = get_pod_node(noobaa_pod)

        for rgw_pod in rgw_pod_obj:
            pod_node = rgw_pod.get().get("spec").get("nodeName")
            if pod_node == noobaa_pod_node.name:
                # Stop the node
                log.info(
                    f"Stopping node {pod_node} where"
                    f" rgw pod {rgw_pod.name} and noobaa-db-0 hosted"
                )
                node_obj = get_node_objs(node_names=[pod_node])
                nodes.stop_nodes(node_obj)

                # Validate the old rgw pod went into Terminating state
                wait_for_resource_state(
                    resource=rgw_pod, state=constants.STATUS_TERMINATING, timeout=720
                )

                # Validate a new rgw pod spun up
                ocp_obj = OCP(
                    kind=constants.POD, namespace=defaults.ROOK_CLUSTER_NAMESPACE
                )
                ocp_obj.wait_for_resource(
                    condition=constants.STATUS_RUNNING,
                    resource_count=len(rgw_pod_obj),
                    selector=constants.RGW_APP_LABEL,
                )

                # Create OBC and read and write
                self.create_obc_creation(bucket_factory, mcg_obj, "Object-key-1")

                # Start the node
                nodes.start_nodes(node_obj)

                # Create OBC and read and write
                self.create_obc_creation(bucket_factory, mcg_obj, "Object-key-2")

        # Verify cluster health
        self.sanity_helpers.health_check()

        # Verify all storage pods are running
        wait_for_storage_pods()
Code Example #8
    def finalizer():
        """
        Make sure none of the cluster's nodes are left in the
        'Ready,SchedulingDisabled' state; if any are, mark them as schedulable again
        """
        scheduling_disabled_nodes = [
            n.name for n in get_node_objs() if n.ocp.get_resource_status(
                n.name) == constants.NODE_READY_SCHEDULING_DISABLED
        ]
        if scheduling_disabled_nodes:
            schedule_nodes(scheduling_disabled_nodes)

        # Remove label created for DC app pods on all worker nodes
        node_objs = get_node_objs()
        for node_obj in node_objs:
            if "dc" in node_obj.get().get("metadata").get("labels").keys():
                remove_label_from_worker_node([node_obj.name], label_key="dc")
Code Example #9
 def test_nodes_restart(self, nodes, pvc_factory, pod_factory, force):
     """
     Test nodes restart (from the platform layer, i.e., EC2 instances, VMware VMs)
     """
     ocp_nodes = get_node_objs()
     nodes.restart_nodes(nodes=ocp_nodes, force=force)
     self.sanity_helpers.health_check()
     self.sanity_helpers.create_resources(pvc_factory, pod_factory)
Code Example #10
        def finalizer():
            ocp_nodes = get_node_objs()
            for n in ocp_nodes:
                recover_node_to_ready_state(n)

            logger.info("Switch to the original cluster index")
            config.switch_ctx(self.orig_index)
            ceph_health_check()
Code Example #11
    def test_rolling_nodes_restart(self, nodes, pvc_factory, pod_factory):
        """
        Test restart nodes one after the other and check health status in between

        """
        ocp_nodes = get_node_objs()
        for node in ocp_nodes:
            nodes.restart_nodes(nodes=[node], wait=False)
            self.sanity_helpers.health_check(cluster_check=False, tries=60)
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
Code Example #12
def test_get_vm_status():
    """
    Test of RHV get_vm_status() method implementation
    A VM of a healthy OCS cluster has 'up' status by default.
    """
    rhv_depl = RHVIPI()
    vm = rhv_depl.rhv_util.get_rhv_vm_instance(
        get_node_objs()[0].get().get("metadata").get("name"))
    logger.info(f"vm name is: {vm.name}")
    status = rhv_depl.rhv_util.get_vm_status(vm)
    assert "up" == str(status), f"Status of {vm.name} is {status}"
Code Example #13
File: test_disk_failures.py Project: ekuric/ocs-ci
 def finalizer():
     not_ready_nodes = [
         n for n in node.get_node_objs() if n
         .ocp.get_resource_status(n.name) == constants.NODE_NOT_READY
     ]
     logger.warning(
         f"Nodes in NotReady status found: {[n.name for n in not_ready_nodes]}"
     )
     if not_ready_nodes:
         nodes.restart_nodes(not_ready_nodes)
         node.wait_for_nodes_status()
Code Example #14
def test_stop_and_start_rhv_vms():
    """
    Test of RHV stop_rhv_vms() method implementation
    VM has 'down' status after shutdown and 'up' after power on
    """
    rhv_depl = RHVIPI()
    vm = rhv_depl.rhv_util.get_rhv_vm_instance(
        get_node_objs()[0].get().get("metadata").get("name"))
    logger.info(f"vm name is: {vm.name}")
    rhv_depl.rhv_util.stop_rhv_vms([vm])
    status = rhv_depl.rhv_util.get_vm_status(vm)
    assert "down" == str(status), f"Status of {vm.name} is {status}"
Code Example #15
def check_automated_recovery_from_stopped_node(nodes):
    """
    1) Stop node.
    2) The rook ceph pods associated with the node should change to a Terminating state.
    3) The node should power on automatically, or if removed from the cluster,
       a new node should create automatically.
    4) The new osd pods with the same ids should start on the stopped node after it powers on,
       or on the new osd node.

    """
    old_wnodes = get_worker_nodes()
    log.info(f"Current worker nodes: {old_wnodes}")

    osd_node_name = random.choice(get_osd_running_nodes())
    osd_node = get_node_objs([osd_node_name])[0]

    machine_name = machine.get_machine_from_node_name(osd_node_name)
    machineset = machine.get_machineset_from_machine_name(machine_name)
    log.info(f"machineset name: {machineset}")

    old_osd_pod_ids = get_node_osd_ids(osd_node_name)
    log.info(f"osd pod ids: {old_osd_pod_ids}")

    pod_names_expected_to_terminate = get_node_pod_names_expected_to_terminate(
        osd_node_name)

    nodes.stop_nodes([osd_node], wait=True)
    log.info(f"Successfully powered off node: {osd_node_name}")

    log.info("Verify the node rook ceph pods go into a Terminating state")
    res = wait_for_pods_to_be_in_statuses([constants.STATUS_TERMINATING],
                                          pod_names_expected_to_terminate)
    assert res, "Not all the node rook ceph pods are in a Terminating state"

    try:
        log.info(f"Wait for the node: {osd_node_name} to power on")
        wait_for_nodes_status([osd_node_name])
        log.info(f"Successfully powered on node {osd_node_name}")
    except ResourceWrongStatusException as e:
        log.info(
            f"The worker node {osd_node_name} didn't start due to the exception {str(e)} "
            f"Probably it has been removed from the cluster. Waiting for a new node to come up..."
        )
        new_wnode = wait_for_new_worker_node_ipi(machineset, old_wnodes)
        osd_node_name = new_wnode.name

    assert wait_for_osd_ids_come_up_on_node(osd_node_name,
                                            old_osd_pod_ids,
                                            timeout=300)
    log.info(
        f"The osd ids {old_osd_pod_ids} successfully came up on the node {osd_node_name}"
    )
Code Example #16
def get_pod_node(pod_obj):
    """
    Get the node that the pod is running on

    Args:
        pod_obj (OCS): The pod object

    Returns:
        OCS: The node object

    """
    node_name = pod_obj.get().get('spec').get('nodeName')
    return node.get_node_objs(node_names=node_name)[0]
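As a quick illustration of how `get_pod_node` combines with a platform `nodes` object, here is a hedged sketch following the restart pattern shown in Code Examples #2 and #26. The helper name `restart_osd_hosting_node` and the import paths are assumptions based on the ocs-ci layout the surrounding examples suggest, not part of the original snippets:

# Assumed imports, mirroring the modules referenced by the surrounding examples.
from ocs_ci.ocs import constants, node, platform_nodes
from ocs_ci.ocs.resources.pod import get_osd_pods, get_pod_node


def restart_osd_hosting_node():
    """
    Restart the node hosting the first OSD pod and wait for it to become Ready.
    """
    nodes = platform_nodes.PlatformNodesFactory().get_nodes_platform()
    osd_pod = get_osd_pods()[0]
    # Resolve the node object from the pod's spec.nodeName
    node_obj = get_pod_node(osd_pod)
    nodes.restart_nodes([node_obj])
    node.wait_for_nodes_status([node_obj.name], status=constants.NODE_READY)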
Code Example #17
    def test_run_couchbase_node_reboot(self, cb_setup, nodes,
                                       pod_name_of_node):
        """
        Test couchbase workload with node reboot
        """
        # Check worker node utilization (adm_top)
        get_node_resource_utilization_from_adm_top(node_type="worker",
                                                   print_table=True)
        get_node_resource_utilization_from_adm_top(node_type="master",
                                                   print_table=True)

        if pod_name_of_node == "couchbase":
            node_list = self.cb.get_couchbase_nodes()
        elif pod_name_of_node == "osd":
            node_list = get_osd_running_nodes()
        elif pod_name_of_node == "master":
            master_node = get_nodes(pod_name_of_node, num_of_nodes=1)

        # Restart relevant node
        if pod_name_of_node == "master":
            nodes.restart_nodes(master_node, wait=False)
            waiting_time = 40
            log.info(f"Waiting {waiting_time} seconds...")
            time.sleep(waiting_time)
        else:
            restart_node = get_node_objs(node_list[random.randint(
                0,
                len(node_list) - 1)])
            nodes.restart_nodes(restart_node)

        # Validate all nodes and services are in READY state and up.
        # Note: retry() must wrap the callable itself; calling the function
        # inside the parentheses would execute it once, outside the retry loop.
        retry(
            (CommandFailed, TimeoutError, AssertionError,
             ResourceWrongStatusException),
            tries=60,
            delay=15,
        )(ocp.wait_for_cluster_connectivity)(tries=400)
        retry(
            (CommandFailed, TimeoutError, AssertionError,
             ResourceWrongStatusException),
            tries=60,
            delay=15,
        )(wait_for_nodes_status)(timeout=1800)
        bg_handler = flowtest.BackgroundOps()
        bg_ops = [self.cb.result]
        retry(CommandFailed, tries=60, delay=15)(
            bg_handler.wait_for_bg_operations
        )(bg_ops, timeout=3600)
        self.sanity_helpers.health_check(tries=40)
Code Example #18
        def finalizer():

            # Validate all nodes are in READY state
            not_ready_nodes = [
                n for n in get_node_objs() if n.ocp.get_resource_status(n.name)
                == constants.NODE_NOT_READY
            ]
            log.warning(
                f"Nodes in NotReady status found: {[n.name for n in not_ready_nodes]}"
            )
            if not_ready_nodes:
                nodes.restart_nodes_by_stop_and_start(not_ready_nodes)
                wait_for_nodes_status()

            log.info("All nodes are in Ready status")
Code Example #19
def test_p_stop_and_start():
    """
    Test of RHV stop_nodes & start Nodes method implementation
    """
    rhv_plfrm = RHVNodes()
    nodes = get_node_objs()
    logger.info(f"nodes are: {nodes}")
    node = [nodes[4]]
    rhv_plfrm.stop_nodes(node)
    vm_name = node[0].get().get("metadata").get("name")
    vm_obj = rhv_plfrm.rhv.get_rhv_vm_instance(vm_name)
    status = rhv_plfrm.rhv.get_vm_status(vm_obj)
    assert "down" == str(status), f"Status of {vm_name} is {status}"
    status = rhv_plfrm.rhv.get_vm_status(vm_obj)
    logger.info(f"Status of {vm_name} is {status}")
Code Example #20
def ec2_instances(request, aws_obj):
    """
    Get cluster instances

    Returns:
        dict: The ID keys and the name values of the instances

    """
    # Get all cluster nodes objects
    nodes = node.get_node_objs()

    # Get the cluster nodes ec2 instances
    ec2_instances = aws.get_instances_ids_and_names(nodes)
    assert ec2_instances, f"Failed to get ec2 instances for node {[n.name for n in nodes]}"

    def finalizer():
        """
        Make sure all instances are running
        """
        # Getting the instances that are in status 'stopping' (if there are any), to wait for them to
        # get to status 'stopped' so it will be possible to start them
        stopping_instances = {
            key: val
            for key, val in ec2_instances.items()
            if (aws_obj.get_instances_status_by_id(key) ==
                constants.INSTANCE_STOPPING)
        }

        # Waiting for the instances that are in status 'stopping'
        # (if there are any) to reach 'stopped'
        if stopping_instances:
            for stopping_instance in stopping_instances:
                # Iterating over the dict yields the instance IDs (the keys)
                instance = aws_obj.get_ec2_instance(stopping_instance)
                instance.wait_until_stopped()
        stopped_instances = {
            key: val
            for key, val in ec2_instances.items()
            if (aws_obj.get_instances_status_by_id(key) ==
                constants.INSTANCE_STOPPED)
        }

        # Start the instances
        if stopped_instances:
            aws_obj.start_ec2_instances(instances=stopped_instances, wait=True)

    request.addfinalizer(finalizer)

    return ec2_instances
Code Example #21
    def test_run_pgsql_reboot_node(self, pgsql, nodes, transactions, pod_name):
        """
        Test pgsql workload
        """
        # Create pgbench benchmark
        pgsql.create_pgbench_benchmark(replicas=3,
                                       transactions=transactions,
                                       clients=3)

        # Start measuring time
        start_time = datetime.now()

        # Wait for pgbench pod to reach running state
        pgsql.wait_for_pgbench_status(status=constants.STATUS_RUNNING)

        # Choose a node based on pod it contains
        if pod_name == 'postgres':
            node_list = pgsql.get_pgsql_nodes()
        elif pod_name == 'osd':
            node_list = get_osd_running_nodes()
        node_1 = get_node_objs(node_list[random.randint(0,
                                                        len(node_list) - 1)])

        # Check worker node utilization (adm_top)
        get_node_resource_utilization_from_adm_top(node_type='worker',
                                                   print_table=True)

        # Restart relevant node
        nodes.restart_nodes(node_1)

        # Wait for pg_bench pod to complete
        pgsql.wait_for_pgbench_status(status=constants.STATUS_COMPLETED)

        # Calculate the time from running state to completed state
        end_time = datetime.now()
        diff_time = end_time - start_time
        log.info(
            f"\npgbench pod reached completed state after {diff_time.seconds} seconds\n"
        )

        # Get pgbench pods
        pgbench_pods = pgsql.get_pgbench_pods()

        # Validate pgbench run and parse logs
        pgsql.validate_pgbench_run(pgbench_pods)

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()
Code Example #22
    def test_run_pgsql_reboot_node(self, pgsql, nodes, transactions, pod_name):
        """
        Test pgsql workload
        """
        # Create pgbench benchmark
        pgsql.create_pgbench_benchmark(replicas=1, transactions=transactions)

        # Start measuring time
        start_time = datetime.now()

        # Wait for pgbench pod to reach running state
        pgsql.wait_for_pgbench_status(status=constants.STATUS_RUNNING)

        # Select a node where pgbench is not running and reboot
        osd_nodes_list = get_osd_running_nodes()
        node_list = pgsql.filter_pgbench_nodes_from_nodeslist(osd_nodes_list)

        node_1 = get_node_objs(node_list[random.randint(0,
                                                        len(node_list) - 1)])
        log.info(f"Selected node {node_1} for reboot operation")

        # Check worker node utilization (adm_top)
        get_node_resource_utilization_from_adm_top(node_type="worker",
                                                   print_table=True)

        # Restart relevant node
        nodes.restart_nodes(node_1)

        # Wait for pg_bench pod to complete
        pgsql.wait_for_pgbench_status(status=constants.STATUS_COMPLETED)

        # Calculate the time from running state to completed state
        end_time = datetime.now()
        diff_time = end_time - start_time
        log.info(
            f"\npgbench pod reached completed state after {diff_time.seconds} seconds\n"
        )

        # Get pgbench pods
        pgbench_pods = pgsql.get_pgbench_pods()

        # Validate pgbench run and parse logs
        pgsql.validate_pgbench_run(pgbench_pods)

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check(tries=40)
Code Example #23
    def test_run_jenkins_node_reboot(self, jenkins, nodes, node_type,
                                     num_projects, num_of_builds):
        """

        Test Node Reboot jenkins
        """
        # Init number of projects
        jenkins.number_projects = num_projects

        # Create app jenkins
        jenkins.create_app_jenkins()

        # Create jenkins pvc
        jenkins.create_jenkins_pvc()

        # Create jenkins build config
        jenkins.create_jenkins_build_config()

        # Wait for the jenkins deploy pod to reach Completed state
        jenkins.wait_for_jenkins_deploy_status(status=STATUS_COMPLETED)

        # Get relevant node
        nodes_reboot = jenkins.get_node_name_where_jenkins_pod_not_hosted(
            node_type=node_type, num_of_nodes=1)

        # Init number of builds per project
        jenkins.number_builds_per_project = num_of_builds

        # Start Builds
        jenkins.start_build()

        if len(nodes_reboot) > 0:
            # Restart Node
            nodes.restart_nodes(get_node_objs(nodes_reboot))
        else:
            log.info('No node was rebooted')

        # Wait for builds to reach 'Complete' state
        jenkins.wait_for_build_to_complete()

        # Print table of builds
        jenkins.print_completed_builds_results()

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()
Code Example #24
    def test_osd_node_restart_and_check_osd_pods_status(self, nodes):
        """
        1) Restart one of the osd nodes.
        2) Check that the osd pods associated with the node should change to a Terminating state.
        3) Wait for the node to reach Ready state.
        4) Check that the new osd pods with the same ids start on the same node.
        5) Check the worker nodes security groups.
        """
        # This is a workaround due to the issue https://github.com/red-hat-storage/ocs-ci/issues/6162
        if is_ms_consumer_cluster():
            logger.info(
                "The test is applicable only for an MS provider cluster. "
                "Switching to the provider cluster...")
            config.switch_to_provider()

        self.create_resources()

        osd_node_name = random.choice(get_osd_running_nodes())
        osd_node = get_node_objs([osd_node_name])[0]

        old_osd_pod_ids = get_node_osd_ids(osd_node_name)
        logger.info(f"osd pod ids: {old_osd_pod_ids}")
        node_osd_pods = pod.get_osd_pods_having_ids(old_osd_pod_ids)
        node_osd_pod_names = [p.name for p in node_osd_pods]

        logger.info(f"Going to restart the node {osd_node_name}")
        nodes.restart_nodes(nodes=[osd_node], wait=False)

        logger.info("Verify the node osd pods go into a Terminating state")
        res = pod.wait_for_pods_to_be_in_statuses(
            [constants.STATUS_TERMINATING], node_osd_pod_names)
        assert res, "Not all the node osd pods are in a Terminating state"

        wait_for_nodes_status(node_names=[osd_node_name])
        assert wait_for_osd_ids_come_up_on_node(osd_node_name,
                                                old_osd_pod_ids,
                                                timeout=300)
        logger.info(
            f"The osd ids {old_osd_pod_ids} successfully came up on the node {osd_node_name}"
        )

        logger.info(
            "Verify the worker nodes security groups on the provider...")
        assert verify_worker_nodes_security_groups()
Code Example #25
File: platform_nodes.py Project: wangjun1974/ocs-ci
    def restart_nodes_teardown(self):
        """
        Make sure all EC2 instances are up. To be used in the test teardown

        """
        # Get all cluster nodes objects
        ocp_nodes = get_node_objs()

        # Get the cluster nodes ec2 instances
        ec2_instances = self.get_ec2_instances(ocp_nodes)
        assert ec2_instances, (
            f"Failed to get ec2 instances for node {[n.name for n in ocp_nodes]}"
        )

        logger.info(
            "Getting the instances that are in status 'stopping' (if there are any), "
            "and wait for them to get to status 'stopped', "
            "so it will be possible to start them")
        stopping_instances = {
            key: val
            for key, val in ec2_instances.items()
            if self.aws.get_instances_status_by_id(key) ==
            constants.INSTANCE_STOPPING
        }

        logger.info("Waiting fot the instances that are in status 'stopping' "
                    "(if there are any) to reach 'stopped'")
        if stopping_instances:
            for stopping_instance in stopping_instances:
                # Iterating over the dict yields the instance IDs (the keys)
                instance = self.aws.get_ec2_instance(stopping_instance)
                instance.wait_until_stopped()
        stopped_instances = {
            key: val
            for key, val in ec2_instances.items()
            if self.aws.get_instances_status_by_id(key) ==
            constants.INSTANCE_STOPPED
        }

        # Start the instances
        if stopped_instances:
            self.aws.start_ec2_instances(instances=stopped_instances,
                                         wait=True)
Code Example #26
File: test_disk_failures.py Project: pkesavap/ocs-ci
        def finalizer():
            not_ready_nodes = [
                n for n in node.get_node_objs() if n.ocp.get_resource_status(
                    n.name) == constants.NODE_NOT_READY
            ]
            logger.warning(
                f"Nodes in NotReady status found: {[n.name for n in not_ready_nodes]}"
            )
            if not_ready_nodes:
                nodes.restart_nodes(not_ready_nodes)
                node.wait_for_nodes_status()

            # Restart node if the osd stays at CLBO state
            osd_pods_obj_list = get_osd_pods()
            for pod in osd_pods_obj_list:
                if (pod.get().get("status").get("containerStatuses")[0].get(
                        "state") == constants.STATUS_CLBO):
                    node_obj = get_pod_node(pod)
                    nodes.restart_nodes([node_obj])
                    node.wait_for_nodes_status([node_obj.name])
Code Example #27
def cycle_nodes(cluster_path, action):
    """
    Start/Stop AWS nodes to save costs when not in use.

    Args:
        cluster_path(str): location of cluster path that has auth files
        action (str): action to perform either start or stop

    """
    node_obj_file = os.path.join(cluster_path, NODE_OBJ_FILE)
    nodes_file = os.path.join(cluster_path, NODE_FILE)
    instance_file = os.path.join(cluster_path, INSTANCE_FILE)
    if action == 'stop':
        ceph = CephCluster()
        ceph.set_noout()
        node_objs = get_node_objs()
        kls = platform_nodes.PlatformNodesFactory()
        nodes = kls.get_nodes_platform()
        with open(instance_file, "wb") as instance_file:
            log.info("Storing ocs instances objects")
            pickle.dump(nodes.get_ec2_instances(nodes=node_objs),
                        instance_file)
        with open(nodes_file, "wb") as node_file:
            log.info("Storing ocp nodes objects")
            pickle.dump(nodes, node_file)
        with open(node_obj_file, "wb") as node_obj_file:
            log.info("Stopping all nodes")
            pickle.dump(node_objs, node_obj_file)
            nodes.stop_nodes(nodes=node_objs)
    elif action == 'start':
        with open(instance_file, "rb") as instance_file:
            log.info("Reading instance objects")
            instances = pickle.load(instance_file)
        with open(nodes_file, "rb") as node_file:
            log.info("Reading ocp nodes object")
            nodes = pickle.load(node_file)
        with open(node_obj_file, "rb") as node_obj_file:
            log.info("Starting ocs nodes")
            node_objs = pickle.load(node_obj_file)
            nodes.start_nodes(instances=instances, nodes=node_objs)
            unset_noout()
Code Example #28
        def finalizer():
            # Start the powered off nodes
            nodes.restart_nodes_teardown()
            try:
                node.wait_for_nodes_status(status=constants.NODE_READY)
            except ResourceWrongStatusException:
                # Restart the nodes if in NotReady state
                not_ready_nodes = [
                    n for n in node.get_node_objs() if n
                    .ocp.get_resource_status(n.name) == constants.NODE_NOT_READY
                ]
                if not_ready_nodes:
                    logger.info(
                        f"Nodes in NotReady status found: {[n.name for n in not_ready_nodes]}"
                    )
                    nodes.restart_nodes(not_ready_nodes)
                    node.wait_for_nodes_status(status=constants.NODE_READY)

            # Check ceph health
            assert ceph_health_check(), f"Ceph cluster health is not OK"
            logger.info("Ceph cluster health is OK")
Code Example #29
def check_automated_recovery_from_terminated_node(nodes):
    """
    1) Terminate node.
    2) The rook ceph pods associated with the node should change to a Terminating state.
    3) A new node should be created automatically
    4) The new osd pods with the same ids of the terminated node should start on the new osd node.

    """
    old_wnodes = get_worker_nodes()
    log.info(f"Current worker nodes: {old_wnodes}")

    osd_node_name = random.choice(get_osd_running_nodes())
    osd_node = get_node_objs([osd_node_name])[0]

    machine_name = machine.get_machine_from_node_name(osd_node_name)
    machineset = machine.get_machineset_from_machine_name(machine_name)
    log.info(f"machineset name: {machineset}")

    old_osd_pod_ids = get_node_osd_ids(osd_node_name)
    log.info(f"osd pod ids: {old_osd_pod_ids}")

    pod_names_expected_to_terminate = get_node_pod_names_expected_to_terminate(
        osd_node.name)

    nodes.terminate_nodes([osd_node], wait=True)
    log.info(f"Successfully terminated the node: {osd_node_name}")

    log.info("Verify the node rook ceph pods go into a Terminating state")
    res = wait_for_pods_to_be_in_statuses([constants.STATUS_TERMINATING],
                                          pod_names_expected_to_terminate)
    assert res, "Not all the node rook ceph pods are in a Terminating state"

    new_wnode = wait_for_new_worker_node_ipi(machineset, old_wnodes)

    wait_for_osd_ids_come_up_on_node(new_wnode.name,
                                     old_osd_pod_ids,
                                     timeout=300)
    log.info(
        f"The osd ids {old_osd_pod_ids} successfully came up on the node {new_wnode.name}"
    )
Code Example #30
    def test_automated_recovery_from_failed_nodes_IPI_reactive(
            self, nodes, pvc_factory, pod_factory, failure, dc_pod_factory,
            interface):
        """
        Knip-678 Automated recovery from failed nodes
        Reactive case - IPI
        """
        # Get OSD running nodes
        osd_running_nodes = get_osd_running_nodes()
        log.info(f"OSDs are running on nodes {osd_running_nodes}")
        # Label osd nodes with fedora app
        label_worker_node(osd_running_nodes,
                          label_key='dc',
                          label_value='fedora')

        # Create DC app pods
        log.info("Creating DC based app pods")
        if interface == 'rbd':
            interface = constants.CEPHBLOCKPOOL
        elif interface == 'cephfs':
            interface = constants.CEPHFILESYSTEM
        dc_pod_obj = []
        for i in range(2):
            dc_pod = dc_pod_factory(interface=interface,
                                    node_selector={'dc': 'fedora'})
            self.threads.append(pod.run_io_in_bg(dc_pod, fedora_dc=True))
            dc_pod_obj.append(dc_pod)

        # Get app pods running nodes
        dc_pod_node_name = get_app_pod_running_nodes(dc_pod_obj)
        log.info(f"DC app pod running nodes are {dc_pod_node_name}")

        # Get both osd and app pod running node
        common_nodes = get_both_osd_and_app_pod_running_node(
            osd_running_nodes, dc_pod_node_name)
        log.info(f"Both OSD and app pod is running on nodes {common_nodes}")

        # Get the machine name using the node name
        machine_name = machine.get_machine_from_node_name(common_nodes[0])
        log.info(f"{common_nodes[0]} associated machine is {machine_name}")

        # Get the machineset name using machine name
        machineset_name = machine.get_machineset_from_machine_name(
            machine_name)
        log.info(
            f"{common_nodes[0]} associated machineset is {machineset_name}")

        # Add a new node and label it
        add_new_node_and_label_it(machineset_name)
        # Get the failure node obj
        failure_node_obj = get_node_objs(node_names=[common_nodes[0]])

        # Induce failure on the selected failure node
        log.info(f"Inducing failure on node {failure_node_obj[0].name}")
        if failure == "shutdown":
            nodes.stop_nodes(failure_node_obj, wait=True)
            log.info(f"Successfully powered off node: "
                     f"{failure_node_obj[0].name}")
        elif failure == "terminate":
            nodes.terminate_nodes(failure_node_obj, wait=True)
            log.info(f"Successfully terminated node : "
                     f"{failure_node_obj[0].name} instance")

        try:
            # DC app pods on the failed node will get automatically created on other
            # running node. Waiting for all dc app pod to reach running state
            pod.wait_for_dc_app_pods_to_reach_running_state(dc_pod_obj,
                                                            timeout=720)
            log.info("All the dc pods reached running state")
            pod.wait_for_storage_pods()

        except ResourceWrongStatusException:
            if failure == "shutdown":
                nodes.terminate_nodes(failure_node_obj, wait=True)
                log.info(f"Successfully terminated node : "
                         f"{failure_node_obj[0].name} instance")
            raise

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()