Example #1
    def label_and_taint_nodes(self):
        """
        Label and taint worker nodes to be used by OCS operator
        """

        nodes = ocp.OCP(kind='node').get().get('items', [])
        worker_nodes = [
            node for node in nodes
            if "node-role.kubernetes.io/worker" in node['metadata']['labels']
        ]
        if not worker_nodes:
            raise UnavailableResourceException("No worker node found!")
        az_worker_nodes = {}
        for node in worker_nodes:
            az = node['metadata']['labels'].get(
                'failure-domain.beta.kubernetes.io/zone')
            az_node_list = az_worker_nodes.get(az, [])
            az_node_list.append(node)
            az_worker_nodes[az] = az_node_list
        logger.info(f"Found worker nodes in AZ: {az_worker_nodes}")
        distributed_worker_nodes = []
        while az_worker_nodes:
            for az in list(az_worker_nodes.keys()):
                az_node_list = az_worker_nodes.get(az)
                if az_node_list:
                    node_name = az_node_list.pop(0)['metadata']['name']
                    distributed_worker_nodes.append(node_name)
                else:
                    del az_worker_nodes[az]
        logger.info(
            f"Distributed worker nodes for AZ: {distributed_worker_nodes}")
        to_label = config.DEPLOYMENT.get('ocs_operator_nodes_to_label', 3)
        to_taint = config.DEPLOYMENT.get('ocs_operator_nodes_to_taint', 0)
        worker_count = len(worker_nodes)
        if worker_count < to_label or worker_count < to_taint:
            logger.info(f"All nodes: {nodes}")
            logger.info(f"Worker nodes: {worker_nodes}")
            raise UnavailableResourceException(
                f"Not enough worker nodes: {worker_count} to label: "
                f"{to_label} or taint: {to_taint}!")

        workers_to_label = " ".join(distributed_worker_nodes[:to_label])
        if workers_to_label:
            _ocp = ocp.OCP(kind='node')
            logger.info(f"Label nodes: {workers_to_label} with label: "
                        f"{constants.OPERATOR_NODE_LABEL}")
            label_cmd = (
                f"label nodes {workers_to_label} {constants.OPERATOR_NODE_LABEL}"
            )
            _ocp.exec_oc_cmd(command=label_cmd)

        workers_to_taint = " ".join(distributed_worker_nodes[:to_taint])
        if workers_to_taint:
            logger.info(f"Taint nodes: {workers_to_taint} with taint: "
                        f"{constants.OPERATOR_NODE_TAINT}")
            taint_cmd = (
                f"adm taint nodes {workers_to_taint} {constants.OPERATOR_NODE_TAINT}"
            )
            _ocp.exec_oc_cmd(command=taint_cmd)
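
The interesting part of this example is the round-robin distribution of worker nodes across availability zones, so that the first to_label picks are spread over as many zones as possible. Below is a minimal standalone sketch of that loop on made-up zone and node names (nothing here comes from a real cluster):

# Round-robin distribution sketch (illustrative data only)
az_worker_nodes = {
    "us-east-1a": ["worker-a1", "worker-a2"],
    "us-east-1b": ["worker-b1"],
    "us-east-1c": ["worker-c1", "worker-c2"],
}

distributed = []
while az_worker_nodes:
    for az in list(az_worker_nodes.keys()):
        nodes_in_az = az_worker_nodes[az]
        if nodes_in_az:
            # take one node per zone per pass, so early picks span many zones
            distributed.append(nodes_in_az.pop(0))
        else:
            del az_worker_nodes[az]

print(distributed)
# ['worker-a1', 'worker-b1', 'worker-c1', 'worker-a2', 'worker-c2']
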
Example #2
    def label_and_taint_nodes(self):
        """
        Label and taint worker nodes to be used by OCS operator
        """

        nodes = ocp.OCP(kind='node').get().get('items', [])
        worker_nodes = [
            node for node in nodes if "node-role.kubernetes.io/worker"
            in node['metadata']['labels']
        ]
        if not worker_nodes:
            raise UnavailableResourceException("No worker node found!")
        to_label = config.DEPLOYMENT.get('ocs_operator_nodes_to_label', 3)
        to_taint = config.DEPLOYMENT.get('ocs_operator_nodes_to_taint', 0)
        worker_count = len(worker_nodes)
        if worker_count < to_label or worker_count < to_taint:
            logger.info(f"All nodes: {nodes}")
            logger.info(f"Worker nodes: {worker_nodes}")
            raise UnavailableResourceException(
                f"Not enough worker nodes: {worker_count} to label: "
                f"{to_label} or taint: {to_taint}!"
            )

        workers_to_label = " ".join(
            [node['metadata']['name'] for node in worker_nodes[:to_label]]
        )
        if workers_to_label:
            _ocp = ocp.OCP(kind='node')
            logger.info(
                f"Label nodes: {workers_to_label} with label: "
                f"{constants.OPERATOR_NODE_LABEL}"
            )
            label_cmd = (
                f"label nodes {workers_to_label} {constants.OPERATOR_NODE_LABEL}"
            )
            _ocp.exec_oc_cmd(command=label_cmd)

        workers_to_taint = " ".join(
            [node['metadata']['name'] for node in worker_nodes[:to_taint]]
        )
        if workers_to_taint:
            logger.info(
                f"Taint nodes: {workers_to_taint} with taint: "
                f"{constants.OPERATOR_NODE_TAINT}"
            )
            taint_cmd = (
                f"adm taint nodes {workers_to_taint} {constants.OPERATOR_NODE_TAINT}"
            )
            _ocp.exec_oc_cmd(command=taint_cmd)
Example #3
    def get_arbiter_location(self):
        """
        Get arbiter mon location for storage cluster
        """
        if config.DEPLOYMENT.get(
                "arbiter_deployment"
        ) and not config.DEPLOYMENT.get("arbiter_autodetect"):
            return config.DEPLOYMENT.get("arbiter_zone")

        # below logic will autodetect arbiter_zone
        nodes = ocp.OCP(kind="node").get().get("items", [])

        worker_nodes_zones = {
            node["metadata"]["labels"].get(constants.ZONE_LABEL)
            for node in nodes
            if constants.WORKER_LABEL in node["metadata"]["labels"]
            and str(constants.OPERATOR_NODE_LABEL)[:-3]
            in node["metadata"]["labels"]
        }

        master_nodes_zones = {
            node["metadata"]["labels"].get(constants.ZONE_LABEL)
            for node in nodes
            if constants.MASTER_LABEL in node["metadata"]["labels"]
        }

        arbiter_locations = list(master_nodes_zones - worker_nodes_zones)

        if len(arbiter_locations) < 1:
            raise UnavailableResourceException(
                "At least 1 master node zone, different from the storage node "
                "zones, is required to host the arbiter mon"
            )

        return arbiter_locations[0]
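
The zone detection above boils down to set arithmetic: the arbiter mon should land in a zone that hosts master nodes but no storage-labeled worker nodes. A tiny sketch with hypothetical zone names:

# Arbiter zone selection sketch (hypothetical zones)
worker_nodes_zones = {"zone-a", "zone-b"}             # zones with storage workers
master_nodes_zones = {"zone-a", "zone-b", "zone-c"}   # zones with master nodes

arbiter_locations = list(master_nodes_zones - worker_nodes_zones)
assert arbiter_locations == ["zone-c"]
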
Example #4
def get_rate_based_on_cls_iops(custom_iops_dict=None, osd_size=2048):
    """
    Function to check ceph cluster iops and suggest rate param for fio.

    Args:
        osd_size (int): Size of the OSD in GB
        custom_iops_dict (dict): Dictionary of rate params to be used during the IO run.
            Example: iops_dict = {'usage_below_40%': '16k', 'usage_40%_60%': '8k',
            'usage_60%_80%': '4k', 'usage_80%_95%': '2K'}
            Warning: make sure the dict keys are the same as in the example above.

    Returns:
        rate_param (str): Rate param for fio based on ceph cluster IOPs

    """
    # Check for IOPs limit percentage of cluster and accordingly suggest fio rate param
    cls_obj = cluster.CephCluster()
    iops = cls_obj.get_iops_percentage(osd_size=osd_size)
    logger.info(f"Printing iops from cluster {iops}")
    if custom_iops_dict:
        iops_dict = custom_iops_dict
    else:
        iops_dict = {
            "usage_below_40%": "8k",
            "usage_40%_60%": "8k",
            "usage_60%_80%": "4k",
            "usage_80%_95%": "2K",
        }
    if (iops * 100) <= 40:
        rate_param = iops_dict["usage_below_40%"]
    elif 40 < (iops * 100) <= 60:
        rate_param = iops_dict["usage_40%_60%"]
    elif 60 < (iops * 100) <= 80:
        rate_param = iops_dict["usage_60%_80%"]
    elif 80 < (iops * 100) <= 95:
        rate_param = iops_dict["usage_80%_95%"]
    else:
        logger.warning(
            f"Cluster iops utilization is {iops * 100} percent, "
            "which is more than 95 percent")
        raise UnavailableResourceException(
            "Overall Cluster utilization is more than 95%")
    return rate_param
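
The function above only maps a cluster IOPS utilization fraction onto one of four fio rate buckets. A standalone sketch of that bucketing, using a made-up utilization value instead of CephCluster.get_iops_percentage():

# Bucketing sketch; iops is assumed to be 0.55 (55% utilization) for illustration
iops = 0.55
usage = iops * 100
iops_dict = {
    "usage_below_40%": "8k",
    "usage_40%_60%": "8k",
    "usage_60%_80%": "4k",
    "usage_80%_95%": "2K",
}
if usage <= 40:
    rate_param = iops_dict["usage_below_40%"]
elif usage <= 60:
    rate_param = iops_dict["usage_40%_60%"]
elif usage <= 80:
    rate_param = iops_dict["usage_60%_80%"]
elif usage <= 95:
    rate_param = iops_dict["usage_80%_95%"]
else:
    raise RuntimeError("Overall cluster utilization is more than 95%")
print(rate_param)  # -> "8k"
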
Example #5
def check_and_add_enough_worker(worker_count):
    """
    Function to check if there are enough workers available to scale pods.
    If there are not enough workers, more will be added on the supported platforms.
    The function also adds the scale label to the respective worker nodes.

    Args:
        worker_count (int): Expected worker count to be present in the setup

    Returns:
        bool: True if there is enough worker count, else an exception is raised.

    """
    # Check whether OCS workers should be used for scaling app pods and,
    # depending on that, label the appropriate workers with the scale label
    worker_list = node.get_worker_nodes()
    ocs_worker_list = machine.get_labeled_nodes(constants.OPERATOR_NODE_LABEL)
    scale_worker = machine.get_labeled_nodes(constants.SCALE_LABEL)
    if config.RUN.get("use_ocs_worker_for_scale"):
        if not scale_worker:
            helpers.label_worker_node(node_list=worker_list,
                                      label_key="scale-label",
                                      label_value="app-scale")
    else:
        if not scale_worker:
            for node_item in ocs_worker_list:
                worker_list.remove(node_item)
            if worker_list:
                helpers.label_worker_node(
                    node_list=worker_list,
                    label_key="scale-label",
                    label_value="app-scale",
                )
    scale_worker_list = machine.get_labeled_nodes(constants.SCALE_LABEL)
    logging.info(f"Print existing scale worker {scale_worker_list}")

    # Check if there is enough nodes to continue scaling of app pods
    if len(scale_worker_list) >= worker_count:
        logging.info(f"Setup has expected worker count {worker_count} "
                     "to continue scale of pods")
        return True
    else:
        logging.info(
            "There are not enough workers in the setup; workers will be added "
            "for the automation-supported platforms")
        # Add enough worker for AWS
        if (config.ENV_DATA["deployment_type"] == "ipi"
                and config.ENV_DATA["platform"].lower() == "aws"):
            # Create machineset for app worker nodes on each aws zone
            # Each zone will have one app worker node
            ms_name = list()
            labels = [("node-role.kubernetes.io/app", "app-scale")]
            for obj in machine.get_machineset_objs():
                if "app" in obj.name:
                    ms_name.append(obj.name)
            if not ms_name:
                if len(machine.get_machineset_objs()) == 3:
                    for zone in ["a", "b", "c"]:
                        ms_name.append(
                            machine.create_custom_machineset(
                                instance_type="m5.4xlarge",
                                labels=labels,
                                zone=zone,
                            ))
                else:
                    ms_name.append(
                        machine.create_custom_machineset(
                            instance_type="m5.4xlarge",
                            labels=labels,
                            zone="a",
                        ))
                for ms in ms_name:
                    machine.wait_for_new_node_to_be_ready(ms)
            if len(ms_name) == 3:
                exp_count = int(worker_count / 3)
            else:
                exp_count = worker_count
            for name in ms_name:
                machine.add_node(machine_set=name, count=exp_count)
            for ms in ms_name:
                machine.wait_for_new_node_to_be_ready(ms)
            worker_list = node.get_worker_nodes()
            ocs_worker_list = machine.get_labeled_nodes(
                constants.OPERATOR_NODE_LABEL)
            scale_label_worker = machine.get_labeled_nodes(
                constants.SCALE_LABEL)
            ocs_worker_list.extend(scale_label_worker)
            final_list = list(dict.fromkeys(ocs_worker_list))
            for node_item in final_list:
                if node_item in worker_list:
                    worker_list.remove(node_item)
            if worker_list:
                helpers.label_worker_node(
                    node_list=worker_list,
                    label_key="scale-label",
                    label_value="app-scale",
                )
            return True
        elif (config.ENV_DATA["deployment_type"] == "upi"
              and config.ENV_DATA["platform"].lower() == "vsphere"):
            raise UnsupportedPlatformError(
                "Unsupported Platform to add worker")
        elif (config.ENV_DATA["deployment_type"] == "upi"
              and config.ENV_DATA["platform"].lower() == "baremetal"):
            raise UnsupportedPlatformError(
                "Unsupported Platform to add worker")
        elif (config.ENV_DATA["deployment_type"] == "upi"
              and config.ENV_DATA["platform"].lower() == "azure"):
            raise UnsupportedPlatformError(
                "Unsupported Platform to add worker")
        else:
            raise UnavailableResourceException(
                "There is no enough worker nodes to continue app pod scaling")
Example #6
    def label_and_taint_nodes(self):
        """
        Label and taint worker nodes to be used by OCS operator
        """

        nodes = ocp.OCP(kind="node").get().get("items", [])
        worker_nodes = [
            node for node in nodes
            if "node-role.kubernetes.io/worker" in node["metadata"]["labels"]
        ]
        if not worker_nodes:
            raise UnavailableResourceException("No worker node found!")
        az_worker_nodes = {}
        for node in worker_nodes:
            az = node["metadata"]["labels"].get(
                "failure-domain.beta.kubernetes.io/zone")
            az_node_list = az_worker_nodes.get(az, [])
            az_node_list.append(node)
            az_worker_nodes[az] = az_node_list
        logger.debug(f"Found the worker nodes in AZ: {az_worker_nodes}")
        distributed_worker_nodes = []
        while az_worker_nodes:
            for az in list(az_worker_nodes.keys()):
                az_node_list = az_worker_nodes.get(az)
                if az_node_list:
                    node_name = az_node_list.pop(0)["metadata"]["name"]
                    distributed_worker_nodes.append(node_name)
                else:
                    del az_worker_nodes[az]
        logger.info(
            f"Distributed worker nodes for AZ: {distributed_worker_nodes}")
        to_label = config.DEPLOYMENT.get("ocs_operator_nodes_to_label", 3)
        to_taint = config.DEPLOYMENT.get("ocs_operator_nodes_to_taint", 0)
        worker_count = len(worker_nodes)
        if worker_count < to_label or worker_count < to_taint:
            logger.info(f"All nodes: {nodes}")
            logger.info(f"Worker nodes: {worker_nodes}")
            raise UnavailableResourceException(
                f"Not enough worker nodes: {worker_count} to label: "
                f"{to_label} or taint: {to_taint}!")

        _ocp = ocp.OCP(kind="node")
        workers_to_label = " ".join(distributed_worker_nodes[:to_label])
        if workers_to_label:
            logger.info(f"Label nodes: {workers_to_label} with label: "
                        f"{constants.OPERATOR_NODE_LABEL}")
            label_cmds = [(f"label nodes {workers_to_label} "
                           f"{constants.OPERATOR_NODE_LABEL} --overwrite")]
            if config.DEPLOYMENT.get(
                    "infra_nodes"
            ) and not config.ENV_DATA.get("infra_replicas"):
                logger.info(f"Label nodes: {workers_to_label} with label: "
                            f"{constants.INFRA_NODE_LABEL}")
                label_cmds.append(f"label nodes {workers_to_label} "
                                  f"{constants.INFRA_NODE_LABEL} --overwrite")

            for cmd in label_cmds:
                _ocp.exec_oc_cmd(command=cmd)

        workers_to_taint = " ".join(distributed_worker_nodes[:to_taint])
        if workers_to_taint:
            logger.info(f"Taint nodes: {workers_to_taint} with taint: "
                        f"{constants.OPERATOR_NODE_TAINT}")
            taint_cmd = (
                f"adm taint nodes {workers_to_taint} {constants.OPERATOR_NODE_TAINT}"
            )
            _ocp.exec_oc_cmd(command=taint_cmd)
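
For orientation, here is a sketch of how the label_cmds list above is assembled and handed off, with hypothetical label values standing in for constants.OPERATOR_NODE_LABEL and constants.INFRA_NODE_LABEL (the real values live in the ocs-ci constants module):

# label_cmds assembly sketch (label values and node names are assumptions)
workers_to_label = "worker-0 worker-1 worker-2"
operator_label = "cluster.ocs.openshift.io/openshift-storage=''"  # assumed value
infra_label = "node-role.kubernetes.io/infra=''"                  # assumed value
infra_nodes_requested = True

label_cmds = [f"label nodes {workers_to_label} {operator_label} --overwrite"]
if infra_nodes_requested:
    label_cmds.append(f"label nodes {workers_to_label} {infra_label} --overwrite")

for cmd in label_cmds:
    # each command string is handed to `oc` by OCP.exec_oc_cmd() in the real code
    print(f"oc {cmd}")
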
Example #7
    def label_and_taint_nodes(self):
        """
        Label and taint worker nodes to be used by OCS operator
        """

        arbiter_deployment = config.DEPLOYMENT.get("arbiter_deployment")

        nodes = ocp.OCP(kind="node").get().get("items", [])
        zone_label = self.get_zone_label()

        worker_nodes = [
            node for node in nodes
            if constants.WORKER_LABEL in node["metadata"]["labels"]
        ]
        if not worker_nodes:
            raise UnavailableResourceException("No worker node found!")
        az_worker_nodes = {}
        for node in worker_nodes:
            az = node["metadata"]["labels"].get(zone_label)
            az_node_list = az_worker_nodes.get(az, [])
            az_node_list.append(node["metadata"]["name"])
            az_worker_nodes[az] = az_node_list
        logger.debug(f"Found the worker nodes in AZ: {az_worker_nodes}")

        distributed_worker_nodes = []
        if arbiter_deployment and config.DEPLOYMENT.get("arbiter_autodetect"):
            for az in list(az_worker_nodes.keys()):
                az_node_list = az_worker_nodes.get(az)
                if az_node_list and len(az_node_list) > 1:
                    node_names = az_node_list[:2]
                    distributed_worker_nodes += node_names
        elif arbiter_deployment and not config.DEPLOYMENT.get(
                "arbiter_autodetect"):
            for az in list(config.DEPLOYMENT.get("worker_zones")):
                az_node_list = az_worker_nodes.get(az)
                if az_node_list and len(az_node_list) > 1:
                    node_names = az_node_list[:2]
                    distributed_worker_nodes += node_names
                else:
                    raise UnavailableResourceException(
                        f"At least 2 worker nodes are required for an arbiter "
                        f"cluster in zone {az}"
                    )
        else:
            while az_worker_nodes:
                for az in list(az_worker_nodes.keys()):
                    az_node_list = az_worker_nodes.get(az)
                    if az_node_list:
                        node_name = az_node_list.pop(0)
                        distributed_worker_nodes.append(node_name)
                    else:
                        del az_worker_nodes[az]
        logger.info(
            f"Distributed worker nodes for AZ: {distributed_worker_nodes}")

        if arbiter_deployment:
            to_label = config.DEPLOYMENT.get("ocs_operator_nodes_to_label", 4)
        else:
            to_label = config.DEPLOYMENT.get("ocs_operator_nodes_to_label", 3)

        to_taint = config.DEPLOYMENT.get("ocs_operator_nodes_to_taint", 0)

        distributed_worker_count = len(distributed_worker_nodes)
        if distributed_worker_count < to_label or distributed_worker_count < to_taint:
            logger.info(f"All nodes: {nodes}")
            logger.info(
                f"Distributed worker nodes: {distributed_worker_nodes}")
            raise UnavailableResourceException(
                f"Not enough distributed worker nodes: {distributed_worker_count} to label: "
                f"{to_label} or taint: {to_taint}!")

        _ocp = ocp.OCP(kind="node")
        workers_to_label = " ".join(distributed_worker_nodes[:to_label])
        if workers_to_label:
            logger.info(f"Label nodes: {workers_to_label} with label: "
                        f"{constants.OPERATOR_NODE_LABEL}")
            label_cmds = [(f"label nodes {workers_to_label} "
                           f"{constants.OPERATOR_NODE_LABEL} --overwrite")]
            if config.DEPLOYMENT.get(
                    "infra_nodes"
            ) and not config.ENV_DATA.get("infra_replicas"):
                logger.info(f"Label nodes: {workers_to_label} with label: "
                            f"{constants.INFRA_NODE_LABEL}")
                label_cmds.append(f"label nodes {workers_to_label} "
                                  f"{constants.INFRA_NODE_LABEL} --overwrite")

            for cmd in label_cmds:
                _ocp.exec_oc_cmd(command=cmd)

        workers_to_taint = " ".join(distributed_worker_nodes[:to_taint])
        if workers_to_taint:
            logger.info(f"Taint nodes: {workers_to_taint} with taint: "
                        f"{constants.OPERATOR_NODE_TAINT}")
            taint_cmd = (
                f"adm taint nodes {workers_to_taint} {constants.OPERATOR_NODE_TAINT}"
            )
            _ocp.exec_oc_cmd(command=taint_cmd)
Example #8
    def label_and_taint_nodes(self):
        """
        Label and taint worker nodes to be used by OCS operator
        """

        # TODO: remove this "heuristics", it doesn't belong there, the process
        # should be explicit and simple, this is asking for trouble, bugs and
        # silently invalid deployments ...
        # See https://github.com/red-hat-storage/ocs-ci/issues/4470
        arbiter_deployment = config.DEPLOYMENT.get("arbiter_deployment")

        nodes = ocp.OCP(kind="node").get().get("items", [])

        worker_nodes = [
            node for node in nodes
            if constants.WORKER_LABEL in node["metadata"]["labels"]
        ]
        if not worker_nodes:
            raise UnavailableResourceException("No worker node found!")
        az_worker_nodes = {}
        for node in worker_nodes:
            az = node["metadata"]["labels"].get(constants.ZONE_LABEL)
            az_node_list = az_worker_nodes.get(az, [])
            az_node_list.append(node["metadata"]["name"])
            az_worker_nodes[az] = az_node_list
        logger.debug(f"Found the worker nodes in AZ: {az_worker_nodes}")

        if arbiter_deployment:
            to_label = config.DEPLOYMENT.get("ocs_operator_nodes_to_label", 4)
        else:
            to_label = config.DEPLOYMENT.get("ocs_operator_nodes_to_label")

        distributed_worker_nodes = []
        if arbiter_deployment and config.DEPLOYMENT.get("arbiter_autodetect"):
            for az in list(az_worker_nodes.keys()):
                az_node_list = az_worker_nodes.get(az)
                if az_node_list and len(az_node_list) > 1:
                    node_names = az_node_list[:2]
                    distributed_worker_nodes += node_names
        elif arbiter_deployment and not config.DEPLOYMENT.get(
                "arbiter_autodetect"):
            to_label_per_az = int(
                to_label /
                len(config.ENV_DATA.get("worker_availability_zones")))
            for az in list(config.ENV_DATA.get("worker_availability_zones")):
                az_node_list = az_worker_nodes.get(az)
                if az_node_list and len(az_node_list) > 1:
                    node_names = az_node_list[:to_label_per_az]
                    distributed_worker_nodes += node_names
                else:
                    raise UnavailableResourceException(
                        f"At least 2 worker nodes are required for an arbiter "
                        f"cluster in zone {az}"
                    )
        else:
            while az_worker_nodes:
                for az in list(az_worker_nodes.keys()):
                    az_node_list = az_worker_nodes.get(az)
                    if az_node_list:
                        node_name = az_node_list.pop(0)
                        distributed_worker_nodes.append(node_name)
                    else:
                        del az_worker_nodes[az]
        logger.info(
            f"Distributed worker nodes for AZ: {distributed_worker_nodes}")

        to_taint = config.DEPLOYMENT.get("ocs_operator_nodes_to_taint", 0)

        distributed_worker_count = len(distributed_worker_nodes)
        if distributed_worker_count < to_label or distributed_worker_count < to_taint:
            logger.info(f"All nodes: {nodes}")
            logger.info(
                f"Distributed worker nodes: {distributed_worker_nodes}")
            raise UnavailableResourceException(
                f"Not enough distributed worker nodes: {distributed_worker_count} to label: "
                f"{to_label} or taint: {to_taint}!")

        _ocp = ocp.OCP(kind="node")
        workers_to_label = " ".join(distributed_worker_nodes[:to_label])
        if workers_to_label:
            logger.info(f"Label nodes: {workers_to_label} with label: "
                        f"{constants.OPERATOR_NODE_LABEL}")
            label_cmds = [(f"label nodes {workers_to_label} "
                           f"{constants.OPERATOR_NODE_LABEL} --overwrite")]
            if config.DEPLOYMENT.get(
                    "infra_nodes"
            ) and not config.ENV_DATA.get("infra_replicas"):
                logger.info(f"Label nodes: {workers_to_label} with label: "
                            f"{constants.INFRA_NODE_LABEL}")
                label_cmds.append(f"label nodes {workers_to_label} "
                                  f"{constants.INFRA_NODE_LABEL} --overwrite")

            for cmd in label_cmds:
                _ocp.exec_oc_cmd(command=cmd)

        workers_to_taint = " ".join(distributed_worker_nodes[:to_taint])
        if workers_to_taint:
            logger.info(f"Taint nodes: {workers_to_taint} with taint: "
                        f"{constants.OPERATOR_NODE_TAINT}")
            taint_cmd = (
                f"adm taint nodes {workers_to_taint} {constants.OPERATOR_NODE_TAINT}"
            )
            _ocp.exec_oc_cmd(command=taint_cmd)
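
In the non-autodetect arbiter branch of this last example, the requested label count is split evenly across the configured worker availability zones and the first few nodes of each zone are picked. A standalone sketch with illustrative zone and node names:

# Per-zone selection sketch for an arbiter deployment (illustrative data)
to_label = 4
worker_availability_zones = ["zone-a", "zone-b"]
az_worker_nodes = {
    "zone-a": ["worker-a1", "worker-a2", "worker-a3"],
    "zone-b": ["worker-b1", "worker-b2"],
}

to_label_per_az = int(to_label / len(worker_availability_zones))  # -> 2
distributed_worker_nodes = []
for az in worker_availability_zones:
    az_node_list = az_worker_nodes.get(az)
    if az_node_list and len(az_node_list) > 1:
        distributed_worker_nodes += az_node_list[:to_label_per_az]
    else:
        raise RuntimeError(f"At least 2 worker nodes required in zone {az}")

print(distributed_worker_nodes)
# -> ['worker-a1', 'worker-a2', 'worker-b1', 'worker-b2']
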