Example #1
class GkeScyllaPodCluster(ScyllaPodCluster, IptablesClusterOpsMixin):
    NODE_PREPARE_FILE = sct_abs_path(
        "sdcm/k8s_configs/gke/scylla-node-prepare.yaml")
    node_terminate_methods = [
        'drain_k8s_node',
        # NOTE: uncomment below when following scylla-operator bug is fixed:
        #       https://github.com/scylladb/scylla-operator/issues/643
        #       Also, need to add check that there are no PV duplicates
        # 'terminate_k8s_host',
        # 'terminate_k8s_node',
    ]

    k8s_cluster: 'GkeCluster'
    node_pool: 'GkeNodePool'
    PodContainerClass = GkeScyllaPodContainer

    # pylint: disable=too-many-arguments
    def add_nodes(
            self,
            count: int,
            ec2_user_data: str = "",
            dc_idx: int = 0,
            rack: int = 0,
            enable_auto_bootstrap: bool = False
    ) -> List[GkeScyllaPodContainer]:
        new_nodes = super().add_nodes(
            count=count,
            ec2_user_data=ec2_user_data,
            dc_idx=dc_idx,
            rack=rack,
            enable_auto_bootstrap=enable_auto_bootstrap)

        self.add_hydra_iptables_rules(nodes=new_nodes)
        self.update_nodes_iptables_redirect_rules(nodes=new_nodes,
                                                  loaders=False)

        return new_nodes
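
A quick sketch of how this override is exercised (the cluster object is assumed to be already constructed):

new_nodes = pod_cluster.add_nodes(count=3, dc_idx=0, rack=0)  # pod_cluster: a GkeScyllaPodCluster
# each returned node is a GkeScyllaPodContainer with the hydra iptables rules already applied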
Example #2
from pathlib import Path

from sdcm import sct_abs_path


def get_cloud_init_config():
    return Path(sct_abs_path("sdcm/provision/aws-cloud-init.txt")).read_text(encoding="utf-8")
Example #3
from __future__ import annotations

import logging
from typing import (
    Any, Optional, Type, Dict, List, Tuple, Callable, Generic, TypeVar, Protocol, runtime_checkable, Union,
)
from keyword import iskeyword
from weakref import proxy as weakproxy
from datetime import datetime
from functools import partialmethod

import yaml
import dateutil.parser
from dateutil.relativedelta import relativedelta

from sdcm import sct_abs_path
from sdcm.sct_events import Severity, SctEventProtocol
from sdcm.sct_events.events_processes import EventsProcessesRegistry
from sdcm.utils.metaclasses import Singleton

DEFAULT_SEVERITIES = sct_abs_path("defaults/severities.yaml")

LOGGER = logging.getLogger(__name__)


class ContinuousRegistryFilter:
    def __init__(self, registry: List[Any]):
        self._registry = registry
        self._output = registry.copy()

    def filter_by_id(self, event_id: str) -> ContinuousRegistryFilter:
        self._output = [
            event for event in self._output if event.event_id == event_id
        ]

        return self
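
Since filter_by_id returns self, filters compose by chaining; a minimal sketch with stand-in event objects:

from types import SimpleNamespace

events = [SimpleNamespace(event_id='a'), SimpleNamespace(event_id='b')]
filtered = ContinuousRegistryFilter(events).filter_by_id('a')
# filtered._output == [events[0]]; the real class presumably exposes an accessor for the result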
Example #4
import logging

from sdcm import sct_abs_path, cluster
from sdcm.utils.decorators import retrying
from sdcm.utils.k8s import ApiCallRateLimiter, TokenUpdateThread
from sdcm.utils.gce_utils import GcloudContextManager
from sdcm.cluster_k8s import KubernetesCluster, ScyllaPodCluster, BaseScyllaPodContainer, CloudK8sNodePool

from sdcm.cluster_k8s.iptables import IptablesPodIpRedirectMixin, IptablesClusterOpsMixin
from sdcm.cluster_gce import MonitorSetGCE

GKE_API_CALL_RATE_LIMIT = 5  # ops/s
GKE_API_CALL_QUEUE_SIZE = 1000  # ops
GKE_URLLIB_RETRY = 5  # How many times an API request is retried before reporting failure
GKE_URLLIB_BACKOFF_FACTOR = 0.1

LOADER_CLUSTER_CONFIG = sct_abs_path("sdcm/k8s_configs/gke-loaders.yaml")
CPU_POLICY_DAEMONSET = sct_abs_path("sdcm/k8s_configs/cpu-policy-daemonset.yaml")
RAID_DAEMONSET = sct_abs_path("sdcm/k8s_configs/raid-daemonset.yaml")
LOGGER = logging.getLogger(__name__)


class GkeNodePool(CloudK8sNodePool):
    k8s_cluster: 'GkeCluster'

    # pylint: disable=too-many-arguments
    def __init__(
            self,
            k8s_cluster: 'KubernetesCluster',
            name: str,
            num_nodes: int,
            instance_type: str,
    ):  # remaining parameters are cut off in this excerpt
        ...
Example #5
import logging
import re

from kubernetes.client import V1ServicePort  # head of a longer import list, cut off in this excerpt
import invoke  # reconstructed; the source line here was garbled
from urllib3.util.retry import Retry

from sdcm import sct_abs_path
from sdcm.remote import LOCALRUNNER
from sdcm.utils.decorators import timeout as timeout_decor, retrying
from sdcm.utils.docker_utils import ContainerManager, DockerException, Container
from sdcm.wait import wait_for

KUBECTL_BIN = "kubectl"
HELM_IMAGE = "alpine/helm:3.3.4"

KUBECTL_TIMEOUT = 300  # seconds

K8S_CONFIGS_PATH_SCT = sct_abs_path("sdcm/k8s_configs")

JSON_PATCH_TYPE = "application/json-patch+json"

LOGGER = logging.getLogger(__name__)
K8S_MEM_CPU_RE = re.compile('^([0-9]+)([a-zA-Z]*)$')
K8S_MEM_CONVERSION_MAP = {
    'e': lambda x: x * 1073741824,
    'p': lambda x: x * 1048576,
    't': lambda x: x * 1024,
    'g': lambda x: x,
    'm': lambda x: x / 1024,
    'k': lambda x: x / 1048576,
    '': lambda x: x,
}
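
A sketch of how K8S_MEM_CPU_RE and the map above combine to normalize a K8S memory quantity to Gi (the helper name here is illustrative, not the module's real API):

def memory_to_gi(value: str) -> float:
    number, suffix = K8S_MEM_CPU_RE.match(value).groups()
    return K8S_MEM_CONVERSION_MAP[suffix.lower().rstrip('i')](int(number))

# memory_to_gi('8Gi') == 8, memory_to_gi('512Mi') == 0.5, memory_to_gi('1T') == 1024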
K8S_CPU_CONVERSION_MAP = {
    'm': lambda x: x / 1000,  # assumed completion (excerpt cut off here): millicores -> cores
    '': lambda x: x,
}
Example #6
import logging

import yaml

from sdcm import sct_abs_path, cluster
from sdcm.utils.k8s import ApiCallRateLimiter, TokenUpdateThread
from sdcm.utils.gce_utils import GcloudContextManager
from sdcm.cluster_k8s import KubernetesCluster, ScyllaPodCluster, BaseScyllaPodContainer, CloudK8sNodePool

from sdcm.cluster_k8s.iptables import IptablesPodIpRedirectMixin, IptablesClusterOpsMixin
from sdcm.cluster_gce import MonitorSetGCE

GKE_API_CALL_RATE_LIMIT = 5  # ops/s
GKE_API_CALL_QUEUE_SIZE = 1000  # ops
GKE_URLLIB_RETRY = 5  # How many times an API request is retried before reporting failure
GKE_URLLIB_BACKOFF_FACTOR = 0.1

LOADER_CLUSTER_CONFIG = sct_abs_path("sdcm/k8s_configs/gke-loaders.yaml")
LOGGER = logging.getLogger(__name__)


class GkeNodePool(CloudK8sNodePool):
    k8s_cluster: 'GkeCluster'

    # pylint: disable=too-many-arguments
    def __init__(self,
                 k8s_cluster: 'KubernetesCluster',
                 name: str,
                 num_nodes: int,
                 instance_type: str,
                 disk_size: int = None,
                 disk_type: str = None,
                 image_type: str = 'UBUNTU',
                 ):  # remaining parameters are cut off in this excerpt
        ...
Example #7
class LongevityPipelineTest:
    """
    This class takes pipeline parameters and produces hydra test-case parameters as (params, tmp_dir) tuples via hydra_test_cases
    """
    sct_base_path = sct_abs_path("").rstrip('/')
    test_id = '11111111-1111-1111-1111-111111111111'
    runner_arg = '--execute-on-runner 1.1.1.1 '

    def __init__(self, backend: str, runner: bool, aws_creds: bool,
                 gce_creds: bool):
        self.home_dir = tempfile.mkdtemp()
        self.backend = backend
        self.runner = runner
        self.aws_creds = aws_creds
        self.gce_creds = gce_creds
        self.home_dir_postfix = ''

    def set_test_home_dir_postfix(self, postfix: str):
        self.home_dir_postfix = postfix

    @staticmethod
    def docker_run_prefix(runner: bool):
        if runner:
            return "docker -H ssh://[email protected] run --rm -it --privileged -h SCT-CONTAINER.*"
        return "docker run --rm -it --privileged .*"

    def sct_path(self, runner: bool):
        if runner:
            return '/home/ubuntu/scylla-cluster-tests'
        return self.sct_base_path

    def expected(self, runner):
        docker_run_prefix = self.docker_run_prefix(runner)
        sct_dir = self.sct_path(runner)
        expected = (
            f'{self.sct_base_path}/get-qa-ssh-keys.sh',
            re.compile(
                'sudo chown -R [^: ]+:[^ ]+ ~/sct-results &> /dev/null [|][|] true'
            ),
            re.compile(
                f"sudo chown -R [^: ]+:[^ ]+ \"{self.sct_base_path}/sct-results\" &> /dev/null [|][|] true"
            ),
            re.compile(
                f"{docker_run_prefix} -l TestId=11111111-1111-1111-1111-111111111111"
            ),
            re.compile(
                f"{docker_run_prefix} -v /var/run:/run -v {sct_dir}:{sct_dir}"
            ),
            re.compile(
                f"{docker_run_prefix} --group-add 1 --group-add 2 --group-add 3"
            ),
        )
        if not runner:
            return expected

        expected += (
            self.pattern_remove_known_key,
            self.pattern_rsync_aws_token,
            self.pattern_rsync_sct_dir,
        )
        if 'gce' not in self.backend:
            return expected
        # Should sync gcloud token if backend is gcloud
        expected += (self.pattern_gcloud_token_sync, )
        return expected

    def not_expected(self, runner: bool):
        # Hydra arguments should not leak
        not_expected = ('--dry-run-hydra', '--execute-on-runner')
        if not runner:
            # No sync if no runner is used
            return not_expected + (
                self.pattern_remove_known_key,
                'rsync -ar -e ssh -o StrictHostKeyChecking=no --delete ',
            )

        if self.is_gce_or_gke:
            return not_expected
        # Should not sync gcloud token if backend is not gcloud
        return not_expected + (self.pattern_gcloud_token_sync, )

    @cached_property
    def pattern_gcloud_token_sync(self):  # pylint: disable=no-self-use
        return "rsync -ar -e 'ssh -o StrictHostKeyChecking=no' --delete " \
               "~/.google_libcloud_auth.skilled-adapter-452 [email protected]:/home/ubuntu/"

    @cached_property
    def pattern_remove_known_key(self):  # pylint: disable=no-self-use
        return 'ssh-keygen -R "1.1.1.1" || true'

    @cached_property
    def pattern_rsync_aws_token(self):  # pylint: disable=no-self-use
        return "rsync -ar -e 'ssh -o StrictHostKeyChecking=no' --delete ~/.aws [email protected]:/home/ubuntu/"

    @cached_property
    def pattern_rsync_sct_dir(self):
        return f"rsync -ar -e 'ssh -o StrictHostKeyChecking=no' --delete " \
               f"{self.sct_base_path} [email protected]:/home/ubuntu/"

    @cached_property
    def step_name_prefix(self):
        return f'{self.backend}_{self.runner}_{self.aws_creds}_{self.gce_creds}'

    @cached_property
    def show_conf_cmd(self):
        return f'output-conf -b {self.backend}'

    @cached_property
    def create_runner_cmd(self):
        return f'create-runner-instance --cloud-provider {self.backend} --region eu-north-1 --availability-zone a ' \
               f'--test-id {self.test_id} --duration 465'

    @cached_property
    def run_test_cmd(self):
        # Command line of hydra itself
        if self.runner:
            return f'{self.runner_arg}{self.run_test_cmd_docker}'
        return self.run_test_cmd_docker

    @cached_property
    def run_test_cmd_docker(self):
        # Command line that should be run inside the docker container
        return f'run-test longevity_test.LongevityTest.test_custom_time --backend {self.backend}'

    @cached_property
    def collect_logs_cmd(self):
        # Command line of hydra itself
        if self.runner:
            return f'{self.runner_arg}{self.collect_logs_cmd_docker}'
        return self.collect_logs_cmd_docker

    @cached_property
    def collect_logs_cmd_docker(self):  # pylint: disable=no-self-use
        # Command line that should be run inside the docker container
        return 'collect-logs'

    @cached_property
    def clean_resources_cmd(self):
        # Command line of hydra itself
        if self.runner:
            return f'{self.runner_arg}{self.clean_resources_cmd_docker}'
        return self.clean_resources_cmd_docker

    @cached_property
    def clean_resources_cmd_docker(self):
        # Command line that should be run inside the docker container
        return f'clean-resources --post-behavior --test-id {self.test_id}'

    @cached_property
    def send_email_cmd(self):
        # Command line of hydra itself
        if self.runner:
            return f'{self.runner_arg}{self.send_email_cmd_docker}'
        return self.send_email_cmd_docker

    @cached_property
    def send_email_cmd_docker(self):  # pylint: disable=no-self-use
        # Command line that should be run inside the docker container
        return 'send-email --test-status SUCCESS --start-time 1627268929 --email-recipients [email protected]'

    @property
    def test_home_dir(self) -> str:
        return os.path.join(self.home_dir, self.home_dir_postfix)

    @cached_property
    def is_gce_or_gke(self) -> bool:
        return 'gce' in self.backend or 'gke' in self.backend

    @property
    def get_longevity_env(self) -> Dict[str, str]:
        longevity_env = {
            'SCT_TEST_ID': self.test_id,
            'HOME': self.test_home_dir,
            'USER': '******',
            'SCT_CLUSTER_BACKEND': self.backend,
            'SCT_CONFIG_FILES':
                '["/jenkins/slave/workspace/siren-tests/longevity-tests/cloud-longevity-small-data-'
                'set-1h-gcp/siren-tests/sct_plugin/configurations/scylla_cloud_nemesis_small_set.yaml"]',
        }
        return longevity_env

    @property
    def before_runner_not_expected(self):
        # Steps before the runner is created must not contain runner-related patterns
        return self.not_expected(runner=False)

    @property
    def before_runner_expected(self):
        # Expected patterns for steps that run before the runner is created
        return self.expected(runner=False)

    @property
    def before_runner_docker_run_prefix(self):
        # Steps before the runner is created are built without the runner parameter
        return self.docker_run_prefix(runner=False)

    @property
    def after_runner_not_expected(self):
        # Steps after the runner is created must not contain the leaked hydra arguments
        return self.not_expected(runner=self.runner)

    @property
    def after_runner_expected(self):
        # Expected patterns for steps that run after the runner is created
        return self.expected(runner=self.runner)

    @property
    def after_runner_docker_run_prefix(self):
        # Steps after the runner is created use the configured runner parameter
        return self.docker_run_prefix(runner=self.runner)

    @property
    def test_tmp_dir(self) -> HydraTestCaseTmpDir:
        return HydraTestCaseTmpDir(home_dir=self.test_home_dir,
                                   aws_creds=self.aws_creds,
                                   gce_creds=self.gce_creds)

    @property
    def test_case_show_conf(self):
        self.set_test_home_dir_postfix('show_conf')
        return HydraTestCaseParams(
            name=f'{self.step_name_prefix}_show_conf',
            cmd=self.show_conf_cmd,
            expected=[
                *self.before_runner_expected,
                re.compile(
                    f"{self.before_runner_docker_run_prefix} eval './sct.py  {self.show_conf_cmd}'"
                )
            ],
            not_expected=[*self.before_runner_not_expected],
            return_code=0,
            env=self.get_longevity_env,
        ), self.test_tmp_dir

    @property
    def test_case_create_runner(self):
        self.set_test_home_dir_postfix('create_runner')
        return HydraTestCaseParams(
            name=f'{self.step_name_prefix}_create_runner',
            cmd=self.create_runner_cmd,
            expected=[
                *self.before_runner_expected,
                re.compile(
                    f"{self.before_runner_docker_run_prefix} eval './sct.py  {self.create_runner_cmd}'"
                )
            ],
            not_expected=[*self.before_runner_not_expected],
            return_code=0,
            env=self.get_longevity_env), self.test_tmp_dir

    @property
    def test_case_run_test(self):
        self.set_test_home_dir_postfix('run_test')
        return HydraTestCaseParams(
            name=f'{self.step_name_prefix}_run_test',
            cmd=self.run_test_cmd,
            expected=[
                *self.after_runner_expected,
                re.compile(
                    f"{self.after_runner_docker_run_prefix} eval './sct.py  {self.run_test_cmd_docker}'"
                )
            ],
            not_expected=[*self.after_runner_not_expected],
            return_code=0,
            env=self.get_longevity_env), self.test_tmp_dir

    @property
    def test_case_collect_logs(self):
        self.set_test_home_dir_postfix('collect_logs')
        return HydraTestCaseParams(
            name=f'{self.step_name_prefix}_collect_logs',
            cmd=self.collect_logs_cmd,
            expected=[
                *self.after_runner_expected,
                re.compile(
                    f"{self.after_runner_docker_run_prefix} eval './sct.py  {self.collect_logs_cmd_docker}'"
                )
            ],
            not_expected=[*self.after_runner_not_expected],
            return_code=0,
            env=self.get_longevity_env), self.test_tmp_dir

    @property
    def test_case_clean_resources(self):
        self.set_test_home_dir_postfix('clean_resources')
        return HydraTestCaseParams(
            name=f'{self.step_name_prefix}_clean_resources',
            cmd=self.clean_resources_cmd,
            expected=[
                *self.after_runner_expected,
                re.compile(
                    f"{self.after_runner_docker_run_prefix} "
                    f"eval './sct.py  {self.clean_resources_cmd_docker}'")
            ],
            not_expected=[*self.after_runner_not_expected],
            return_code=0,
            env=self.get_longevity_env), self.test_tmp_dir

    @property
    def test_case_send_email(self):
        self.set_test_home_dir_postfix('send_email')
        return HydraTestCaseParams(
            name=f'{self.step_name_prefix}_send_email',
            cmd=self.send_email_cmd,
            expected=[
                *self.after_runner_expected,
                re.compile(
                    f"{self.after_runner_docker_run_prefix} eval './sct.py  {self.send_email_cmd_docker}'"
                )
            ],
            not_expected=[*self.after_runner_not_expected],
            return_code=0,
            env=self.get_longevity_env), self.test_tmp_dir

    @property
    def hydra_test_cases(
            self) -> Iterable[Tuple[HydraTestCaseParams, HydraTestCaseTmpDir]]:
        """
        Create the list of test-case parameters that represent the longevity pipeline steps
        """
        return (self.test_case_show_conf, self.test_case_create_runner,
                self.test_case_run_test, self.test_case_collect_logs,
                self.test_case_clean_resources)
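
hydra_test_cases yields (HydraTestCaseParams, HydraTestCaseTmpDir) pairs, one per pipeline step; a sketch of consuming them (constructor values are illustrative):

pipeline = LongevityPipelineTest(backend='aws', runner=True, aws_creds=True, gce_creds=False)
for case_params, tmp_dir in pipeline.hydra_test_cases:
    print(case_params.name, '->', case_params.cmd)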
Example #8
import logging
from functools import cached_property

import kubernetes as k8s
from urllib3.util.retry import Retry

from sdcm import sct_abs_path
from sdcm.remote import LOCALRUNNER
from sdcm.utils.decorators import timeout as timeout_decor
from sdcm.utils.docker_utils import ContainerManager, DockerException, Container

KUBECTL_BIN = "kubectl"
HELM_IMAGE = "alpine/helm:3.3.4"

KUBECTL_TIMEOUT = 300  # seconds

K8S_CONFIGS = sct_abs_path("sdcm/k8s_configs")

JSON_PATCH_TYPE = "application/json-patch+json"

LOGGER = logging.getLogger(__name__)

logging.getLogger("kubernetes.client.rest").setLevel(logging.INFO)


class ApiLimiterClient(k8s.client.ApiClient):
    _api_rate_limiter: 'ApiCallRateLimiter' = None

    def call_api(self, *args, **kwargs):
        if self._api_rate_limiter:
            self._api_rate_limiter.wait()
        return super().call_api(*args, **kwargs)
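
A sketch of wiring a limiter into this client (assuming ApiCallRateLimiter exposes the wait() used above; the constructor values mirror the GKE constants from earlier examples):

client = ApiLimiterClient()
client._api_rate_limiter = ApiCallRateLimiter(
    rate_limit=5, queue_size=1000, urllib_retry=5, urllib_backoff_factor=0.1)
# every subsequent call_api() now blocks on the limiter before reaching the API server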
Example #9
import logging
from textwrap import dedent
from functools import cached_property

from invoke.exceptions import UnexpectedExit

from sdcm import sct_abs_path, cluster, cluster_gce
from sdcm.remote import LOCALRUNNER
from sdcm.remote.kubernetes_cmd_runner import KubernetesCmdRunner
from sdcm.cluster_k8s import KubernetesCluster, BasePodContainer, ScyllaPodCluster
from sdcm.cluster_k8s.iptables import IptablesPodPortsRedirectMixin, IptablesClusterOpsMixin
from sdcm.utils.k8s import KubernetesOps
from sdcm.utils.common import get_free_port, wait_for_port
from sdcm.utils.decorators import retrying
from sdcm.utils.docker_utils import ContainerManager

SCYLLA_CLUSTER_CONFIG = sct_abs_path("sdcm/k8s_configs/cluster-minikube.yaml")
KUBECTL_PROXY_PORT = 8001
KUBECTL_PROXY_CONTAINER = "auto_ssh:kubectl_proxy"
SCYLLA_POD_EXPOSED_PORTS = [
    3000,
    9042,
    9180,
]

LOGGER = logging.getLogger(__name__)


class MinikubeOps:
    @classmethod
    def setup_minikube(cls, node: cluster.BaseNode, kubectl_version: str,
                       minikube_version: str) -> None:
        ...  # method body is cut off in this excerpt
Example #10
import logging

from sdcm import sct_abs_path
from sdcm.cluster_k8s import (  # head of this import list was cut off; reconstructed
    CloudK8sNodePool,
    BaseScyllaPodContainer,
    ScyllaPodCluster,
    COMMON_CONTAINERS_RESOURCES,
    LOCAL_MINIO_DIR,
    LOCAL_PROVISIONER_DIR,
    OPERATOR_CONTAINERS_RESOURCES,
    SCYLLA_MANAGER_AGENT_RESOURCES,
    SCYLLA_MANAGER_AGENT_VERSION_IN_SCYLLA_MANAGER,
    SCYLLA_VERSION_IN_SCYLLA_MANAGER,
)
from sdcm.utils.k8s import TokenUpdateThread, HelmValues
from sdcm.utils.decorators import retrying
from sdcm.utils.docker_utils import docker_hub_login
from sdcm.utils import version_utils

SRC_APISERVER_AUDIT_POLICY = sct_abs_path(
    "sdcm/k8s_configs/local-kind/audit-policy.yaml")
DST_APISERVER_AUDIT_POLICY = "/etc/kubernetes/policies/audit-policy.yaml"
DST_APISERVER_AUDIT_LOG = "/var/log/kubernetes/kube-apiserver-audit.log"

CNI_CALICO_CONFIG = sct_abs_path("sdcm/k8s_configs/cni-calico.yaml")
CNI_CALICO_VERSION = "v3.23.0"
LOGGER = logging.getLogger(__name__)
POOL_LABEL_NAME = 'minimal-k8s-nodepool'


class MinimalK8SNodePool(CloudK8sNodePool):
    k8s_cluster: 'LocalKindCluster'

    def deploy(self) -> None:
        self.is_deployed = True
Example #11
class EksCluster(KubernetesCluster, EksClusterCleanupMixin):
    POOL_LABEL_NAME = 'eks.amazonaws.com/nodegroup'
    IS_NODE_TUNING_SUPPORTED = True
    NODE_PREPARE_FILE = sct_abs_path(
        "sdcm/k8s_configs/eks/scylla-node-prepare.yaml")
    pools: Dict[str, EksNodePool]
    short_cluster_name: str

    # pylint: disable=too-many-arguments
    def __init__(self,
                 eks_cluster_version,
                 ec2_security_group_ids,
                 ec2_subnet_ids,
                 ec2_role_arn,
                 credentials,
                 user_prefix,
                 service_ipv4_cidr,
                 vpc_cni_version,
                 nodegroup_role_arn,
                 params=None,
                 cluster_uuid=None,
                 region_name=None):
        super().__init__(user_prefix=user_prefix,
                         cluster_uuid=cluster_uuid,
                         region_name=region_name,
                         params=params)
        self.credentials = credentials
        self.eks_cluster_version = eks_cluster_version
        self.ec2_role_arn = ec2_role_arn
        self.nodegroup_role_arn = nodegroup_role_arn
        self.ec2_security_group_ids = ec2_security_group_ids
        self.ec2_subnet_ids = ec2_subnet_ids
        self.service_ipv4_cidr = service_ipv4_cidr
        self.vpc_cni_version = vpc_cni_version

    @cached_property
    def allowed_labels_on_scylla_node(self) -> list:
        allowed_labels_on_scylla_node = [
            ('name', 'node-setup'),
            ('name', 'cpu-policy'),
            ('k8s-app', 'aws-node'),
            ('app', 'local-volume-provisioner'),
            ('k8s-app', 'kube-proxy'),
        ]
        if self.tenants_number > 1:
            allowed_labels_on_scylla_node.append(('app', 'scylla'))
            allowed_labels_on_scylla_node.append(
                ('app.kubernetes.io/name', 'scylla'))
        else:
            allowed_labels_on_scylla_node.append(
                ('scylla/cluster', self.k8s_scylla_cluster_name))
        if self.is_performance_tuning_enabled:
            # NOTE: add performance-tuning pods only when tuning is expected;
            #       when tuning is disabled they must not be present.
            allowed_labels_on_scylla_node.extend(self.perf_pods_labels)
        return allowed_labels_on_scylla_node

    def create_eks_cluster(self, wait_till_functional=True):
        self.eks_client.create_cluster(
            name=self.short_cluster_name,
            version=self.eks_cluster_version,
            roleArn=self.ec2_role_arn,
            resourcesVpcConfig={
                'securityGroupIds': self.ec2_security_group_ids[0],
                'subnetIds': self.ec2_subnet_ids,
                'endpointPublicAccess': True,
                'endpointPrivateAccess': True,
                'publicAccessCidrs': [
                    '0.0.0.0/0',
                ]
            },
            kubernetesNetworkConfig={
                'serviceIpv4Cidr': self.service_ipv4_cidr
            },
            logging={
                'clusterLogging': [
                    {
                        'types': [
                            'api', 'audit', 'authenticator',
                            'controllerManager', 'scheduler'
                        ],
                        'enabled': True,
                    },
                ]
            },
            tags=self.tags,
        )
        self.eks_client.create_addon(clusterName=self.short_cluster_name,
                                     addonName='vpc-cni',
                                     addonVersion=self.vpc_cni_version)
        if wait_till_functional:
            wait_for(
                lambda: self.cluster_status == 'ACTIVE',
                step=60,
                throw_exc=True,
                timeout=1200,
                text=f"Waiting till EKS cluster {self.short_cluster_name} becomes operational",
            )

    @property
    def cluster_info(self) -> dict:
        return self.eks_client.describe_cluster(
            name=self.short_cluster_name)['cluster']

    @property
    def cluster_status(self) -> str:
        return self.cluster_info['status']

    def __str__(self):
        return f"{type(self).__name__} {self.name} | Version: {self.eks_cluster_version}"

    def create_token_update_thread(self):
        return EksTokenUpdateThread(
            aws_cmd=f'aws eks --region {self.region_name} get-token --cluster-name {self.short_cluster_name}',
            kubectl_token_path=self.kubectl_token_path)

    def create_kubectl_config(self):
        LOCALRUNNER.run(
            f'aws eks --region {self.region_name} update-kubeconfig --name {self.short_cluster_name}'
        )

    def deploy(self):
        LOGGER.info("Create EKS cluster `%s'", self.short_cluster_name)
        self.create_eks_cluster()
        LOGGER.info("Patch kubectl config")
        self.patch_kubectl_config()

    def tune_network(self):
        """Tune networking on all nodes of an EKS cluster to reduce number of reserved IPs.

        Set following special EKS-specific env vars to 'aws-node' daemonset:

            WARM_ENI_TARGET (default is 1) - number of 'ready-to-use' additional
                network interfaces which in our case almost always stay idle.
                Setting this one to 0 means we have 1 network interface at start,
                and new ones will be added (expensive API calls) only when needed automatically.
            MINIMUM_IP_TARGET (no default) - minimum number of IP addresses that must be
                dedicated to an EC2 instance. If not set then number of reserved IPs equal
                to number of IP capacity of a network interface.
            WARM_IP_TARGET (no default) - how many unused IP must be kept as 'ready-to-use'.
                if not set then depend on the available IPs of ENIs.

        General idea behind it: reduce number of 'reserved' IP addresses by each of
        EC2 instances used in an EKS cluster.
        Without such tweaking one cluster we create uses more than 300 IP addresses
        and having /22 subnet (<1024 IP addresses) we can run just 3 EKS clusters at once.

        Env vars details:
            https://github.com/aws/amazon-vpc-cni-k8s/blob/master/docs/eni-and-ip-target.md
        IPs per network interface per node type details:
            https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/using-eni.html
        """
        LOGGER.info("Tune network of the EKS cluster")
        env_vars = (
            "WARM_ENI_TARGET=0",
            "MINIMUM_IP_TARGET=8",
            "WARM_IP_TARGET=2",
        )
        self.kubectl(f"set env daemonset aws-node {' '.join(env_vars)}",
                     namespace="kube-system")

    def deploy_node_pool(self,
                         pool: EksNodePool,
                         wait_till_ready=True) -> None:
        self._add_pool(pool)
        if pool.is_deployed:
            return
        if wait_till_ready:
            pool.deploy_and_wait_till_ready()
        else:
            pool.deploy()

    def resize_node_pool(self,
                         name: str,
                         num_nodes: int,
                         wait_till_ready=True) -> None:
        self.pools[name].resize(num_nodes)
        if wait_till_ready:
            self.pools[name].wait_for_nodes_readiness()

    def destroy(self):
        EksClusterCleanupMixin.destroy(self)
        self.stop_token_update_thread()

    def get_ec2_instance_by_id(self, instance_id):
        return boto3.resource(
            'ec2', region_name=self.region_name).Instance(id=instance_id)

    def deploy_scylla_manager(self, pool_name: str = None) -> None:
        self.deploy_minio_s3_backend()
        super().deploy_scylla_manager(pool_name=pool_name)

    def _get_all_instance_ids(self):
        cmd = "get node --no-headers -o custom-columns=:.spec.providerID"
        return [
            name.split("/")[-1] for name in self.kubectl(cmd).stdout.split()
        ]

    def set_tags(self, instance_ids):
        if isinstance(instance_ids, str):
            instance_ids = [instance_ids]
        boto3.client('ec2', region_name=self.region_name).create_tags(
            Resources=instance_ids,
            Tags=[{
                "Key": key,
                "Value": value
            } for key, value in self.tags.items()],
        )

    def set_tags_on_all_instances(self):
        # NOTE: EKS doesn't apply nodeGroup's tags to nodes.
        # So, we add it for each node explicitly.
        self.set_tags(self._get_all_instance_ids())

    def set_security_groups(self, instance):
        for network_interface in instance.network_interfaces:
            security_groups = [g["GroupId"] for g in network_interface.groups]
            # NOTE: Make API call only if it is needed
            if self.ec2_security_group_ids[0][0] not in security_groups:
                security_groups.append(self.ec2_security_group_ids[0][0])
                network_interface.modify_attribute(Groups=security_groups)

    def set_security_groups_on_all_instances(self):
        # NOTE: EKS doesn't apply nodeGroup's security groups to nodes
        # So, we add it for each network interface of a node explicitly.
        for instance_id in self._get_all_instance_ids():
            self.set_security_groups(self.get_ec2_instance_by_id(instance_id))

    def deploy_scylla_cluster(self, *args, **kwargs) -> None:  # pylint: disable=signature-differs
        super().deploy_scylla_cluster(*args, **kwargs)
        self.set_security_groups_on_all_instances()
        self.set_tags_on_all_instances()

    def deploy_monitoring_cluster(self, *args, **kwargs) -> None:  # pylint: disable=signature-differs
        super().deploy_monitoring_cluster(*args, **kwargs)
        self.set_security_groups_on_all_instances()
        self.set_tags_on_all_instances()

    def upgrade_kubernetes_platform(self) -> str:
        upgrade_version = f"1.{int(self.eks_cluster_version.split('.')[1]) + 1}"

        # Upgrade control plane (API, scheduler, manager and so on ...)
        LOGGER.info("Upgrading K8S control plane to the '%s' version",
                    upgrade_version)
        self.eks_client.update_cluster_version(
            name=self.short_cluster_name,
            version=upgrade_version,
        )
        # NOTE: sleep for a short period of time to make sure the cluster's status changes
        #       before we poll for its readiness.
        #       The upgrade takes dozens of minutes, so a short 'sleep' wastes no real time.
        #       5 sec is not enough in some cases; 20 sec must be sufficient.
        time.sleep(20)
        self.eks_client.get_waiter('cluster_active').wait(
            name=self.short_cluster_name,
            WaiterConfig={
                'Delay': 30,
                'MaxAttempts': 120
            },
        )

        # Upgrade scylla-related node pools
        for node_pool in (self.AUXILIARY_POOL_NAME, self.SCYLLA_POOL_NAME):
            LOGGER.info("Upgrading '%s' node pool to the '%s' version",
                        node_pool, upgrade_version)
            self.eks_client.update_nodegroup_version(
                clusterName=self.short_cluster_name,
                nodegroupName=node_pool,
                version=upgrade_version,
            )
            time.sleep(20)
            self.eks_client.get_waiter('nodegroup_active').wait(
                clusterName=self.short_cluster_name,
                nodegroupName=node_pool,
                WaiterConfig={
                    # NOTE: upgrading one Scylla K8S node takes about 10 minutes,
                    #       so the wait timeout scales with the number of DB nodes.
                    #       Keep it above the expected value to absorb possible fluctuations.
                    'Delay': 30,
                    'MaxAttempts': self.params.get("n_db_nodes") * 30,
                },
            )
        return upgrade_version
Example #12
class GkeCluster(KubernetesCluster):
    AUXILIARY_POOL_NAME = 'default-pool'  # The default pool that gets deployed with the cluster
    POOL_LABEL_NAME = 'cloud.google.com/gke-nodepool'
    IS_NODE_TUNING_SUPPORTED = True
    NODE_PREPARE_FILE = sct_abs_path("sdcm/k8s_configs/gke/scylla-node-prepare.yaml")
    pools: Dict[str, GkeNodePool]

    # pylint: disable=too-many-arguments
    def __init__(self,
                 gke_cluster_version,
                 gke_k8s_release_channel,
                 gce_disk_size,
                 gce_disk_type,
                 gce_network,
                 services,
                 gce_instance_type='n1-standard-2',
                 user_prefix=None,
                 params=None,
                 gce_datacenter=None,
                 cluster_uuid=None,
                 n_nodes=2,
                 ):
        super().__init__(
            params=params,
            cluster_uuid=cluster_uuid,
            user_prefix=user_prefix
        )
        self.gke_cluster_version = gke_cluster_version
        self.gke_k8s_release_channel = gke_k8s_release_channel.strip()
        self.gce_disk_type = gce_disk_type
        self.gce_disk_size = gce_disk_size
        self.gce_network = gce_network
        self.gce_services = services
        self.gce_instance_type = gce_instance_type
        self.n_nodes = n_nodes
        self.gce_project = services[0].project
        self.gce_user = services[0].key
        self.gce_zone = gce_datacenter[0]
        self.gke_cluster_created = False
        self.api_call_rate_limiter = ApiCallRateLimiter(
            rate_limit=GKE_API_CALL_RATE_LIMIT,
            queue_size=GKE_API_CALL_QUEUE_SIZE,
            urllib_retry=GKE_URLLIB_RETRY,
            urllib_backoff_factor=GKE_URLLIB_BACKOFF_FACTOR,
        )
        self.api_call_rate_limiter.start()

    @cached_property
    def allowed_labels_on_scylla_node(self) -> list:
        allowed_labels_on_scylla_node = [
            ('app', 'xfs-formatter'),
            ('app', 'local-volume-provisioner'),
            ('k8s-app', 'fluentbit-gke'),
            ('k8s-app', 'gke-metrics-agent'),
            ('component', 'kube-proxy'),
            ('k8s-app', 'gcp-compute-persistent-disk-csi-driver'),
        ]
        if self.tenants_number > 1:
            allowed_labels_on_scylla_node.append(('app.kubernetes.io/name', 'scylla'))
            allowed_labels_on_scylla_node.append(('app', 'scylla'))
        else:
            allowed_labels_on_scylla_node.append(('scylla/cluster', self.k8s_scylla_cluster_name))
        if self.is_performance_tuning_enabled:
            # NOTE: add performance-tuning pods only when tuning is expected;
            #       when tuning is disabled they must not be present.
            allowed_labels_on_scylla_node.extend(self.perf_pods_labels)
        return allowed_labels_on_scylla_node

    def __str__(self):
        return f"{type(self).__name__} {self.name} | Zone: {self.gce_zone} | Version: {self.gke_cluster_version}"

    def deploy(self):
        LOGGER.info("Create GKE cluster `%s' with %d node(s) in %s",
                    self.short_cluster_name, self.n_nodes, self.AUXILIARY_POOL_NAME)
        tags = ",".join(f"{key}={value}" for key, value in self.tags.items())
        with self.gcloud as gcloud:
            # NOTE: only static K8S release channel supports disabling of autoupgrade
            gcloud.run(f"container --project {self.gce_project} clusters create {self.short_cluster_name}"
                       f" --no-enable-basic-auth"
                       f" --zone {self.gce_zone}"
                       f" --cluster-version {self.gke_cluster_version}"
                       f"{' --release-channel ' + self.gke_k8s_release_channel if self.gke_k8s_release_channel else ''}"
                       f" --network {self.gce_network}"
                       f" --num-nodes {self.n_nodes}"
                       f" --machine-type {self.gce_instance_type}"
                       f" --image-type UBUNTU"
                       f" --disk-type {self.gce_disk_type}"
                       f" --disk-size {self.gce_disk_size}"
                       f" --enable-stackdriver-kubernetes"
                       f"{'' if self.gke_k8s_release_channel else ' --no-enable-autoupgrade'}"
                       f"{'' if self.gke_k8s_release_channel else ' --no-enable-autorepair'}"
                       f" --metadata {tags}")
            self.patch_kubectl_config()
            self.deploy_node_pool(GkeNodePool(
                name=self.AUXILIARY_POOL_NAME,
                num_nodes=self.n_nodes,
                disk_size=self.gce_disk_size,
                disk_type=self.gce_disk_type,
                k8s_cluster=self,
                instance_type=self.gce_instance_type,
                is_deployed=True
            ))

        LOGGER.info("Setup RBAC for GKE cluster `%s'", self.name)
        self.kubectl("create clusterrolebinding cluster-admin-binding --clusterrole cluster-admin "
                     f"--user {self.gce_user}")

    @cached_property
    def gcloud(self) -> GcloudContextManager:  # pylint: disable=no-self-use
        return self.test_config.tester_obj().localhost.gcloud

    def deploy_node_pool(self, pool: GkeNodePool, wait_till_ready=True) -> None:
        self._add_pool(pool)
        if pool.is_deployed:
            return
        LOGGER.info("Create %s pool with %d node(s) in GKE cluster `%s'", pool.name, pool.num_nodes, self.name)
        if wait_till_ready:
            with self.api_call_rate_limiter.pause:
                pool.deploy_and_wait_till_ready()
                self.api_call_rate_limiter.wait_till_api_become_stable(self)
        else:
            pool.deploy()

    def wait_all_node_pools_to_be_ready(self):
        with self.api_call_rate_limiter.pause:
            super().wait_all_node_pools_to_be_ready()
            self.api_call_rate_limiter.wait_till_api_become_stable(self)

    def resize_node_pool(self, name: str, num_nodes: int) -> None:
        with self.api_call_rate_limiter.pause:
            self.pools[name].resize(num_nodes)
            self.api_call_rate_limiter.wait_till_api_become_stable(self)

    def get_instance_group_name_for_pool(self, pool_name: str, default=None) -> str:
        try:
            group_link = yaml.safe_load(
                self.gcloud.run(
                    f'container node-pools describe {pool_name} '
                    f'--zone {self.gce_zone} --project {self.gce_project} '
                    f'--cluster {self.short_cluster_name}')
            ).get('instanceGroupUrls')[0]
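            # e.g. '.../instanceGroupManagers/gke-sct-pool-grp' -> 'gke-sct-pool-grp' (illustrative URL tail)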
            return group_link.split('/')[-1]
        except Exception as exc:
            if default is not None:
                return default
            raise RuntimeError(f"Can't get instance group name due to: {exc}") from exc

    def delete_instance_that_belong_to_instance_group(self, group_name: str, instance_name: str):
        self.gcloud.run(f'compute instance-groups managed delete-instances {group_name} '
                        f'--zone={self.gce_zone} --instances={instance_name}')

    def create_token_update_thread(self):
        return GcloudTokenUpdateThread(self.gcloud, self.kubectl_token_path)

    def create_kubectl_config(self):
        self.gcloud.run(f"container clusters get-credentials {self.short_cluster_name} --zone {self.gce_zone}")

    def destroy(self):
        self.api_call_rate_limiter.stop()
        self.stop_token_update_thread()

    def deploy_scylla_manager(self, pool_name: str = None) -> None:
        self.deploy_minio_s3_backend()
        super().deploy_scylla_manager(pool_name=pool_name)

    # NOTE: blocked by https://github.com/scylladb/scylla-operator/issues/760
    def upgrade_kubernetes_platform(self) -> str:
        # NOTE: 'self.gke_cluster_version' can be like 1.21.3-gke.N or 1.21
        upgrade_version = f"1.{int(self.gke_cluster_version.split('.')[1]) + 1}"

        with self.gcloud as gcloud:
            # Upgrade control plane (API, scheduler, manager and so on ...)
            LOGGER.info("Upgrading K8S control plane to the '%s' version", upgrade_version)
            gcloud.run(f"container clusters upgrade {self.short_cluster_name} "
                       f"--master --quiet --project {self.gce_project} --zone {self.gce_zone} "
                       f"--cluster-version {upgrade_version}")

            # Upgrade scylla-related node pools
            for node_pool in (self.AUXILIARY_POOL_NAME, self.SCYLLA_POOL_NAME):
                LOGGER.info("Upgrading '%s' node pool to the '%s' version",
                            node_pool, upgrade_version)
                # NOTE: one node upgrade takes about 10 minutes
                gcloud.run(f"container clusters upgrade {self.short_cluster_name} "
                           f"--quiet --project {self.gce_project} --zone {self.gce_zone} "
                           f"--node-pool={node_pool}")
        return upgrade_version