class GkeScyllaPodCluster(ScyllaPodCluster, IptablesClusterOpsMixin):
    """Scylla cluster running as pods on a GKE Kubernetes cluster.

    Extends the generic ScyllaPodCluster with GKE-specific node-preparation
    manifest and iptables redirection for the new pods.
    """
    # Manifest applied to prepare GKE nodes before Scylla pods are scheduled.
    NODE_PREPARE_FILE = sct_abs_path(
        "sdcm/k8s_configs/gke/scylla-node-prepare.yaml")

    # Nemesis-style node termination methods allowed on this backend.
    node_terminate_methods = [
        'drain_k8s_node',
        # NOTE: uncomment below when following scylla-operator bug is fixed:
        #       https://github.com/scylladb/scylla-operator/issues/643
        #       Also, need to add check that there are no PV duplicates
        # 'terminate_k8s_host',
        # 'terminate_k8s_node',
    ]

    k8s_cluster: 'GkeCluster'
    node_pool: 'GkeNodePool'
    PodContainerClass = GkeScyllaPodContainer

    # pylint: disable=too-many-arguments
    def add_nodes(self,
                  count: int,
                  ec2_user_data: str = "",
                  dc_idx: int = 0,
                  rack: int = 0,
                  enable_auto_bootstrap: bool = False) -> List[GkeScyllaPodContainer]:
        """Add ``count`` Scylla pods and wire up iptables redirection for them.

        Delegates pod creation to the parent class, then applies the
        hydra/loader iptables rules to the freshly created nodes.
        """
        new_nodes = super().add_nodes(count=count,
                                      ec2_user_data=ec2_user_data,
                                      dc_idx=dc_idx,
                                      rack=rack,
                                      enable_auto_bootstrap=enable_auto_bootstrap)
        self.add_hydra_iptables_rules(nodes=new_nodes)
        # loaders=False: only redirect rules for DB nodes, loaders are untouched here.
        self.update_nodes_iptables_redirect_rules(nodes=new_nodes, loaders=False)
        return new_nodes
def get_cloud_init_config():
    """Return the AWS cloud-init user-data template as a UTF-8 string."""
    template_path = Path(sct_abs_path("sdcm/provision/aws-cloud-init.txt"))
    return template_path.read_text(encoding="utf-8")
Any, Optional, Type, Dict, List, Tuple, Callable, Generic, TypeVar, Protocol, runtime_checkable, Union from keyword import iskeyword from weakref import proxy as weakproxy from datetime import datetime from functools import partialmethod import yaml import dateutil.parser from dateutil.relativedelta import relativedelta from sdcm import sct_abs_path from sdcm.sct_events import Severity, SctEventProtocol from sdcm.sct_events.events_processes import EventsProcessesRegistry from sdcm.utils.metaclasses import Singleton DEFAULT_SEVERITIES = sct_abs_path("defaults/severities.yaml") LOGGER = logging.getLogger(__name__) class ContinuousRegistryFilter: def __init__(self, registry: List[Any]): self._registry = registry self._output = registry.copy() def filter_by_id(self, event_id: str) -> ContinuousRegistryFilter: self._output = [ event for event in self._output if event.event_id == event_id ] return self
from sdcm import sct_abs_path, cluster from sdcm.utils.decorators import retrying from sdcm.utils.k8s import ApiCallRateLimiter, TokenUpdateThread from sdcm.utils.gce_utils import GcloudContextManager from sdcm.cluster_k8s import KubernetesCluster, ScyllaPodCluster, BaseScyllaPodContainer, CloudK8sNodePool from sdcm.cluster_k8s.iptables import IptablesPodIpRedirectMixin, IptablesClusterOpsMixin from sdcm.cluster_gce import MonitorSetGCE GKE_API_CALL_RATE_LIMIT = 5 # ops/s GKE_API_CALL_QUEUE_SIZE = 1000 # ops GKE_URLLIB_RETRY = 5 # How many times api request is retried before reporting failure GKE_URLLIB_BACKOFF_FACTOR = 0.1 LOADER_CLUSTER_CONFIG = sct_abs_path("sdcm/k8s_configs/gke-loaders.yaml") CPU_POLICY_DAEMONSET = sct_abs_path("sdcm/k8s_configs/cpu-policy-daemonset.yaml") RAID_DAEMONSET = sct_abs_path("sdcm/k8s_configs/raid-daemonset.yaml") LOGGER = logging.getLogger(__name__) class GkeNodePool(CloudK8sNodePool): k8s_cluster: 'GkeCluster' # pylint: disable=too-many-arguments def __init__( self, k8s_cluster: 'KubernetesCluster', name: str, num_nodes: int, instance_type: str,
V1ServicePort from paramiko.config import invoke from urllib3.util.retry import Retry from sdcm import sct_abs_path from sdcm.remote import LOCALRUNNER from sdcm.utils.decorators import timeout as timeout_decor, retrying from sdcm.utils.docker_utils import ContainerManager, DockerException, Container from sdcm.wait import wait_for KUBECTL_BIN = "kubectl" HELM_IMAGE = "alpine/helm:3.3.4" KUBECTL_TIMEOUT = 300 # seconds K8S_CONFIGS_PATH_SCT = sct_abs_path("sdcm/k8s_configs") JSON_PATCH_TYPE = "application/json-patch+json" LOGGER = logging.getLogger(__name__) K8S_MEM_CPU_RE = re.compile('^([0-9]+)([a-zA-Z]*)$') K8S_MEM_CONVERSION_MAP = { 'e': lambda x: x * 1073741824, 'p': lambda x: x * 1048576, 't': lambda x: x * 1024, 'g': lambda x: x, 'm': lambda x: x / 1024, 'k': lambda x: x / 1048576, '': lambda x: x, } K8S_CPU_CONVERSION_MAP = {
import yaml from sdcm import sct_abs_path, cluster from sdcm.utils.k8s import ApiCallRateLimiter, TokenUpdateThread from sdcm.utils.gce_utils import GcloudContextManager from sdcm.cluster_k8s import KubernetesCluster, ScyllaPodCluster, BaseScyllaPodContainer, CloudK8sNodePool from sdcm.cluster_k8s.iptables import IptablesPodIpRedirectMixin, IptablesClusterOpsMixin from sdcm.cluster_gce import MonitorSetGCE GKE_API_CALL_RATE_LIMIT = 5 # ops/s GKE_API_CALL_QUEUE_SIZE = 1000 # ops GKE_URLLIB_RETRY = 5 # How many times api request is retried before reporting failure GKE_URLLIB_BACKOFF_FACTOR = 0.1 LOADER_CLUSTER_CONFIG = sct_abs_path("sdcm/k8s_configs/gke-loaders.yaml") LOGGER = logging.getLogger(__name__) class GkeNodePool(CloudK8sNodePool): k8s_cluster: 'GkeCluster' # pylint: disable=too-many-arguments def __init__(self, k8s_cluster: 'KubernetesCluster', name: str, num_nodes: int, instance_type: str, disk_size: int = None, disk_type: str = None, image_type: str = 'UBUNTU',
class LongevityPipelineTest:
    """
    This class takes pipeline parameters and produces hydra test cases parameters
    as a tuple in hydra_test_cases
    """
    sct_base_path = sct_abs_path("").rstrip('/')
    test_id = '11111111-1111-1111-1111-111111111111'
    runner_arg = '--execute-on-runner 1.1.1.1 '

    def __init__(self, backend: str, runner: bool, aws_creds: bool, gce_creds: bool):
        # Each pipeline scenario gets its own temporary HOME directory.
        self.home_dir = tempfile.mkdtemp()
        self.backend = backend
        self.runner = runner
        self.aws_creds = aws_creds
        self.gce_creds = gce_creds
        self.home_dir_postfix = ''

    def set_test_home_dir_postfix(self, postfix: str):
        # Sub-directory of home_dir used by a single pipeline step.
        self.home_dir_postfix = postfix

    @staticmethod
    def docker_run_prefix(runner: bool):
        # Regex prefix of the expected `docker run` command line.
        if runner:
            return "docker -H ssh://[email protected] run --rm -it --privileged -h SCT-CONTAINER.*"
        return "docker run --rm -it --privileged .*"

    def sct_path(self, runner: bool):
        # SCT checkout directory as seen inside the docker container.
        if runner:
            return '/home/ubuntu/scylla-cluster-tests'
        return self.sct_base_path

    def expected(self, runner):
        """Patterns that MUST appear in the hydra dry-run output."""
        docker_run_prefix = self.docker_run_prefix(runner)
        sct_dir = self.sct_path(runner)
        expected = (
            f'{self.sct_base_path}/get-qa-ssh-keys.sh',
            re.compile(
                'sudo chown -R [^: ]+:[^ ]+ ~/sct-results &> /dev/null [|][|] true'
            ),
            re.compile(
                f"sudo chown -R [^: ]+:[^ ]+ \"{self.sct_base_path}/sct-results\" &> /dev/null [|][|] true"
            ),
            re.compile(
                f"{docker_run_prefix} -l TestId=11111111-1111-1111-1111-111111111111"
            ),
            re.compile(
                f"{docker_run_prefix} -v /var/run:/run -v {sct_dir}:{sct_dir}"
            ),
            re.compile(
                f"{docker_run_prefix} --group-add 1 --group-add 2 --group-add 3"
            ),
        )
        if not runner:
            return expected
        expected += (
            self.pattern_remove_known_key,
            self.pattern_rsync_aws_token,
            self.pattern_rsync_sct_dir,
        )
        if 'gce' not in self.backend:
            return expected
        # Should sync gcloud token if backend is gcloud
        expected += (self.pattern_gcloud_token_sync, )
        return expected

    def not_expected(self, runner: bool):
        """Patterns that must NOT appear in the hydra dry-run output."""
        # Hydra arguments should not leak
        not_expected = ('--dry-run-hydra', '--execute-on-runner')
        if not runner:
            # No sync if no runner is used
            return not_expected + (
                self.pattern_remove_known_key,
                'rsync -ar -e ssh -o StrictHostKeyChecking=no --delete ',
            )
        if self.is_gce_or_gke:
            return not_expected
        # Should not sync gcloud token if backend is not gcloud
        return not_expected + (self.pattern_gcloud_token_sync, )

    @cached_property
    def pattern_gcloud_token_sync(self):  # pylint: disable=no-self-use
        return "rsync -ar -e 'ssh -o StrictHostKeyChecking=no' --delete " \
               "~/.google_libcloud_auth.skilled-adapter-452 [email protected]:/home/ubuntu/"

    @cached_property
    def pattern_remove_known_key(self):  # pylint: disable=no-self-use
        return 'ssh-keygen -R "1.1.1.1" || true'

    @cached_property
    def pattern_rsync_aws_token(self):  # pylint: disable=no-self-use
        return "rsync -ar -e 'ssh -o StrictHostKeyChecking=no' --delete ~/.aws [email protected]:/home/ubuntu/"

    @cached_property
    def pattern_rsync_sct_dir(self):
        return f"rsync -ar -e 'ssh -o StrictHostKeyChecking=no' --delete " \
               f"{self.sct_base_path} [email protected]:/home/ubuntu/"

    @cached_property
    def step_name_prefix(self):
        # Unique test-case name prefix derived from the scenario parameters.
        return f'{self.backend}_{self.runner}_{self.aws_creds}_{self.gce_creds}'

    @cached_property
    def show_conf_cmd(self):
        return f'output-conf -b {self.backend}'

    @cached_property
    def create_runner_cmd(self):
        return f'create-runner-instance --cloud-provider {self.backend} --region eu-north-1 --availability-zone a ' \
               f'--test-id {self.test_id} --duration 465'

    @cached_property
    def run_test_cmd(self):
        # Command line of the hydra it self
        if self.runner:
            return f'{self.runner_arg}{self.run_test_cmd_docker}'
        return self.run_test_cmd_docker

    @cached_property
    def run_test_cmd_docker(self):
        # Command line that should be run in the docker
        return f'run-test longevity_test.LongevityTest.test_custom_time --backend {self.backend}'

    @cached_property
    def collect_logs_cmd(self):
        # Command line of the hydra it self
        if self.runner:
            return f'{self.runner_arg}{self.collect_logs_cmd_docker}'
        return self.collect_logs_cmd_docker

    @cached_property
    def collect_logs_cmd_docker(self):  # pylint: disable=no-self-use
        # Command line that should be run in the docker
        return 'collect-logs'

    @cached_property
    def clean_resources_cmd(self):
        # Command line of the hydra it self
        if self.runner:
            return f'{self.runner_arg}{self.clean_resources_cmd_docker}'
        return self.clean_resources_cmd_docker

    @cached_property
    def clean_resources_cmd_docker(self):
        # Command line that should be run in the docker
        return f'clean-resources --post-behavior --test-id {self.test_id}'

    @cached_property
    def send_email_cmd(self):
        # Command line of the hydra it self
        if self.runner:
            return f'{self.runner_arg}{self.send_email_cmd_docker}'
        return self.send_email_cmd_docker

    @cached_property
    def send_email_cmd_docker(self):  # pylint: disable=no-self-use
        # Command line that should be run in the docker
        return 'send-email --test-status SUCCESS --start-time 1627268929 --email-recipients [email protected]'

    @property
    def test_home_dir(self) -> str:
        return os.path.join(self.home_dir, self.home_dir_postfix)

    @cached_property
    def is_gce_or_gke(self) -> bool:
        return 'gce' in self.backend or 'gke' in self.backend

    @property
    def get_longevity_env(self) -> Dict[str, str]:
        # Environment variables passed to the hydra invocation under test.
        longevity_end = {
            'SCT_TEST_ID': self.test_id,
            'HOME': self.test_home_dir,
            'USER': '******',
            'SCT_CLUSTER_BACKEND': self.backend,
            'SCT_CONFIG_FILES': '["/jenkins/slave/workspace/siren-tests/longevity-tests/cloud-longevity-small-data-'
                                'set-1h-gcp/siren-tests/sct_plugin/configurations/scylla_cloud_nemesis_small_set.yaml"]'
        }
        return longevity_end

    @property
    def before_runner_not_expected(self):
        # All steps before runner is created should not have runner related steps
        return self.not_expected(runner=False)

    @property
    def before_runner_expected(self):
        # All steps before runner is created should not have runner related steps
        return self.expected(runner=False)

    @property
    def before_runner_docker_run_prefix(self):
        # All steps before runner is created do not have runner parameter
        return self.docker_run_prefix(runner=False)

    @property
    def after_runner_not_expected(self):
        # All steps before runner is created should not have runner related steps
        return self.not_expected(runner=self.runner)

    @property
    def after_runner_expected(self):
        # All steps before runner is created should not have runner related steps
        return self.expected(runner=self.runner)

    @property
    def after_runner_docker_run_prefix(self):
        # All steps before runner is created do not have runner parameter
        return self.docker_run_prefix(runner=self.runner)

    @property
    def test_tmp_dir(self) -> HydraTestCaseTmpDir:
        return HydraTestCaseTmpDir(home_dir=self.test_home_dir,
                                   aws_creds=self.aws_creds,
                                   gce_creds=self.gce_creds)

    @property
    def test_case_show_conf(self):
        # Returns (test-case params, tmp dir) for the "show configuration" step.
        self.set_test_home_dir_postfix('show_conf')
        return HydraTestCaseParams(
            name=f'{self.step_name_prefix}_show_conf',
            cmd=self.show_conf_cmd,
            expected=[
                *self.before_runner_expected,
                re.compile(
                    f"{self.before_runner_docker_run_prefix} eval './sct.py {self.show_conf_cmd}'"
                )
            ],
            not_expected=[*self.before_runner_not_expected],
            return_code=0,
            env=self.get_longevity_env,
        ), self.test_tmp_dir

    @property
    def test_case_create_runner(self):
        self.set_test_home_dir_postfix('create_runner')
        return HydraTestCaseParams(
            name=f'{self.step_name_prefix}_create_runner',
            cmd=self.create_runner_cmd,
            expected=[
                *self.before_runner_expected,
                re.compile(
                    f"{self.before_runner_docker_run_prefix} eval './sct.py {self.create_runner_cmd}'"
                )
            ],
            not_expected=[*self.before_runner_not_expected],
            return_code=0,
            env=self.get_longevity_env), self.test_tmp_dir

    @property
    def test_case_run_test(self):
        self.set_test_home_dir_postfix('run_test')
        return HydraTestCaseParams(
            name=f'{self.step_name_prefix}_run_test',
            cmd=self.run_test_cmd,
            expected=[
                *self.after_runner_expected,
                re.compile(
                    f"{self.after_runner_docker_run_prefix} eval './sct.py {self.run_test_cmd_docker}'"
                )
            ],
            not_expected=[*self.after_runner_not_expected],
            return_code=0,
            env=self.get_longevity_env), self.test_tmp_dir

    @property
    def test_case_collect_logs(self):
        self.set_test_home_dir_postfix('collect_logs')
        return HydraTestCaseParams(
            name=f'{self.step_name_prefix}_collect_logs',
            cmd=self.collect_logs_cmd,
            expected=[
                *self.after_runner_expected,
                re.compile(
                    f"{self.after_runner_docker_run_prefix} eval './sct.py {self.collect_logs_cmd_docker}'"
                )
            ],
            not_expected=[*self.after_runner_not_expected],
            return_code=0,
            env=self.get_longevity_env), self.test_tmp_dir

    @property
    def test_case_clean_resources(self):
        self.set_test_home_dir_postfix('clean_resources')
        return HydraTestCaseParams(
            name=f'{self.step_name_prefix}_clean_resources',
            cmd=self.clean_resources_cmd,
            expected=[
                *self.after_runner_expected,
                re.compile(
                    f"{self.after_runner_docker_run_prefix} "
                    f"eval './sct.py {self.clean_resources_cmd_docker}'")
            ],
            not_expected=[*self.after_runner_not_expected],
            return_code=0,
            env=self.get_longevity_env), self.test_tmp_dir

    @property
    def test_case_send_email(self):
        self.set_test_home_dir_postfix('send_email')
        return HydraTestCaseParams(
            name=f'{self.step_name_prefix}_send_email',
            cmd=self.send_email_cmd,
            expected=[
                *self.after_runner_expected,
                re.compile(
                    f"{self.after_runner_docker_run_prefix} eval './sct.py {self.send_email_cmd_docker}'"
                )
            ],
            not_expected=[*self.after_runner_not_expected],
            return_code=0,
            env=self.get_longevity_env), self.test_tmp_dir

    @property
    def hydra_test_cases(self) -> Iterable[Tuple[HydraTestCaseParams, HydraTestCaseTmpDir]]:
        """
        Creates list of test case parameters that represent steps in longevity pipeline steps
        """
        # NOTE(review): test_case_send_email is defined above but not included
        #               here — presumably intentional; confirm against callers.
        return (self.test_case_show_conf, self.test_case_create_runner,
                self.test_case_run_test, self.test_case_collect_logs,
                self.test_case_clean_resources)
from functools import cached_property

import kubernetes as k8s
from urllib3.util.retry import Retry

from sdcm import sct_abs_path
from sdcm.remote import LOCALRUNNER
from sdcm.utils.decorators import timeout as timeout_decor
from sdcm.utils.docker_utils import ContainerManager, DockerException, Container

KUBECTL_BIN = "kubectl"
HELM_IMAGE = "alpine/helm:3.3.4"

KUBECTL_TIMEOUT = 300  # seconds

K8S_CONFIGS = sct_abs_path("sdcm/k8s_configs")

JSON_PATCH_TYPE = "application/json-patch+json"

LOGGER = logging.getLogger(__name__)
# The kubernetes REST client is very chatty at DEBUG level; cap it at INFO.
logging.getLogger("kubernetes.client.rest").setLevel(logging.INFO)


class ApiLimiterClient(k8s.client.ApiClient):
    """k8s API client that throttles calls through an external rate limiter."""
    # Rate limiter instance; expected to be attached by the cluster code
    # before the client is used — TODO confirm against callers.
    _api_rate_limiter: 'ApiCallRateLimiter' = None

    def call_api(self, *args, **kwargs):
        # Block until the limiter grants a slot, then delegate to the real client.
        if self._api_rate_limiter:
            self._api_rate_limiter.wait()
        return super().call_api(*args, **kwargs)
from textwrap import dedent from functools import cached_property from invoke.exceptions import UnexpectedExit from sdcm import sct_abs_path, cluster, cluster_gce from sdcm.remote import LOCALRUNNER from sdcm.remote.kubernetes_cmd_runner import KubernetesCmdRunner from sdcm.cluster_k8s import KubernetesCluster, BasePodContainer, ScyllaPodCluster from sdcm.cluster_k8s.iptables import IptablesPodPortsRedirectMixin, IptablesClusterOpsMixin from sdcm.utils.k8s import KubernetesOps from sdcm.utils.common import get_free_port, wait_for_port from sdcm.utils.decorators import retrying from sdcm.utils.docker_utils import ContainerManager SCYLLA_CLUSTER_CONFIG = sct_abs_path("sdcm/k8s_configs/cluster-minikube.yaml") KUBECTL_PROXY_PORT = 8001 KUBECTL_PROXY_CONTAINER = "auto_ssh:kubectl_proxy" SCYLLA_POD_EXPOSED_PORTS = [ 3000, 9042, 9180, ] LOGGER = logging.getLogger(__name__) class MinikubeOps: @classmethod def setup_minikube(cls, node: cluster.BaseNode, kubectl_version: str, minikube_version: str) -> None:
BaseScyllaPodContainer, ScyllaPodCluster, COMMON_CONTAINERS_RESOURCES, LOCAL_MINIO_DIR, LOCAL_PROVISIONER_DIR, OPERATOR_CONTAINERS_RESOURCES, SCYLLA_MANAGER_AGENT_RESOURCES, SCYLLA_MANAGER_AGENT_VERSION_IN_SCYLLA_MANAGER, SCYLLA_VERSION_IN_SCYLLA_MANAGER, ) from sdcm.utils.k8s import TokenUpdateThread, HelmValues from sdcm.utils.decorators import retrying from sdcm.utils.docker_utils import docker_hub_login from sdcm.utils import version_utils SRC_APISERVER_AUDIT_POLICY = sct_abs_path( "sdcm/k8s_configs/local-kind/audit-policy.yaml") DST_APISERVER_AUDIT_POLICY = "/etc/kubernetes/policies/audit-policy.yaml" DST_APISERVER_AUDIT_LOG = "/var/log/kubernetes/kube-apiserver-audit.log" CNI_CALICO_CONFIG = sct_abs_path("sdcm/k8s_configs/cni-calico.yaml") CNI_CALICO_VERSION = "v3.23.0" LOGGER = logging.getLogger(__name__) POOL_LABEL_NAME = 'minimal-k8s-nodepool' class MinimalK8SNodePool(CloudK8sNodePool): k8s_cluster: 'LocalKindCluster' def deploy(self) -> None: self.is_deployed = True
class EksCluster(KubernetesCluster, EksClusterCleanupMixin):
    """AWS EKS-backed Kubernetes cluster for SCT."""
    POOL_LABEL_NAME = 'eks.amazonaws.com/nodegroup'
    IS_NODE_TUNING_SUPPORTED = True
    NODE_PREPARE_FILE = sct_abs_path(
        "sdcm/k8s_configs/eks/scylla-node-prepare.yaml")
    pools: Dict[str, EksNodePool]
    short_cluster_name: str

    # pylint: disable=too-many-arguments
    def __init__(self,
                 eks_cluster_version,
                 ec2_security_group_ids,
                 ec2_subnet_ids,
                 ec2_role_arn,
                 credentials,
                 user_prefix,
                 service_ipv4_cidr,
                 vpc_cni_version,
                 nodegroup_role_arn,
                 params=None,
                 cluster_uuid=None,
                 region_name=None):
        super().__init__(user_prefix=user_prefix,
                         cluster_uuid=cluster_uuid,
                         region_name=region_name,
                         params=params)
        self.credentials = credentials
        self.eks_cluster_version = eks_cluster_version
        self.ec2_role_arn = ec2_role_arn
        self.nodegroup_role_arn = nodegroup_role_arn
        self.ec2_security_group_ids = ec2_security_group_ids
        self.ec2_subnet_ids = ec2_subnet_ids
        self.service_ipv4_cidr = service_ipv4_cidr
        self.vpc_cni_version = vpc_cni_version

    @cached_property
    def allowed_labels_on_scylla_node(self) -> list:
        # Pod labels that are allowed to run on Scylla-dedicated K8S nodes.
        allowed_labels_on_scylla_node = [
            ('name', 'node-setup'),
            ('name', 'cpu-policy'),
            ('k8s-app', 'aws-node'),
            ('app', 'local-volume-provisioner'),
            ('k8s-app', 'kube-proxy'),
        ]
        if self.tenants_number > 1:
            allowed_labels_on_scylla_node.append(('app', 'scylla'))
            allowed_labels_on_scylla_node.append(
                ('app.kubernetes.io/name', 'scylla'))
        else:
            allowed_labels_on_scylla_node.append(
                ('scylla/cluster', self.k8s_scylla_cluster_name))
        if self.is_performance_tuning_enabled:
            # NOTE: add performance tuning related pods only if we expect it to be.
            #       When we have tuning disabled it must not exist.
            allowed_labels_on_scylla_node.extend(self.perf_pods_labels)
        return allowed_labels_on_scylla_node

    def create_eks_cluster(self, wait_till_functional=True):
        """Create the EKS control plane (+ vpc-cni addon), optionally waiting for ACTIVE."""
        self.eks_client.create_cluster(
            name=self.short_cluster_name,
            version=self.eks_cluster_version,
            roleArn=self.ec2_role_arn,
            resourcesVpcConfig={
                'securityGroupIds': self.ec2_security_group_ids[0],
                'subnetIds': self.ec2_subnet_ids,
                'endpointPublicAccess': True,
                'endpointPrivateAccess': True,
                'publicAccessCidrs': [
                    '0.0.0.0/0',
                ]
            },
            kubernetesNetworkConfig={
                'serviceIpv4Cidr': self.service_ipv4_cidr
            },
            logging={
                'clusterLogging': [
                    {
                        'types': [
                            'api', 'audit', 'authenticator',
                            'controllerManager', 'scheduler'
                        ],
                        'enabled': True
                    },
                ]
            },
            tags=self.tags,
        )
        self.eks_client.create_addon(clusterName=self.short_cluster_name,
                                     addonName='vpc-cni',
                                     addonVersion=self.vpc_cni_version)
        if wait_till_functional:
            wait_for(
                lambda: self.cluster_status == 'ACTIVE',
                step=60,
                throw_exc=True,
                timeout=1200,
                text=
                f'Waiting till EKS cluster {self.short_cluster_name} become operational'
            )

    @property
    def cluster_info(self) -> dict:
        # Raw 'cluster' description from the EKS API.
        return self.eks_client.describe_cluster(
            name=self.short_cluster_name)['cluster']

    @property
    def cluster_status(self) -> str:
        return self.cluster_info['status']

    def __str__(self):
        return f"{type(self).__name__} {self.name} | Version: {self.eks_cluster_version}"

    def create_token_update_thread(self):
        # Background thread that keeps the kubectl auth token fresh.
        return EksTokenUpdateThread(
            aws_cmd=
            f'aws eks --region {self.region_name} get-token --cluster-name {self.short_cluster_name}',
            kubectl_token_path=self.kubectl_token_path)

    def create_kubectl_config(self):
        LOCALRUNNER.run(
            f'aws eks --region {self.region_name} update-kubeconfig --name {self.short_cluster_name}'
        )

    def deploy(self):
        LOGGER.info("Create EKS cluster `%s'", self.short_cluster_name)
        self.create_eks_cluster()
        LOGGER.info("Patch kubectl config")
        self.patch_kubectl_config()

    def tune_network(self):
        """Tune networking on all nodes of an EKS cluster to reduce number of reserved IPs.

        Set following special EKS-specific env vars to 'aws-node' daemonset:

            WARM_ENI_TARGET (default is 1) - number of 'ready-to-use' additional
                network interfaces which in our case almost always stay idle.
                Setting this one to 0 means we have 1 network interface at start,
                and new ones will be added (expensive API calls) only when needed automatically.
            MINIMUM_IP_TARGET (no default) - minimum number of IP addresses that must be
                dedicated to an EC2 instance. If not set then number of reserved IPs equal
                to number of IP capacity of a network interface.
            WARM_IP_TARGET (no default) - how many unused IP must be kept as 'ready-to-use'.
                if not set then depend on the available IPs of ENIs.

        General idea behind it: reduce number of 'reserved' IP addresses by each of
        EC2 instances used in an EKS cluster.
        Without such tweaking one cluster we create uses more than 300 IP addresses
        and having /22 subnet (<1024 IP addresses) we can run just 3 EKS clusters at once.

        Env vars details:
            https://github.com/aws/amazon-vpc-cni-k8s/blob/master/docs/eni-and-ip-target.md
        IPs per network interface per node type details:
            https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/using-eni.html
        """
        LOGGER.info("Tune network of the EKS cluster")
        env_vars = (
            "WARM_ENI_TARGET=0",
            "MINIMUM_IP_TARGET=8",
            "WARM_IP_TARGET=2",
        )
        self.kubectl(f"set env daemonset aws-node {' '.join(env_vars)}",
                     namespace="kube-system")

    def deploy_node_pool(self, pool: EksNodePool, wait_till_ready=True) -> None:
        self._add_pool(pool)
        if pool.is_deployed:
            return
        if wait_till_ready:
            pool.deploy_and_wait_till_ready()
        else:
            pool.deploy()

    def resize_node_pool(self, name: str, num_nodes: int, wait_till_ready=True) -> None:
        self.pools[name].resize(num_nodes)
        if wait_till_ready:
            self.pools[name].wait_for_nodes_readiness()

    def destroy(self):
        # Cleanup is delegated to the mixin; then stop the token refresher.
        EksClusterCleanupMixin.destroy(self)
        self.stop_token_update_thread()

    def get_ec2_instance_by_id(self, instance_id):
        return boto3.resource(
            'ec2', region_name=self.region_name).Instance(id=instance_id)

    def deploy_scylla_manager(self, pool_name: str = None) -> None:
        # Scylla Manager needs an S3-compatible backend; MinIO provides it in-cluster.
        self.deploy_minio_s3_backend()
        super().deploy_scylla_manager(pool_name=pool_name)

    def _get_all_instance_ids(self):
        # providerID looks like 'aws:///<az>/<instance-id>'; keep only the id.
        cmd = "get node --no-headers -o custom-columns=:.spec.providerID"
        return [
            name.split("/")[-1] for name in self.kubectl(cmd).stdout.split()
        ]

    def set_tags(self, instance_ids):
        if isinstance(instance_ids, str):
            instance_ids = [instance_ids]
        boto3.client('ec2', region_name=self.region_name).create_tags(
            Resources=instance_ids,
            Tags=[{
                "Key": key,
                "Value": value
            } for key, value in self.tags.items()],
        )

    def set_tags_on_all_instances(self):
        # NOTE: EKS doesn't apply nodeGroup's tags to nodes.
        #       So, we add it for each node explicitly.
        self.set_tags(self._get_all_instance_ids())

    def set_security_groups(self, instance):
        for network_interface in instance.network_interfaces:
            security_groups = [g["GroupId"] for g in network_interface.groups]
            # NOTE: Make API call only if it is needed
            if self.ec2_security_group_ids[0][0] not in security_groups:
                security_groups.append(self.ec2_security_group_ids[0][0])
                network_interface.modify_attribute(Groups=security_groups)

    def set_security_groups_on_all_instances(self):
        # NOTE: EKS doesn't apply nodeGroup's security groups to nodes
        #       So, we add it for each network interface of a node explicitly.
        for instance_id in self._get_all_instance_ids():
            self.set_security_groups(self.get_ec2_instance_by_id(instance_id))

    def deploy_scylla_cluster(self, *args, **kwargs) -> None:  # pylint: disable=signature-differs
        super().deploy_scylla_cluster(*args, **kwargs)
        self.set_security_groups_on_all_instances()
        self.set_tags_on_all_instances()

    def deploy_monitoring_cluster(self, *args, **kwargs) -> None:  # pylint: disable=signature-differs
        super().deploy_monitoring_cluster(*args, **kwargs)
        self.set_security_groups_on_all_instances()
        self.set_tags_on_all_instances()

    def upgrade_kubernetes_platform(self) -> str:
        """Upgrade control plane and scylla-related node groups one minor version up."""
        upgrade_version = f"1.{int(self.eks_cluster_version.split('.')[1]) + 1}"
        # Upgrade control plane (API, scheduler, manager and so on ...)
        LOGGER.info("Upgrading K8S control plane to the '%s' version",
                    upgrade_version)
        self.eks_client.update_cluster_version(
            name=self.short_cluster_name,
            version=upgrade_version,
        )
        # NOTE: sleep for some small period of time to make sure that cluster's status changes
        #       before we poll for it's readiness.
        #       Upgrade takes dozens of minutes, so, 'sleep'ing for some time won't cause
        #       time waste by calling 'time.sleep' function.
        #       5sec is not enough in some cases. 20sec must be sufficient
        time.sleep(20)
        self.eks_client.get_waiter('cluster_active').wait(
            name=self.short_cluster_name,
            WaiterConfig={
                'Delay': 30,
                'MaxAttempts': 120
            },
        )
        # Upgrade scylla-related node pools
        for node_pool in (self.AUXILIARY_POOL_NAME, self.SCYLLA_POOL_NAME):
            LOGGER.info("Upgrading '%s' node pool to the '%s' version",
                        node_pool, upgrade_version)
            self.eks_client.update_nodegroup_version(
                clusterName=self.short_cluster_name,
                nodegroupName=node_pool,
                version=upgrade_version,
            )
            time.sleep(20)
            self.eks_client.get_waiter('nodegroup_active').wait(
                clusterName=self.short_cluster_name,
                nodegroupName=node_pool,
                WaiterConfig={
                    # NOTE: one Scylla K8S node upgrade takes about 10 minutes
                    #       So, wait timeout will be different for different number of DB nodes
                    #       Set it bigger than the expected value to avoid possible fluctuations
                    'Delay': 30,
                    'MaxAttempts': self.params.get("n_db_nodes") * 30,
                },
            )
        return upgrade_version
class GkeCluster(KubernetesCluster):
    """Google GKE-backed Kubernetes cluster for SCT."""
    AUXILIARY_POOL_NAME = 'default-pool'  # This is default pool that is deployed with the cluster
    POOL_LABEL_NAME = 'cloud.google.com/gke-nodepool'
    IS_NODE_TUNING_SUPPORTED = True
    NODE_PREPARE_FILE = sct_abs_path("sdcm/k8s_configs/gke/scylla-node-prepare.yaml")
    pools: Dict[str, GkeNodePool]

    # pylint: disable=too-many-arguments
    def __init__(self,
                 gke_cluster_version,
                 gke_k8s_release_channel,
                 gce_disk_size,
                 gce_disk_type,
                 gce_network,
                 services,
                 gce_instance_type='n1-standard-2',
                 user_prefix=None,
                 params=None,
                 gce_datacenter=None,
                 cluster_uuid=None,
                 n_nodes=2,
                 ):
        super().__init__(
            params=params,
            cluster_uuid=cluster_uuid,
            user_prefix=user_prefix
        )
        self.gke_cluster_version = gke_cluster_version
        self.gke_k8s_release_channel = gke_k8s_release_channel.strip()
        self.gce_disk_type = gce_disk_type
        self.gce_disk_size = gce_disk_size
        self.gce_network = gce_network
        self.gce_services = services
        self.gce_instance_type = gce_instance_type
        self.n_nodes = n_nodes
        self.gce_project = services[0].project
        self.gce_user = services[0].key
        # First datacenter entry is used as the GCE zone.
        self.gce_zone = gce_datacenter[0]
        self.gke_cluster_created = False
        # Throttle GKE API calls; GKE enforces its own server-side rate limits.
        self.api_call_rate_limiter = ApiCallRateLimiter(
            rate_limit=GKE_API_CALL_RATE_LIMIT,
            queue_size=GKE_API_CALL_QUEUE_SIZE,
            urllib_retry=GKE_URLLIB_RETRY,
            urllib_backoff_factor=GKE_URLLIB_BACKOFF_FACTOR,
        )
        self.api_call_rate_limiter.start()

    @cached_property
    def allowed_labels_on_scylla_node(self) -> list:
        # Pod labels that are allowed to run on Scylla-dedicated K8S nodes.
        allowed_labels_on_scylla_node = [
            ('app', 'xfs-formatter'),
            ('app', 'local-volume-provisioner'),
            ('k8s-app', 'fluentbit-gke'),
            ('k8s-app', 'gke-metrics-agent'),
            ('component', 'kube-proxy'),
            ('k8s-app', 'gcp-compute-persistent-disk-csi-driver'),
        ]
        if self.tenants_number > 1:
            allowed_labels_on_scylla_node.append(('app.kubernetes.io/name', 'scylla'))
            allowed_labels_on_scylla_node.append(('app', 'scylla'))
        else:
            allowed_labels_on_scylla_node.append(('scylla/cluster', self.k8s_scylla_cluster_name))
        if self.is_performance_tuning_enabled:
            # NOTE: add performance tuning related pods only if we expect it to be.
            #       When we have tuning disabled it must not exist.
            allowed_labels_on_scylla_node.extend(self.perf_pods_labels)
        return allowed_labels_on_scylla_node

    def __str__(self):
        return f"{type(self).__name__} {self.name} | Zone: {self.gce_zone} | Version: {self.gke_cluster_version}"

    def deploy(self):
        """Create the GKE cluster, register the default pool and set up RBAC."""
        LOGGER.info("Create GKE cluster `%s' with %d node(s) in %s",
                    self.short_cluster_name, self.n_nodes, self.AUXILIARY_POOL_NAME)
        tags = ",".join(f"{key}={value}" for key, value in self.tags.items())
        with self.gcloud as gcloud:
            # NOTE: only static K8S release channel supports disabling of autoupgrade
            gcloud.run(f"container --project {self.gce_project} clusters create {self.short_cluster_name}"
                       f" --no-enable-basic-auth"
                       f" --zone {self.gce_zone}"
                       f" --cluster-version {self.gke_cluster_version}"
                       f"{' --release-channel ' + self.gke_k8s_release_channel if self.gke_k8s_release_channel else ''}"
                       f" --network {self.gce_network}"
                       f" --num-nodes {self.n_nodes}"
                       f" --machine-type {self.gce_instance_type}"
                       f" --image-type UBUNTU"
                       f" --disk-type {self.gce_disk_type}"
                       f" --disk-size {self.gce_disk_size}"
                       f" --enable-stackdriver-kubernetes"
                       f"{'' if self.gke_k8s_release_channel else ' --no-enable-autoupgrade'}"
                       f"{'' if self.gke_k8s_release_channel else ' --no-enable-autorepair'}"
                       f" --metadata {tags}")
        self.patch_kubectl_config()
        # Register the auto-created default pool as already deployed.
        self.deploy_node_pool(GkeNodePool(
            name=self.AUXILIARY_POOL_NAME,
            num_nodes=self.n_nodes,
            disk_size=self.gce_disk_size,
            disk_type=self.gce_disk_type,
            k8s_cluster=self,
            instance_type=self.gce_instance_type,
            is_deployed=True
        ))
        LOGGER.info("Setup RBAC for GKE cluster `%s'", self.name)
        self.kubectl("create clusterrolebinding cluster-admin-binding --clusterrole cluster-admin "
                     f"--user {self.gce_user}")

    @cached_property
    def gcloud(self) -> GcloudContextManager:  # pylint: disable=no-self-use
        return self.test_config.tester_obj().localhost.gcloud

    def deploy_node_pool(self, pool: GkeNodePool, wait_till_ready=True) -> None:
        self._add_pool(pool)
        if pool.is_deployed:
            return
        LOGGER.info("Create %s pool with %d node(s) in GKE cluster `%s'",
                    pool.name, pool.num_nodes, self.name)
        if wait_till_ready:
            # Pause the rate limiter while the pool is converging.
            with self.api_call_rate_limiter.pause:
                pool.deploy_and_wait_till_ready()
                self.api_call_rate_limiter.wait_till_api_become_stable(self)
        else:
            pool.deploy()

    def wait_all_node_pools_to_be_ready(self):
        with self.api_call_rate_limiter.pause:
            super().wait_all_node_pools_to_be_ready()
            self.api_call_rate_limiter.wait_till_api_become_stable(self)

    def resize_node_pool(self, name: str, num_nodes: int) -> None:
        with self.api_call_rate_limiter.pause:
            self.pools[name].resize(num_nodes)
            self.api_call_rate_limiter.wait_till_api_become_stable(self)

    def get_instance_group_name_for_pool(self, pool_name: str, default=None) -> str:
        """Return the managed-instance-group name backing ``pool_name``.

        Falls back to ``default`` (if provided) on any lookup failure,
        otherwise re-raises as RuntimeError.
        """
        try:
            group_link = yaml.safe_load(
                self.gcloud.run(
                    f'container node-pools describe {pool_name} '
                    f'--zone {self.gce_zone} --project {self.gce_project} '
                    f'--cluster {self.short_cluster_name}')
            ).get('instanceGroupUrls')[0]
            return group_link.split('/')[-1]
        except Exception as exc:
            if default is not None:
                return default
            raise RuntimeError(f"Can't get instance group name due to the: {exc}") from exc

    def delete_instance_that_belong_to_instance_group(self, group_name: str, instance_name: str):
        self.gcloud.run(f'compute instance-groups managed delete-instances {group_name} '
                        f'--zone={self.gce_zone} --instances={instance_name}')

    def create_token_update_thread(self):
        # Background thread that keeps the kubectl auth token fresh.
        return GcloudTokenUpdateThread(self.gcloud, self.kubectl_token_path)

    def create_kubectl_config(self):
        self.gcloud.run(f"container clusters get-credentials {self.short_cluster_name} --zone {self.gce_zone}")

    def destroy(self):
        self.api_call_rate_limiter.stop()
        self.stop_token_update_thread()

    def deploy_scylla_manager(self, pool_name: str = None) -> None:
        # Scylla Manager needs an S3-compatible backend; MinIO provides it in-cluster.
        self.deploy_minio_s3_backend()
        super().deploy_scylla_manager(pool_name=pool_name)

    # NOTE: blocked by https://github.com/scylladb/scylla-operator/issues/760
    def upgrade_kubernetes_platform(self) -> str:
        """Upgrade control plane and scylla-related node pools one minor version up."""
        # NOTE: 'self.gke_cluster_version' can be like 1.21.3-gke.N or 1.21
        upgrade_version = f"1.{int(self.gke_cluster_version.split('.')[1]) + 1}"
        with self.gcloud as gcloud:
            # Upgrade control plane (API, scheduler, manager and so on ...)
            LOGGER.info("Upgrading K8S control plane to the '%s' version", upgrade_version)
            gcloud.run(f"container clusters upgrade {self.short_cluster_name} "
                       f"--master --quiet --project {self.gce_project} --zone {self.gce_zone} "
                       f"--cluster-version {upgrade_version}")
            # Upgrade scylla-related node pools
            for node_pool in (self.AUXILIARY_POOL_NAME, self.SCYLLA_POOL_NAME):
                LOGGER.info("Upgrading '%s' node pool to the '%s' version",
                            node_pool, upgrade_version)
                # NOTE: one node upgrade takes about 10 minutes
                gcloud.run(f"container clusters upgrade {self.short_cluster_name} "
                           f"--quiet --project {self.gce_project} --zone {self.gce_zone} "
                           f"--node-pool={node_pool}")
        return upgrade_version