def validate_cluster(splunk, record):
    """Validate that the cluster described by `record` is usable.

    Performs three checks, in order:
      1. Connectivity to the Kubernetes API server.
      2. Presence (and expected version) of the Splunk Operator CRD.
      3. TCP reachability of every configured indexer server
         ("<server>:<port>,...").

    Raises:
        errors.ApplicationError: for expected, user-facing failures.
        Exception: carrying the full traceback for unexpected errors.
    """
    from kubernetes import client
    import socket  # hoisted out of the per-server loop below

    # 1) Verify we can reach the Kubernetes API at all.
    try:
        connection_stanza = splunklib.client.Stanza(splunk, "", skip_refresh=True)
        connection_stanza.refresh(
            state=splunklib.data.record({"content": record}))
        config = create_client_configuration(connection_stanza)
        api_client = client.ApiClient(config)
        version_api = client.VersionApi(api_client)
        version_api.get_code()
    except errors.ApplicationError as e:
        raise Exception("Could not connect to Kubernetes.\n\n%s" % e)
    except Exception:
        raise Exception(traceback.format_exc())

    # 2) Verify the Splunk Operator is installed and at the expected version.
    try:
        extensions_api = client.ApiextensionsV1beta1Api(api_client)
        crd = extensions_api.read_custom_resource_definition(
            "standalones.enterprise.splunk.com")
        if crd.spec.version != "v1alpha2":
            raise errors.ApplicationError(
                "Unexpected Splunk Operator version: %s" % crd.spec.version)
    except client.rest.ApiException as e:
        # 404 on the CRD means the operator is simply not installed.
        if e.status == 404:
            raise errors.ApplicationError("Could not find Splunk Operator.")
        raise
    except errors.ApplicationError:
        raise
    except Exception:
        raise Exception(traceback.format_exc())

    # 3) Verify every indexer server entry parses and accepts a TCP connect.
    try:
        indexer_server_count = 0
        for server in record.indexer_server.split(","):
            components = server.split(":")
            if len(components) != 2:
                raise errors.ApplicationError(
                    "Expect format \"<server>:<port>,...\" for indexer server. Got \"%s\"" % (server))
            hostname = components[0].strip()
            port = int(components[1].strip())
            s = socket.socket()
            try:
                s.connect((hostname, port))
            except Exception as e:
                raise errors.ApplicationError(
                    "Could not connect to indexer server \"%s\": %s" % (server, e))
            finally:
                s.close()
            indexer_server_count += 1
        if indexer_server_count == 0:
            # Fixed typo in user-facing message ("misssing" -> "missing").
            raise errors.ApplicationError("Invalid or missing indexer server")
    except errors.ApplicationError:
        raise
    except Exception:
        raise Exception(traceback.format_exc())
def create_deployment(splunk, kubernetes, stack_id, stack_config, cluster_config):
    """Deploy all Splunk components for a stack and wait until they are ready.

    For "standalone" deployments a single Standalone instance is created.
    For "distributed" deployments a license master (when local), cluster
    master, indexer cluster, and search head cluster are created, then each
    is awaited in turn. Finally load balancers are created and verified.

    Raises:
        errors.ApplicationError: for an unknown deployment type.
    """
    core_api = kuberneteslib.CoreV1Api(kubernetes)
    # NOTE: removed an unused CustomObjectsApi local; the per-component
    # deploy helpers create their own API clients.
    if stack_config["license_master_mode"] == "local":
        licensemasters.deploy_license(core_api, stack_id, stack_config)
    if stack_config["deployment_type"] == "standalone":
        standalones.deploy(splunk, kubernetes, stack_id, stack_config, cluster_config)
        standalones.wait_until_ready(splunk, kubernetes, stack_id, stack_config)
    elif stack_config["deployment_type"] == "distributed":
        if stack_config["license_master_mode"] == "local":
            licensemasters.deploy(splunk, kubernetes, stack_id, stack_config, cluster_config)
            deployed_license_master = True
        else:
            deployed_license_master = False
        cluster_master.deploy(splunk, kubernetes, stack_id, stack_config, cluster_config)
        indexer_cluster.deploy(splunk, kubernetes, stack_id, stack_config, cluster_config)
        search_head_cluster.deploy(splunk, kubernetes, stack_id, stack_config, cluster_config)
        # Wait for the masters before the peers that depend on them.
        cluster_master.wait_until_ready(splunk, kubernetes, stack_id, stack_config)
        indexer_cluster.wait_until_ready(splunk, kubernetes, stack_id, stack_config)
        search_head_cluster.wait_until_ready(splunk, kubernetes, stack_id, stack_config)
        if deployed_license_master:
            licensemasters.wait_until_ready(splunk, kubernetes, stack_id, stack_config)
    else:
        raise errors.ApplicationError("Unknown deployment type: '%s'" % (stack_config["deployment_type"]))
    create_load_balancers(core_api, stack_id, stack_config)
    verify_load_balancers_completed(core_api, stack_id, stack_config)
def deploy(splunk, kubernetes, stack_id, stack_config, cluster_config):
    """Create a LicenseMaster custom resource for the stack (idempotent).

    Returns early if a license master already exists for this stack.
    The enterprise license file is mounted from a config map and pointed
    at via `licenseUrl`.
    """
    # Idempotency guard: do nothing if the resource already exists.
    license_master = get(splunk, kubernetes, stack_id, stack_config)
    if license_master:
        return
    core_api = kuberneteslib.CoreV1Api(kubernetes)
    license_config_map = get_license_config_map(core_api, stack_id, stack_config)
    custom_objects_api = kuberneteslib.CustomObjectsApi(kubernetes)
    # No extra Splunk conf needed here; serialized into "defaults" below.
    splunk_defaults = {
        "splunk": {
            "conf": {
            }
        }
    }
    spec = {
        "image": cluster_config.default_splunk_image,
        "imagePullPolicy": "Always",
        "resources": {
            "requests": {
                "memory": stack_config["memory_per_instance"],
                "cpu": stack_config["cpu_per_instance"],
            },
            "limits": {
                "memory": stack_config["memory_per_instance"],
                "cpu": stack_config["cpu_per_instance"],
            },
        },
        "etcStorage": '%sGi' % stack_config["etc_storage_in_gb"],
        "varStorage": '%sGi' % stack_config["other_var_storage_in_gb"],
        "defaults": yaml.dump(splunk_defaults),
        # Mount the license config map and point the operator at the file.
        "volumes": [{
            "name": "licenses",
            "configMap": {
                "name": license_config_map.metadata.name,
            }
        }],
        "licenseUrl": "/mnt/licenses/enterprise.lic",
    }
    if cluster_config.node_selector:
        # node_selector is "key=value,key=value"; translate into a
        # nodeAffinity requirement with one matchExpression per label.
        labels = cluster_config.node_selector.split(",")
        match_expressions = []
        for label in labels:
            if label:
                kv = label.split("=")
                if len(kv) != 2:
                    raise errors.ApplicationError(
                        "invalid node selector format (%s)" % cluster_config.node_selector)
                match_expressions.append({
                    "key": kv[0],
                    "operator": "In",
                    "values": [kv[1]],
                })
        spec["affinity"] = {
            "nodeAffinity": {
                "requiredDuringSchedulingIgnoredDuringExecution": {
                    "nodeSelectorTerms": [
                        {
                            "matchExpressions": match_expressions,
                        }
                    ],
                }
            }
        }
    # NOTE(review): membership test on cluster_config via `in` — presumably a
    # record/dict-like object; confirm it supports __contains__.
    if "storage_class" in cluster_config and cluster_config.storage_class:
        spec["storageClassName"] = cluster_config.storage_class
    # NOTE(review): this uses API version "v1" while the standalone deploy in
    # this file uses "v1alpha2" — confirm both are intended.
    custom_objects_api.create_namespaced_custom_object(
        group="enterprise.splunk.com",
        version="v1",
        namespace=stack_config["namespace"],
        plural="licensemasters",
        body={
            "apiVersion": "enterprise.splunk.com/v1",
            "kind": "LicenseMaster",
            "metadata": {
                "name": stack_id,
                # Finalizer lets the operator clean up PVCs on deletion.
                "finalizers": ["enterprise.splunk.com/delete-pvc"],
                "labels": {
                    "app": "saas",
                    "stack_id": stack_id,
                }
            },
            "spec": spec,
        },
    )
def deploy(splunk, kubernetes, stack_id, stack_config, cluster_config):
    """Create an IndexerCluster custom resource for the stack (idempotent).

    Returns early if an indexer cluster already exists for this stack.
    Configures a tcp:9996 input for data ingestion and, depending on
    license_master_mode, either points at a remote license master URL or
    references the stack-local LicenseMaster resource.
    """
    # Idempotency guard: do nothing if the resource already exists.
    indexer_cluster = get(splunk, kubernetes, stack_id, stack_config)
    if indexer_cluster:
        return
    splunk_defaults = {
        "splunk": {
            "conf": {
                "inputs": {
                    "content": {
                        "tcp://:9996": {
                            "connection_host": "dns",
                            "source": "tcp:9996",
                        }
                    }
                }
            }
        }
    }
    spec = {
        "replicas": int(stack_config["indexer_count"]),
        "clusterMasterRef": {
            "name": stack_id
        },
        "image": cluster_config.default_splunk_image,
        "imagePullPolicy": "Always",
        "resources": {
            "requests": {
                "memory": stack_config["memory_per_instance"],
                "cpu": stack_config["cpu_per_instance"],
            },
            "limits": {
                "memory": stack_config["memory_per_instance"],
                "cpu": stack_config["cpu_per_instance"],
            },
        },
        "etcStorage": '%sGi' % stack_config["etc_storage_in_gb"],
        "varStorage": '%sGi' % stack_config["indexer_var_storage_in_gb"],
    }
    if stack_config["license_master_mode"] == "remote":
        # BUGFIX: previously this used splunk_defaults.update({"splunk": ...}),
        # which replaced the entire "splunk" subtree and silently discarded
        # the tcp:9996 input defaults above. Assign only the nested key
        # (matching how the standalone deploy merges remote license config).
        splunk_defaults["splunk"]["license_master_url"] = \
            cluster_config.license_master_url
    # splunk_defaults is always non-empty here; serialize unconditionally.
    spec["defaults"] = yaml.dump(splunk_defaults)
    if cluster_config.node_selector:
        # node_selector is "key=value,key=value"; translate into a
        # nodeAffinity requirement with one matchExpression per label.
        labels = cluster_config.node_selector.split(",")
        match_expressions = []
        for label in labels:
            if label:
                kv = label.split("=")
                if len(kv) != 2:
                    raise errors.ApplicationError(
                        "invalid node selector format (%s)" % cluster_config.node_selector)
                match_expressions.append({
                    "key": kv[0],
                    "operator": "In",
                    "values": [kv[1]],
                })
        spec["affinity"] = {
            "nodeAffinity": {
                "requiredDuringSchedulingIgnoredDuringExecution": {
                    "nodeSelectorTerms": [{
                        "matchExpressions": match_expressions,
                    }],
                }
            }
        }
    if "storage_class" in cluster_config and cluster_config.storage_class:
        spec["storageClassName"] = cluster_config.storage_class
    if stack_config["license_master_mode"] == "local":
        # Reference the stack-local LicenseMaster resource by name.
        spec["licenseMasterRef"] = {"name": stack_id}
    custom_objects_api = kuberneteslib.CustomObjectsApi(kubernetes)
    custom_objects_api.create_namespaced_custom_object(
        group="enterprise.splunk.com",
        version="v1",
        namespace=stack_config["namespace"],
        plural="indexerclusters",
        body={
            "apiVersion": "enterprise.splunk.com/v1",
            "kind": "IndexerCluster",
            "metadata": {
                "name": stack_id,
                # Finalizer lets the operator clean up PVCs on deletion.
                "finalizers": ["enterprise.splunk.com/delete-pvc"],
                "labels": {
                    "app": "saas",
                    "stack_id": stack_id,
                }
            },
            "spec": spec,
        },
    )
def create_client_configuration(connection_stanza):
    """Build a kubernetes client Configuration from a connection stanza.

    Supports three auth modes:
      - "aws-iam":    EKS cluster; generates a presigned STS token
                      (aws-iam-authenticator scheme, "k8s-aws-v1." prefix).
      - "cert-key":   client certificate/key (and optional cluster CA),
                      all provided base64-encoded in the stanza.
      - "user-token": static bearer token.

    Returns:
        client.Configuration ready to construct an ApiClient.

    Raises:
        errors.ApplicationError: when decoding/applying cert material fails.
        Exception: for an unrecognized auth mode.
    """
    config = client.Configuration()
    if connection_stanza.auth_mode == "aws-iam":
        # https://github.com/kubernetes-sigs/aws-iam-authenticator
        # https://aws.amazon.com/de/about-aws/whats-new/2019/05/amazon-eks-simplifies-kubernetes-cluster-authentication/
        # https://github.com/aws/aws-cli/blob/develop/awscli/customizations/eks/get_token.py
        # get cluster info
        import boto3
        eks_client = boto3.client(
            'eks',
            region_name=connection_stanza.aws_region_name,
            aws_access_key_id=connection_stanza.aws_access_key_id,
            aws_secret_access_key=connection_stanza.aws_secret_access_key)
        cluster_info = eks_client.describe_cluster(
            name=connection_stanza.aws_cluster_name)
        aws_cluster_ca = cluster_info['cluster']['certificateAuthority']['data']
        aws_cluster_url = cluster_info['cluster']['endpoint']
        # get authentication token: presign an STS GetCallerIdentity request
        # tagged with the cluster name, then base64url-encode it.
        from botocore.signers import RequestSigner  # pylint: disable=import-error
        STS_TOKEN_EXPIRES_IN = 60
        session = boto3.Session(
            region_name=connection_stanza.aws_region_name,
            aws_access_key_id=connection_stanza.aws_access_key_id,
            aws_secret_access_key=connection_stanza.aws_secret_access_key)
        sts_client = session.client('sts')
        service_id = sts_client.meta.service_model.service_id
        token_signer = RequestSigner(service_id,
                                     connection_stanza.aws_region_name,
                                     'sts', 'v4',
                                     session.get_credentials(),
                                     session.events)
        signed_url = token_signer.generate_presigned_url(
            {
                'method': 'GET',
                'url':
                'https://sts.{}.amazonaws.com/?Action=GetCallerIdentity&Version=2011-06-15'
                .format(connection_stanza.aws_region_name),
                'body': {},
                'headers': {
                    'x-k8s-aws-id': connection_stanza.aws_cluster_name
                },
                'context': {}
            },
            region_name=connection_stanza.aws_region_name,
            expires_in=STS_TOKEN_EXPIRES_IN,
            operation_name='')
        base64_url = base64.urlsafe_b64encode(
            signed_url.encode('utf-8')).decode('utf-8')
        # BUGFIX: the prefix and the encoded URL were split across a broken
        # statement boundary, leaving auth_token as just the bare prefix.
        # Join them in a single expression (padding '=' must be stripped per
        # the aws-iam-authenticator token format).
        auth_token = 'k8s-aws-v1.' + re.sub(r'=*', '', base64_url)
        config.host = aws_cluster_url
        ca_data = base64.standard_b64decode(aws_cluster_ca)
        fp = tempfile.NamedTemporaryFile(delete=False)  # TODO when to delete?
        fp.write(ca_data)
        fp.close()
        config.ssl_ca_cert = fp.name
        config.api_key["authorization"] = auth_token
        config.api_key_prefix["authorization"] = "Bearer"
    elif connection_stanza.auth_mode == "cert-key":
        config.host = connection_stanza.cluster_url
        if connection_stanza.client_cert:
            try:
                cert_data = base64.standard_b64decode(
                    connection_stanza.client_cert)
                fp = tempfile.NamedTemporaryFile(delete=False)  # TODO when to delete?
                fp.write(cert_data)
                fp.close()
                config.cert_file = fp.name
            except Exception as e:
                raise errors.ApplicationError(
                    "Error applying cluster cert: %s" % (e))
        if connection_stanza.client_key:
            try:
                key_data = base64.standard_b64decode(
                    connection_stanza.client_key)
                fp = tempfile.NamedTemporaryFile(delete=False)  # TODO when to delete?
                fp.write(key_data)
                fp.close()
                config.key_file = fp.name
            except Exception as e:
                raise errors.ApplicationError(
                    "Error applying cluster key: %s" % (e))
        if connection_stanza.cluster_ca:
            try:
                cluster_ca_data = base64.standard_b64decode(
                    connection_stanza.cluster_ca)
                fp = tempfile.NamedTemporaryFile(delete=False)  # TODO when to delete?
                fp.write(cluster_ca_data)
                fp.close()
                config.ssl_ca_cert = fp.name
            except Exception as e:
                raise errors.ApplicationError("Error applying cluster ca: %s" % (e))
        # NOTE(review): verify_ssl is disabled even when a CA is supplied —
        # confirm this is intentional.
        config.verify_ssl = False
    elif connection_stanza.auth_mode == "user-token":
        config.host = connection_stanza.cluster_url
        config.api_key["authorization"] = connection_stanza.user_token
        config.api_key_prefix["authorization"] = "Bearer"
        config.verify_ssl = False
    else:
        raise Exception("invalid auth mode '%s'" % connection_stanza.auth_mode)
    return config
def run_cases(splunk, test_id, test):
    """Advance every case of a performance test through its state machine.

    Each case moves WAITING -> STARTING -> RUNNING -> STOPPING -> FINISHED.
    One state transition is performed per call; errors.RetryOperation is
    raised to have the caller re-invoke later (it acts as "yield and retry",
    not as a failure).
    """
    cases_collection = get_performance_test_cases_collection(splunk)
    cases = cases_collection.query(
        query=json.dumps({
            "test_id": test_id,
        }),
        sort="index:1",
    )
    for case in cases:
        case_id = case["_key"]
        status = case["status"]
        if status == CASE_FINISHED:
            continue
        if status == CASE_WAITING:
            # Kick off stack creation for this case via the SaaS REST API.
            result = splunk.post(
                "saas/stacks", **{
                    "deployment_type": case["deployment_type"],
                    "indexer_count": case["indexer_count"],
                    "search_head_count": case["search_head_count"],
                    "cpu_per_instance": case["cpu_per_instance"],
                    "etc_storage_in_gb": case["etc_storage_in_gb"],
                    "other_var_storage_in_gb": case["other_var_storage_in_gb"],
                    "indexer_var_storage_in_gb": case["indexer_var_storage_in_gb"],
                    "memory_per_instance": case["memory_per_instance"],
                    "title": "Performance Test %s and Case %s" % (test_id, case_id),
                    "cluster": test["cluster"],
                })
            response = json.loads(result.body.read())["entry"][0]["content"]
            stack_id = response["stack_id"]
            logging.info("created stack %s for test case %s" % (stack_id, case_id))
            case.update({
                "status": CASE_STARTING,
                "stack_id": stack_id,
            })
            cases_collection.update(case_id, json.dumps(case))
            raise errors.RetryOperation(
                "waiting for stack %s in test case %s starting up ..."
                % (stack_id, case_id))
        elif status == CASE_STARTING:
            # Wait until the stack reports CREATED, then deploy the data and
            # search generators against it.
            stack_id = case["stack_id"]
            stack = splunk.get("saas/stack/%s" % stack_id)
            stack_status = json.loads(
                stack.body.read())["entry"][0]["content"]["status"]
            if stack_status == stacks.CREATING:
                raise errors.RetryOperation()
            if stack_status != stacks.CREATED:
                raise Exception("unexpected stack status: %s" % stack_status)
            logging.info("successfully created stack %s for case %s" %
                         (stack_id, case_id))
            stack_config = stacks.get_stack_config(splunk, stack_id)
            kube_client = clusters.create_client(splunk, stack_config["cluster"])
            cluster_config = clusters.get_cluster(splunk, test["cluster"])
            # Parse "key=value,key=value" node selector into a plain dict
            # for the generator pods.
            node_selector_labels = cluster_config["node_selector"].split(",")
            node_selector_for_generators = {}
            for label in node_selector_labels:
                if label:
                    kv = label.split("=")
                    if len(kv) != 2:
                        raise errors.ApplicationError(
                            "invalid node selector format (%s)" % cluster_config.node_selector)
                    node_selector_for_generators[kv[0]] = kv[1]
            apps_api = kubernetes.AppsV1Api(kube_client)
            core_api = kubernetes.CoreV1Api(kube_client)
            # Data is sent to the standalone instance or to the indexers,
            # depending on deployment type.
            if stack_config["deployment_type"] == "standalone":
                indexer_hosts = services.get_load_balancer_hosts(
                    core_api, stack_id, services.standalone_role,
                    stack_config["namespace"])
            elif stack_config["deployment_type"] == "distributed":
                indexer_hosts = services.get_load_balancer_hosts(
                    core_api, stack_id, services.indexer_role,
                    stack_config["namespace"])
            else:
                raise Exception("unexpected deployment type: %s" %
                                stack_config["deployment_type"])
            # Convert the target daily volume into per-generator KB/s and
            # size the generator deployment accordingly (~100 KB/s each).
            data_volume_in_gb_per_day = int(case["data_volume_in_gb_per_day"])
            logging.debug("data_volume_in_gb_per_day=%s" %
                          (data_volume_in_gb_per_day))
            data_volume_in_gb_per_second = data_volume_in_gb_per_day / 24 / 60 / 60
            logging.debug("data_volume_in_gb_per_second=%s" %
                          (data_volume_in_gb_per_second))
            data_volume_in_kb_per_second = data_volume_in_gb_per_second * 1024 * 1024
            logging.debug("data_volume_in_kb_per_second=%s" %
                          (data_volume_in_kb_per_second))
            max_kb_per_second_per_data_generator = 100
            logging.debug("max_kb_per_second_per_data_generator=%s" %
                          (max_kb_per_second_per_data_generator))
            number_of_data_generators = max(
                int(data_volume_in_kb_per_second /
                    max_kb_per_second_per_data_generator) + 1, 1)
            logging.debug("number_of_data_generators=%s" %
                          (number_of_data_generators))
            data_volume_in_kb_per_second_per_data_generator = data_volume_in_kb_per_second / \
                number_of_data_generators
            logging.debug(
                "data_volume_in_kb_per_second_per_data_generator=%s" %
                (data_volume_in_kb_per_second_per_data_generator))
            deployment_name = "datagen-%s" % (stack_id)
            # Idempotency: only create the datagen deployment if absent.
            try:
                apps_api.read_namespaced_deployment(
                    deployment_name, namespace=stack_config["namespace"])
                data_gen_deployment_already_exists = True
            except kubernetes.rest.ApiException as e:
                if e.status != 404:
                    raise
                data_gen_deployment_already_exists = False
            if not data_gen_deployment_already_exists:
                apps_api.create_namespaced_deployment(
                    namespace=stack_config["namespace"],
                    body=kubernetes.V1Deployment(
                        metadata=kubernetes.V1ObjectMeta(
                            name=deployment_name,
                            namespace=stack_config["namespace"],
                            labels={
                                "app": "datagen",
                                "test": test_id,
                                "case": case_id,
                            },
                        ),
                        spec=kubernetes.V1DeploymentSpec(
                            replicas=number_of_data_generators,
                            selector=kubernetes.V1LabelSelector(
                                match_labels={
                                    "name": "datagen-%s" % (stack_id),
                                }),
                            template=kubernetes.V1PodTemplateSpec(
                                metadata=kubernetes.V1ObjectMeta(labels={
                                    "name": "datagen-%s" % (stack_id),
                                    "app": "datagen",
                                    "test": test_id,
                                    "case": case_id,
                                    "stack": stack_id,
                                }, ),
                                spec=kubernetes.V1PodSpec(
                                    containers=[
                                        kubernetes.V1Container(
                                            name="datagen",
                                            image="blackhypothesis/splunkeventgenerator:latest",
                                            resources=kubernetes.V1ResourceRequirements(
                                                requests={
                                                    "memory": "10Mi",
                                                    "cpu": "500m",
                                                },
                                                limits={
                                                    "memory": "50Mi",
                                                    "cpu": "1",
                                                },
                                            ),
                                            env=[
                                                # Target hosts, ";"-separated,
                                                # each on the tcp:9996 input.
                                                kubernetes.V1EnvVar(
                                                    name="DSTHOST",
                                                    value=";".join(
                                                        map(
                                                            lambda host: host + ":9996",
                                                            indexer_hosts)),
                                                ),
                                                kubernetes.V1EnvVar(
                                                    name="KB_S",
                                                    value="%s" % data_volume_in_kb_per_second_per_data_generator,
                                                ),
                                            ],
                                        ),
                                    ],
                                    node_selector=node_selector_for_generators,
                                ),
                            ),
                        ),
                    ),
                )
                logging.info("created %s data generators for case %s" %
                             (number_of_data_generators, case_id))
            # Searches go to the standalone instance or the search heads.
            if stack_config["deployment_type"] == "standalone":
                search_head_host = services.get_load_balancer_hosts(
                    core_api, stack_id, services.standalone_role,
                    stack_config["namespace"])[0]
            elif stack_config["deployment_type"] == "distributed":
                search_head_host = services.get_load_balancer_hosts(
                    core_api, stack_id, services.search_head_role,
                    stack_config["namespace"])[0]
            else:
                raise Exception("unexpected deployment type: %s" %
                                stack_config["deployment_type"])
            # Size the search generator deployment (~5 searches/s each).
            searches_per_day = int(case["searches_per_day"])
            logging.debug("searches_per_day=%s" % (searches_per_day))
            searches_per_second = searches_per_day / 24 / 60 / 60
            logging.debug("searches_per_second=%s" % (searches_per_second))
            max_searches_per_second_per_generator = 5
            logging.debug("max_searches_per_second_per_generator=%s" %
                          (max_searches_per_second_per_generator))
            number_of_search_generators = max(
                int(searches_per_second /
                    max_searches_per_second_per_generator) + 1, 1)
            logging.debug("number_of_search_generators=%s" %
                          (number_of_search_generators))
            searches_per_second_per_generator = searches_per_second / \
                number_of_search_generators
            logging.debug("searches_per_second_per_generator=%s" %
                          (searches_per_second_per_generator))
            search_template = case["search_template"]
            if searches_per_day > 0 and search_template:
                deployment_name = "searchgen-%s" % (stack_id)
                # Idempotency: only create the searchgen deployment if absent.
                try:
                    apps_api.read_namespaced_deployment(
                        deployment_name, namespace=stack_config["namespace"])
                    search_gen_deployment_already_exists = True
                except kubernetes.rest.ApiException as e:
                    if e.status != 404:
                        raise
                    search_gen_deployment_already_exists = False
                if not search_gen_deployment_already_exists:
                    admin_password = instances.get_admin_password(
                        core_api, stack_id, stack_config,
                        services.search_head_role)
                    apps_api.create_namespaced_deployment(
                        namespace=stack_config["namespace"],
                        body=kubernetes.V1Deployment(
                            metadata=kubernetes.V1ObjectMeta(
                                name=deployment_name,
                                namespace=stack_config["namespace"],
                                labels={
                                    "app": "searchgen",
                                    "test": test_id,
                                    "case": case_id,
                                },
                            ),
                            spec=kubernetes.V1DeploymentSpec(
                                replicas=number_of_search_generators,
                                selector=kubernetes.V1LabelSelector(
                                    match_labels={
                                        "name": "searchgen-%s" % (stack_id),
                                    }),
                                template=kubernetes.V1PodTemplateSpec(
                                    metadata=kubernetes.V1ObjectMeta(labels={
                                        "name": "searchgen-%s" % (stack_id),
                                        "app": "searchgen",
                                        "test": test_id,
                                        "case": case_id,
                                        "stack": stack_id,
                                    }, ),
                                    spec=kubernetes.V1PodSpec(
                                        containers=[
                                            kubernetes.V1Container(
                                                name="searchgen",
                                                image="hovu96/splunk-searchgen:latest",
                                                resources=kubernetes.V1ResourceRequirements(
                                                    requests={
                                                        "memory": "10Mi",
                                                        "cpu": "500m",
                                                    },
                                                    limits={
                                                        "memory": "50Mi",
                                                        "cpu": "1",
                                                    },
                                                ),
                                                env=[
                                                    kubernetes.V1EnvVar(
                                                        name="SEARCH_GEN_SPL",
                                                        value=search_template,
                                                    ),
                                                    kubernetes.V1EnvVar(
                                                        name="SEARCH_GEN_HOST",
                                                        value=search_head_host,
                                                    ),
                                                    kubernetes.V1EnvVar(
                                                        name="SEARCH_GEN_USER",
                                                        value="admin",
                                                    ),
                                                    kubernetes.V1EnvVar(
                                                        name="SEARCH_GEN_PASSWORD",
                                                        value=admin_password,
                                                    ),
                                                    kubernetes.V1EnvVar(
                                                        name="SEARCH_GEN_SPS",
                                                        value="%s" % searches_per_second_per_generator,
                                                    ),
                                                ],
                                            ),
                                        ],
                                        node_selector=node_selector_for_generators,
                                    ),
                                ),
                            ),
                        ),
                    )
                    logging.info("created %s search generators for case %s" %
                                 (number_of_search_generators, case_id))
            else:
                logging.info("no search generators started")
            case.update({
                "status": CASE_RUNNING,
                "time_started_running": time.time(),
            })
            cases_collection.update(case_id, json.dumps(case))
            raise errors.RetryOperation("running test case %s ..."
                                        % case_id)
        elif status == CASE_RUNNING:
            # Let the case run for test["run_duration"] minutes.
            time_started_running = case["time_started_running"]
            time_now = time.time()
            seconds_running_to_far = time_now - time_started_running
            target_run_duration = test["run_duration"]
            logging.debug(
                "time_started_running=%s time_now=%s seconds_running_to_far=%s"
                % (time_started_running, time_now, seconds_running_to_far))
            if seconds_running_to_far < (target_run_duration * 60):
                logging.debug("still waiting")
                raise errors.RetryOperation()
            logging.info("time elapsed for case %s" % (case_id))
            case.update({
                "status": CASE_STOPPING,
                "time_finished_running": time.time(),
            })
            cases_collection.update(case_id, json.dumps(case))
            raise errors.RetryOperation("stopping test case %s" % case_id)
        elif status == CASE_STOPPING:
            # Tear down the case's resources, then mark it finished.
            stop_case(splunk, test_id, case_id, case)
            case.update({
                "status": CASE_FINISHED,
            })
            cases_collection.update(case_id, json.dumps(case))
            logging.info("finished test case %s" % case_id)
        else:
            logging.error("run_cases: unexpected status for test case %s: %s"
                          % (case_id, status))
            raise errors.RetryOperation()
def deploy(splunk, kubernetes, stack_id, stack_config, cluster_config):
    """Create a Standalone custom resource for the stack (idempotent).

    Returns early if a standalone already exists. Configures a tcp:9996
    input and, depending on license_master_mode, either points at a remote
    license master or mounts a local license file from a config map.
    """
    # Idempotency guard: do nothing if the resource already exists.
    standalone = get(splunk, kubernetes, stack_id, stack_config)
    if standalone:
        return
    core_api = kuberneteslib.CoreV1Api(kubernetes)
    custom_objects_api = kuberneteslib.CustomObjectsApi(kubernetes)
    # NOTE(review): second existence check via label selector in addition to
    # get() above — presumably covers resources get() does not find; confirm
    # whether both checks are needed.
    standalones = custom_objects_api.list_namespaced_custom_object(
        group="enterprise.splunk.com",
        version="v1alpha2",
        plural="standalones",
        namespace=stack_config["namespace"],
        label_selector="app=saas,stack_id=%s" % stack_id,
    )["items"]
    if len(standalones):
        return
    splunk_defaults = {
        "splunk": {
            "conf": {
                "inputs": {
                    "content": {
                        "tcp://:9996": {
                            "connection_host": "dns",
                            "source": "tcp:9996",
                        }
                    }
                }
            }
        }
    }
    if stack_config["license_master_mode"] == "remote":
        # Merge remote license-master settings without clobbering the
        # inputs conf configured above.
        splunk_defaults["splunk"]["conf"]["server"] = {
            "content": {
                "license": {
                    "master_uri": cluster_config.license_master_url,
                },
                "general": {
                    "pass4SymmKey": cluster_config.license_master_pass4symmkey,
                },
            }
        }
    spec = {
        "image": cluster_config.default_splunk_image,
        "imagePullPolicy": "Always",
        "resources": {
            "requests": {
                "memory": stack_config["memory_per_instance"],
                "cpu": stack_config["cpu_per_instance"],
            },
            "limits": {
                "memory": stack_config["memory_per_instance"],
                "cpu": stack_config["cpu_per_instance"],
            },
        },
        "etcStorage": '%sGi' % stack_config["etc_storage_in_gb"],
        # NOTE(review): a standalone's varStorage is sized from
        # indexer_var_storage_in_gb, not other_var_storage_in_gb — confirm
        # this is intentional.
        "varStorage": '%sGi' % stack_config["indexer_var_storage_in_gb"],
        "defaults": yaml.dump(splunk_defaults),
    }
    if stack_config["license_master_mode"] == "local":
        # Mount the license config map and point the operator at the file.
        license_config_map = licensemasters.get_license_config_map(core_api, stack_id, stack_config)
        spec.update({
            "volumes": [{
                "name": "licenses",
                "configMap": {
                    "name": license_config_map.metadata.name,
                }
            }],
            "licenseUrl": "/mnt/licenses/enterprise.lic"
        })
    if cluster_config.node_selector:
        # node_selector is "key=value,key=value"; translate into a
        # nodeAffinity requirement with one matchExpression per label.
        labels = cluster_config.node_selector.split(",")
        match_expressions = []
        for label in labels:
            if label:
                kv = label.split("=")
                if len(kv) != 2:
                    raise errors.ApplicationError(
                        "invalid node selector format (%s)" % cluster_config.node_selector)
                match_expressions.append({
                    "key": kv[0],
                    "operator": "In",
                    "values": [kv[1]],
                })
        spec["affinity"] = {
            "nodeAffinity": {
                "requiredDuringSchedulingIgnoredDuringExecution": {
                    "nodeSelectorTerms": [
                        {
                            "matchExpressions": match_expressions,
                        }
                    ],
                }
            }
        }
    if "storage_class" in cluster_config and cluster_config.storage_class:
        spec["storageClassName"] = cluster_config.storage_class
    custom_objects_api.create_namespaced_custom_object(
        group="enterprise.splunk.com",
        version="v1alpha2",
        namespace=stack_config["namespace"],
        plural="standalones",
        body={
            "apiVersion": "enterprise.splunk.com/v1alpha2",
            "kind": "Standalone",
            "metadata": {
                "name": stack_id,
                # Finalizer lets the operator clean up PVCs on deletion.
                "finalizers": ["enterprise.splunk.com/delete-pvc"],
                "labels": {
                    "app": "saas",
                    "stack_id": stack_id,
                }
            },
            "spec": spec,
        },
    )