def verify_load_balancer_completed(core_api, stack_id, stack_config, role):
    hosts = services.get_load_balancer_hosts(
        core_api, stack_id, role, stack_config["namespace"])
    if not hosts:
        raise errors.RetryOperation(
            "waiting for %s load balancer hostname" % role)
    # make sure every load balancer hostname resolves before trying to connect
    # getaddrinfo(host, port, 0, SOCK_STREAM)
    import socket
    for host in hosts:
        try:
            socket.gethostbyname(host)
        except socket.error:
            raise errors.RetryOperation(
                "waiting for %s ingress connection (cannot resolve hostname)" % role)
    if role in (services.standalone_role, services.deployer_role,
                services.search_head_role, services.cluster_master_role):
        try:
            instances.create_client(core_api, stack_id, stack_config, role)
        except ssl.SSLEOFError:
            raise errors.RetryOperation(
                "waiting for %s ingress connection (SSL protocol error)" % role)
        except TimeoutError:
            raise errors.RetryOperation(
                "waiting for %s ingress connection (timeout)" % role)
def push_deployer_bundle(core_api, stack_id, stack_config):
    search_head_hostnames = services.get_load_balancer_hosts(
        core_api, stack_id, services.search_head_role,
        stack_config["namespace"])
    if len(search_head_hostnames) == 0:
        raise errors.RetryOperation("waiting for hostname for search heads ...")
    search_head_hostname = search_head_hostnames[0]
    if is_sh_cluster_restart_in_progress(core_api, stack_id, stack_config):
        raise errors.RetryOperation(
            "waiting for SH cluster restart process to complete ...")
    # push the configuration bundle from the deployer to the search heads
    service = instances.create_client(
        core_api, stack_id, stack_config, services.deployer_role)
    service.post(
        "apps/deploy",
        target="https://%s:8089" % (search_head_hostname),
        action="all",
        advertising="true",
        force="true",
    )
    logging.info("pushed SH deployer bundle")
    if is_sh_cluster_restart_in_progress(core_api, stack_id, stack_config):
        raise errors.RetryOperation(
            "waiting for SH cluster restart process to complete ...")
def wait_until_ready(splunk, kubernetes, stack_id, stack_config):
    indexer_cluster = get(splunk, kubernetes, stack_id, stack_config)
    if not indexer_cluster:
        raise Exception("could not find indexer cluster")
    if "status" not in indexer_cluster:
        raise errors.RetryOperation("waiting for cluster master status")
    status = indexer_cluster["status"]
    target_indexer_count = int(stack_config["indexer_count"])
    ready_replicas = status["readyReplicas"]
    if target_indexer_count != ready_replicas:
        raise errors.RetryOperation(
            "waiting for target number of indexers (expected %s, got %s)" % (
                target_indexer_count,
                ready_replicas,
            ))
    cluster_master_phase = status["clusterMasterPhase"]
    if cluster_master_phase != "Ready":
        raise errors.RetryOperation(
            "waiting for cluster master to become ready (currently it's in %s phase)"
            % (cluster_master_phase))
    phase = status["phase"]
    if phase != "Ready":
        raise errors.RetryOperation(
            "waiting for indexer cluster to become ready (currently it's in %s phase)"
            % (phase))
def wait_until_ready(splunk, kubernetes, stack_id, stack_config):
    license_master = get(splunk, kubernetes, stack_id, stack_config)
    if not license_master:
        raise Exception("could not find license master")
    if "status" not in license_master:
        raise errors.RetryOperation("waiting for license master status")
    status = license_master["status"]
    phase = status["phase"]
    if phase != "Ready":
        raise errors.RetryOperation(
            "waiting for license master to become ready (currently it's in %s phase)"
            % (phase))
def wait_until_ready(splunk, kubernetes, stack_id, stack_config):
    standalone = get(splunk, kubernetes, stack_id, stack_config)
    if not standalone:
        raise Exception("could not find standalone")
    if "status" not in standalone:
        raise errors.RetryOperation("waiting for standalone status")
    status = standalone["status"]
    phase = status["phase"]
    if phase != "Ready":
        raise errors.RetryOperation(
            "waiting for standalone to become ready (currently it's in %s phase)"
            % (phase))
def stop_cases(splunk, test_id, test):
    cases_collection = get_performance_test_cases_collection(splunk)
    cases = cases_collection.query(
        query=json.dumps({
            "test_id": test_id,
        }),
        sort="index:1",
    )
    for case in cases:
        case_id = case["_key"]
        status = case["status"]
        if case.get("stopped"):
            continue
        if status == CASE_WAITING:
            pass
        elif status in (CASE_STARTING, CASE_RUNNING, CASE_STOPPING):
            stop_case(splunk, test_id, case_id, case)
            logging.info("stopped test case %s" % case_id)
        elif status == CASE_FINISHED:
            pass
        else:
            logging.error(
                "stop_cases: unexpected status for test case %s: %s" %
                (case_id, status))
            raise errors.RetryOperation()
        case.update({"stopped": True})
        cases_collection.update(case_id, json.dumps(case))
def wait_until_ready(splunk, kubernetes, stack_id, stack_config):
    search_head_cluster = get(splunk, kubernetes, stack_id, stack_config)
    if not search_head_cluster:
        raise Exception("could not find search head cluster")
    if "status" not in search_head_cluster:
        raise errors.RetryOperation("waiting for search head cluster status")
    status = search_head_cluster["status"]
    # example status:
    # captain: ...
    # captainReady: true
    # deployerPhase: Ready
    # initialized: true
    # maintenanceMode: false
    # members:
    # - active_historical_search_count: 0
    #   active_realtime_search_count: 0
    #   adhoc_searchhead: false
    #   is_registered: true
    #   name: ...
    #   status: Up
    # minPeersJoined: true
    # phase: Ready
    # readyReplicas: 3
    # replicas: 3
    # selector: ...
    target_search_head_count = int(stack_config["search_head_count"])
    ready_replicas = status["readyReplicas"]
    if target_search_head_count != ready_replicas:
        raise errors.RetryOperation(
            "waiting for target number of search heads (expected %s, got %s)" % (
                target_search_head_count,
                ready_replicas,
            ))
    deployer_phase = status["deployerPhase"]
    if deployer_phase != "Ready":
        raise errors.RetryOperation(
            "waiting for deployer to become ready (currently it's in %s phase)"
            % (deployer_phase))
    captain_ready = status["captainReady"]
    if not captain_ready:
        raise errors.RetryOperation("search head cluster captain not yet ready")
    phase = status["phase"]
    if phase != "Ready":
        raise errors.RetryOperation(
            "waiting for search head cluster to become ready (currently it's in %s phase)"
            % (phase))
def stop_case(splunk, test_id, case_id, case):
    if "stack_id" not in case:
        return
    stack_id = case["stack_id"]
    result = splunk.get("saas/stack/%s" % stack_id)
    logging.debug("get stack result: %s" % result)
    response = json.loads(result.body.read())["entry"][0]["content"]
    logging.debug("get stack response: %s" % response)
    stack_status = response["status"]
    if stack_status == stacks.DELETING:
        raise errors.RetryOperation("still in status %s" % stacks.DELETING)
    elif stack_status == stacks.DELETED:
        pass
    else:
        result = splunk.delete("saas/stack/%s" % stack_id)
        response = json.loads(result.body.read())["entry"][0]["content"]
        logging.debug("delete stack result: %s" % response)
        raise errors.RetryOperation("issued deletion of stack %s" % (stack_id))
    stack_config = stacks.get_stack_config(splunk, stack_id)
    kube_client = clusters.create_client(splunk, stack_config["cluster"])
    apps_api = kubernetes.AppsV1Api(kube_client)
    # remove the data generator deployments that belong to this test
    datagen_deployments = apps_api.list_namespaced_deployment(
        namespace=stack_config["namespace"],
        label_selector="app=datagen,test=%s" % test_id,
    ).items
    for deployment in datagen_deployments:
        apps_api.delete_namespaced_deployment(
            name=deployment.metadata.name,
            namespace=stack_config["namespace"],
        )
        logging.debug("deleted deployment %s" % deployment.metadata.name)
    # remove the search generator deployments that belong to this test
    searchgen_deployments = apps_api.list_namespaced_deployment(
        namespace=stack_config["namespace"],
        label_selector="app=searchgen,test=%s" % test_id,
    ).items
    for deployment in searchgen_deployments:
        apps_api.delete_namespaced_deployment(
            name=deployment.metadata.name,
            namespace=stack_config["namespace"],
        )
        logging.debug("deleted deployment %s" % deployment.metadata.name)
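# Test case lifecycle, as implemented by stop_cases above and run_cases below
# (a summary of the status transitions in this module, for orientation):
#
#   CASE_WAITING -> CASE_STARTING -> CASE_RUNNING -> CASE_STOPPING -> CASE_FINISHED
#
# run_cases advances a case one step per invocation and raises
# errors.RetryOperation to be re-scheduled; stop_cases short-circuits
# STARTING/RUNNING/STOPPING cases via stop_case and marks them "stopped".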
def run_cases(splunk, test_id, test):
    cases_collection = get_performance_test_cases_collection(splunk)
    cases = cases_collection.query(
        query=json.dumps({
            "test_id": test_id,
        }),
        sort="index:1",
    )
    for case in cases:
        case_id = case["_key"]
        status = case["status"]
        if status == CASE_FINISHED:
            continue
        if status == CASE_WAITING:
            # create a stack for this test case and wait for it to start up
            result = splunk.post(
                "saas/stacks", **{
                    "deployment_type": case["deployment_type"],
                    "indexer_count": case["indexer_count"],
                    "search_head_count": case["search_head_count"],
                    "cpu_per_instance": case["cpu_per_instance"],
                    "etc_storage_in_gb": case["etc_storage_in_gb"],
                    "other_var_storage_in_gb": case["other_var_storage_in_gb"],
                    "indexer_var_storage_in_gb": case["indexer_var_storage_in_gb"],
                    "memory_per_instance": case["memory_per_instance"],
                    "title": "Performance Test %s and Case %s" % (test_id, case_id),
                    "cluster": test["cluster"],
                })
            response = json.loads(result.body.read())["entry"][0]["content"]
            stack_id = response["stack_id"]
            logging.info("created stack %s for test case %s" % (stack_id, case_id))
            case.update({
                "status": CASE_STARTING,
                "stack_id": stack_id,
            })
            cases_collection.update(case_id, json.dumps(case))
            raise errors.RetryOperation(
                "waiting for stack %s in test case %s to start up ..." %
                (stack_id, case_id))
        elif status == CASE_STARTING:
            stack_id = case["stack_id"]
            stack = splunk.get("saas/stack/%s" % stack_id)
            stack_status = json.loads(
                stack.body.read())["entry"][0]["content"]["status"]
            if stack_status == stacks.CREATING:
                raise errors.RetryOperation()
            if stack_status != stacks.CREATED:
                raise Exception("unexpected stack status: %s" % stack_status)
            logging.info("successfully created stack %s for case %s" %
                         (stack_id, case_id))
            stack_config = stacks.get_stack_config(splunk, stack_id)
            kube_client = clusters.create_client(splunk, stack_config["cluster"])
            cluster_config = clusters.get_cluster(splunk, test["cluster"])
            # parse the "key=value,key=value" node selector into a dict for
            # the data and search generator pods
            node_selector_labels = cluster_config["node_selector"].split(",")
            node_selector_for_generators = {}
            for label in node_selector_labels:
                if label:
                    kv = label.split("=")
                    if len(kv) != 2:
                        raise errors.ApplicationError(
                            "invalid node selector format (%s)" %
                            cluster_config["node_selector"])
                    node_selector_for_generators[kv[0]] = kv[1]
            apps_api = kubernetes.AppsV1Api(kube_client)
            core_api = kubernetes.CoreV1Api(kube_client)
            if stack_config["deployment_type"] == "standalone":
                indexer_hosts = services.get_load_balancer_hosts(
                    core_api, stack_id, services.standalone_role,
                    stack_config["namespace"])
            elif stack_config["deployment_type"] == "distributed":
                indexer_hosts = services.get_load_balancer_hosts(
                    core_api, stack_id, services.indexer_role,
                    stack_config["namespace"])
            else:
                raise Exception("unexpected deployment type: %s" %
                                stack_config["deployment_type"])
            # derive how many data generator pods are needed to produce the
            # configured daily data volume
            data_volume_in_gb_per_day = int(case["data_volume_in_gb_per_day"])
            logging.debug("data_volume_in_gb_per_day=%s" %
                          (data_volume_in_gb_per_day))
            data_volume_in_gb_per_second = data_volume_in_gb_per_day / 24 / 60 / 60
            logging.debug("data_volume_in_gb_per_second=%s" %
                          (data_volume_in_gb_per_second))
            data_volume_in_kb_per_second = data_volume_in_gb_per_second * 1024 * 1024
            logging.debug("data_volume_in_kb_per_second=%s" %
                          (data_volume_in_kb_per_second))
            max_kb_per_second_per_data_generator = 100
            logging.debug("max_kb_per_second_per_data_generator=%s" %
                          (max_kb_per_second_per_data_generator))
            number_of_data_generators = max(
                int(data_volume_in_kb_per_second /
                    max_kb_per_second_per_data_generator) + 1, 1)
            logging.debug("number_of_data_generators=%s" %
                          (number_of_data_generators))
            data_volume_in_kb_per_second_per_data_generator = \
                data_volume_in_kb_per_second / number_of_data_generators
            logging.debug(
                "data_volume_in_kb_per_second_per_data_generator=%s" %
                (data_volume_in_kb_per_second_per_data_generator))
            deployment_name = "datagen-%s" % (stack_id)
            try:
                apps_api.read_namespaced_deployment(
                    deployment_name, namespace=stack_config["namespace"])
                data_gen_deployment_already_exists = True
            except kubernetes.rest.ApiException as e:
                if e.status != 404:
                    raise
                data_gen_deployment_already_exists = False
            if not data_gen_deployment_already_exists:
                apps_api.create_namespaced_deployment(
                    namespace=stack_config["namespace"],
                    body=kubernetes.V1Deployment(
                        metadata=kubernetes.V1ObjectMeta(
                            name=deployment_name,
                            namespace=stack_config["namespace"],
                            labels={
                                "app": "datagen",
                                "test": test_id,
                                "case": case_id,
                            },
                        ),
                        spec=kubernetes.V1DeploymentSpec(
                            replicas=number_of_data_generators,
                            selector=kubernetes.V1LabelSelector(match_labels={
                                "name": "datagen-%s" % (stack_id),
                            }),
                            template=kubernetes.V1PodTemplateSpec(
                                metadata=kubernetes.V1ObjectMeta(labels={
                                    "name": "datagen-%s" % (stack_id),
                                    "app": "datagen",
                                    "test": test_id,
                                    "case": case_id,
                                    "stack": stack_id,
                                }),
                                spec=kubernetes.V1PodSpec(
                                    containers=[
                                        kubernetes.V1Container(
                                            name="datagen",
                                            image="blackhypothesis/splunkeventgenerator:latest",
                                            resources=kubernetes.V1ResourceRequirements(
                                                requests={
                                                    "memory": "10Mi",
                                                    "cpu": "500m",
                                                },
                                                limits={
                                                    "memory": "50Mi",
                                                    "cpu": "1",
                                                },
                                            ),
                                            env=[
                                                kubernetes.V1EnvVar(
                                                    name="DSTHOST",
                                                    value=";".join(
                                                        host + ":9996"
                                                        for host in indexer_hosts),
                                                ),
                                                kubernetes.V1EnvVar(
                                                    name="KB_S",
                                                    value="%s" %
                                                    data_volume_in_kb_per_second_per_data_generator,
                                                ),
                                            ],
                                        ),
                                    ],
                                    node_selector=node_selector_for_generators,
                                ),
                            ),
                        ),
                    ),
                )
                logging.info("created %s data generators for case %s" %
                             (number_of_data_generators, case_id))
            if stack_config["deployment_type"] == "standalone":
                search_head_host = services.get_load_balancer_hosts(
                    core_api, stack_id, services.standalone_role,
                    stack_config["namespace"])[0]
            elif stack_config["deployment_type"] == "distributed":
                search_head_host = services.get_load_balancer_hosts(
                    core_api, stack_id, services.search_head_role,
                    stack_config["namespace"])[0]
            else:
                raise Exception("unexpected deployment type: %s" %
                                stack_config["deployment_type"])
            # derive how many search generator pods are needed to produce the
            # configured search rate
            searches_per_day = int(case["searches_per_day"])
            logging.debug("searches_per_day=%s" % (searches_per_day))
            searches_per_second = searches_per_day / 24 / 60 / 60
            logging.debug("searches_per_second=%s" % (searches_per_second))
            max_searches_per_second_per_generator = 5
            logging.debug("max_searches_per_second_per_generator=%s" %
                          (max_searches_per_second_per_generator))
            number_of_search_generators = max(
                int(searches_per_second /
                    max_searches_per_second_per_generator) + 1, 1)
            logging.debug("number_of_search_generators=%s" %
                          (number_of_search_generators))
            searches_per_second_per_generator = \
                searches_per_second / number_of_search_generators
            logging.debug("searches_per_second_per_generator=%s" %
                          (searches_per_second_per_generator))
            search_template = case["search_template"]
            if searches_per_day > 0 and search_template:
                deployment_name = "searchgen-%s" % (stack_id)
                try:
                    apps_api.read_namespaced_deployment(
                        deployment_name, namespace=stack_config["namespace"])
                    search_gen_deployment_already_exists = True
                except kubernetes.rest.ApiException as e:
                    if e.status != 404:
                        raise
                    search_gen_deployment_already_exists = False
                if not search_gen_deployment_already_exists:
                    admin_password = instances.get_admin_password(
                        core_api, stack_id, stack_config,
                        services.search_head_role)
                    apps_api.create_namespaced_deployment(
                        namespace=stack_config["namespace"],
                        body=kubernetes.V1Deployment(
                            metadata=kubernetes.V1ObjectMeta(
                                name=deployment_name,
                                namespace=stack_config["namespace"],
                                labels={
                                    "app": "searchgen",
                                    "test": test_id,
                                    "case": case_id,
                                },
                            ),
                            spec=kubernetes.V1DeploymentSpec(
                                replicas=number_of_search_generators,
                                selector=kubernetes.V1LabelSelector(match_labels={
                                    "name": "searchgen-%s" % (stack_id),
                                }),
                                template=kubernetes.V1PodTemplateSpec(
                                    metadata=kubernetes.V1ObjectMeta(labels={
                                        "name": "searchgen-%s" % (stack_id),
                                        "app": "searchgen",
                                        "test": test_id,
                                        "case": case_id,
                                        "stack": stack_id,
                                    }),
                                    spec=kubernetes.V1PodSpec(
                                        containers=[
                                            kubernetes.V1Container(
                                                name="searchgen",
                                                image="hovu96/splunk-searchgen:latest",
                                                resources=kubernetes.V1ResourceRequirements(
                                                    requests={
                                                        "memory": "10Mi",
                                                        "cpu": "500m",
                                                    },
                                                    limits={
                                                        "memory": "50Mi",
                                                        "cpu": "1",
                                                    },
                                                ),
                                                env=[
                                                    kubernetes.V1EnvVar(
                                                        name="SEARCH_GEN_SPL",
                                                        value=search_template,
                                                    ),
                                                    kubernetes.V1EnvVar(
                                                        name="SEARCH_GEN_HOST",
                                                        value=search_head_host,
                                                    ),
                                                    kubernetes.V1EnvVar(
                                                        name="SEARCH_GEN_USER",
                                                        value="admin",
                                                    ),
                                                    kubernetes.V1EnvVar(
                                                        name="SEARCH_GEN_PASSWORD",
                                                        value=admin_password,
                                                    ),
                                                    kubernetes.V1EnvVar(
                                                        name="SEARCH_GEN_SPS",
                                                        value="%s" %
                                                        searches_per_second_per_generator,
                                                    ),
                                                ],
                                            ),
                                        ],
                                        node_selector=node_selector_for_generators,
                                    ),
                                ),
                            ),
                        ),
                    )
                    logging.info("created %s search generators for case %s" %
                                 (number_of_search_generators, case_id))
            else:
                logging.info("no search generators started")
            case.update({
                "status": CASE_RUNNING,
                "time_started_running": time.time(),
            })
            cases_collection.update(case_id, json.dumps(case))
            raise errors.RetryOperation("running test case %s ..." % case_id)
        elif status == CASE_RUNNING:
            # let the case run until the configured duration (in minutes) has elapsed
            time_started_running = case["time_started_running"]
            time_now = time.time()
            seconds_running_so_far = time_now - time_started_running
            target_run_duration = test["run_duration"]
            logging.debug(
                "time_started_running=%s time_now=%s seconds_running_so_far=%s" %
                (time_started_running, time_now, seconds_running_so_far))
            if seconds_running_so_far < (target_run_duration * 60):
                logging.debug("still waiting")
                raise errors.RetryOperation()
            logging.info("time elapsed for case %s" % (case_id))
            case.update({
                "status": CASE_STOPPING,
                "time_finished_running": time.time(),
            })
            cases_collection.update(case_id, json.dumps(case))
            raise errors.RetryOperation("stopping test case %s" % case_id)
        elif status == CASE_STOPPING:
            stop_case(splunk, test_id, case_id, case)
            case.update({
                "status": CASE_FINISHED,
            })
            cases_collection.update(case_id, json.dumps(case))
            logging.info("finished test case %s" % case_id)
        else:
            logging.error("run_cases: unexpected status for test case %s: %s" %
                          (case_id, status))
            raise errors.RetryOperation()
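# The sizing arithmetic used in run_cases above, pulled out as a standalone,
# illustrative sketch (not part of the operator logic itself): given a daily
# ingest volume and search rate, it derives the generator pod counts and the
# per-pod rates the same way run_cases does. The 100 KB/s and 5 searches/s
# ceilings are the constants hard-coded above.
def _sketch_generator_sizing(data_volume_in_gb_per_day, searches_per_day):
    # data generators: convert GB/day to KB/s, then cap each pod at 100 KB/s
    data_kb_per_second = data_volume_in_gb_per_day / 24 / 60 / 60 * 1024 * 1024
    data_generators = max(int(data_kb_per_second / 100) + 1, 1)
    kb_per_second_per_generator = data_kb_per_second / data_generators
    # search generators: convert searches/day to searches/s, cap each pod at 5/s
    searches_per_second = searches_per_day / 24 / 60 / 60
    search_generators = max(int(searches_per_second / 5) + 1, 1)
    searches_per_second_per_generator = searches_per_second / search_generators
    return (data_generators, kb_per_second_per_generator,
            search_generators, searches_per_second_per_generator)


if __name__ == "__main__":
    # example: 500 GB/day and 10,000 searches/day yields 61 data generator pods
    # at roughly 99.5 KB/s each, and a single search generator pod
    print(_sketch_generator_sizing(500, 10000))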