def handle_GET(self):
    path = self.request['path']
    _, stack_id = os.path.split(path)
    stack_config = stacks.get_stack_config(self.splunk, stack_id)
    kubernetes = clusters.create_client(self.splunk, stack_config["cluster"])
    core_api = kuberneteslib.CoreV1Api(kubernetes)
    instances = {}
    # FIXME: the label_selector excludes the monitoring-console for now.
    # This can be removed as soon as the stack_id label has been removed.
    pods = core_api.list_namespaced_pod(
        namespace=stack_config["namespace"],
        label_selector="app=saas,stack_id=%s,app.kubernetes.io/name!=monitoring-console" % stack_id,
    ).items
    for pod in pods:
        name = pod.metadata.name
        match = re.match(r'.*%s-(.+)-([0-9]+)$' % stack_id, name)
        if match:
            number = int(match.group(2)) + 1
            role = match.group(1)
        else:
            number = None
            match = re.match(r'.*%s-(.+)$' % stack_id, name)
            if match:
                role = match.group(1)
            else:
                role = None
        reasons = set()
        if pod.status:
            status = pod.status.phase
            is_ready = None
            if pod.status.conditions:
                for condition in pod.status.conditions:
                    if condition.status == "False":
                        if condition.reason:
                            reasons.add(condition.reason)
                    if condition.type == "Ready":
                        is_ready = condition.status == "True"
            if status == "Running":
                if is_ready:
                    status = "ready"
                else:
                    status = "running"
        else:
            status = "unknown"
        instances[name] = {
            # role can be None if the pod name matches neither regex
            "role": role.lower() if role else None,
            "number": number,
            "status": status.lower(),
            "reasons": list(reasons),
        }
    self.send_entries(instances.values())
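# Illustrative sketch (not used by the handler above): how the two regexes map
# pod names to a role and a 1-based instance number. The helper name and the
# sample pod names below are hypothetical.
def _parse_pod_name_sketch(stack_id, name):
    match = re.match(r".*%s-(.+)-([0-9]+)$" % stack_id, name)
    if match:
        # StatefulSet-style name with an ordinal suffix, e.g. "...-indexer-0"
        return match.group(1), int(match.group(2)) + 1
    match = re.match(r".*%s-(.+)$" % stack_id, name)
    if match:
        # Name without an ordinal, e.g. "...-standalone"
        return match.group(1), None
    return None, None

# _parse_pod_name_sketch("s1", "splunk-s1-indexer-0")  -> ("indexer", 1)
# _parse_pod_name_sketch("s1", "splunk-s1-standalone") -> ("standalone", None)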
def update_deployment(splunk, kubernetes, stack_id):
    stack_config = stacks.get_stack_config(splunk, stack_id)
    cluster_name = stack_config["cluster"]
    kubernetes = clusters.create_client(splunk, cluster_name)
    cluster_config = clusters.get_cluster(splunk, cluster_name)
    if stack_config["deployment_type"] == "distributed":
        indexer_cluster.update(splunk, kubernetes, stack_id, stack_config)
        search_head_cluster.update(splunk, kubernetes, stack_id, stack_config)
        indexer_cluster.wait_until_ready(splunk, kubernetes, stack_id, stack_config)
        search_head_cluster.wait_until_ready(splunk, kubernetes, stack_id, stack_config)
def update_deployment(splunk, kubernetes, stack_id):
    stack_config = stacks.get_stack_config(splunk, stack_id)
    cluster_name = stack_config["cluster"]
    kubernetes = clusters.create_client(splunk, cluster_name)
    cluster_config = clusters.get_cluster(splunk, cluster_name)
    # Create the core API client from the freshly created cluster client,
    # not from the passed-in (and re-assigned) parameter.
    core_api = kuberneteslib.CoreV1Api(kubernetes)
    if stack_config["deployment_type"] == "distributed":
        indexer_cluster.update(splunk, kubernetes, stack_id, stack_config)
        search_head_cluster.update(splunk, kubernetes, stack_id, stack_config)
        indexer_cluster.wait_until_ready(splunk, kubernetes, stack_id, stack_config)
        search_head_cluster.wait_until_ready(splunk, kubernetes, stack_id, stack_config)
    create_load_balancers(core_api, stack_id, stack_config)
    verify_load_balancers_completed(core_api, stack_id, stack_config)
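# Minimal sketch of what create_load_balancers could look like for a single
# role, assuming one LoadBalancer service per role. The service name, labels,
# selector, and port are hypothetical; the repo's actual implementation lives
# elsewhere.
def _create_load_balancer_sketch(core_api, stack_id, stack_config, role):
    core_api.create_namespaced_service(
        namespace=stack_config["namespace"],
        body=kuberneteslib.V1Service(
            metadata=kuberneteslib.V1ObjectMeta(
                name="%s-%s-lb" % (stack_id, role),
                labels={"app": "saas", "for": stack_id, "role": role},
            ),
            spec=kuberneteslib.V1ServiceSpec(
                type="LoadBalancer",
                selector={"app.kubernetes.io/name": role},
                ports=[kuberneteslib.V1ServicePort(name="http", port=8000, target_port=8000)],
            ),
        ),
    )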
def down(splunk, stack_id, force=False):
    stacks.update_config(splunk, stack_id, {
        "status": stacks.DELETING,
        "deleted_time": time.time(),
    })
    stack_config = stacks.get_stack_config(splunk, stack_id)
    cluster_name = stack_config["cluster"]
    cluster_config = clusters.get_cluster(splunk, cluster_name)
    api_client = clusters.create_client(splunk, cluster_name)
    core_api = kuberneteslib.CoreV1Api(api_client)
    custom_objects_api = kuberneteslib.CustomObjectsApi(api_client)
    try:
        services.delete_all_load_balancers(core_api, stack_id, stack_config["namespace"])
        stack_deployment.delete_objects(api_client, stack_id, stack_config, cluster_config)
    except Exception:
        # When force-deleting, swallow cleanup errors and mark the stack deleted anyway.
        if not force:
            raise
    stacks.update_config(splunk, stack_id, {
        "status": stacks.DELETED,
    })
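# Sketch (an assumption, not the repo's services module): delete_all_load_balancers
# presumably lists the stack's services by label and deletes them, roughly like
# this; the "for=<stack_id>" label is hypothetical.
def _delete_all_load_balancers_sketch(core_api, stack_id, namespace):
    for service in core_api.list_namespaced_service(
            namespace=namespace,
            label_selector="app=saas,for=%s" % stack_id).items:
        core_api.delete_namespaced_service(
            name=service.metadata.name, namespace=namespace)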
def stop_case(splunk, test_id, case_id, case):
    if "stack_id" not in case:
        return
    stack_id = case["stack_id"]
    result = splunk.get("saas/stack/%s" % stack_id)
    logging.debug("get stack result: %s" % result)
    response = json.loads(result.body.read())["entry"][0]["content"]
    logging.debug("get stack response: %s" % response)
    stack_status = response["status"]
    if stack_status == stacks.DELETING:
        raise errors.RetryOperation("still in status %s" % stacks.DELETING)
    elif stack_status != stacks.DELETED:
        result = splunk.delete("saas/stack/%s" % stack_id)
        response = json.loads(result.body.read())["entry"][0]["content"]
        logging.debug("delete stack result: %s" % response)
        raise errors.RetryOperation("issued deletion of stack %s" % stack_id)
    stack_config = stacks.get_stack_config(splunk, stack_id)
    kube_client = clusters.create_client(splunk, stack_config["cluster"])
    apps_api = kubernetes.AppsV1Api(kube_client)
    # Delete the data and search generator deployments belonging to this test.
    for app in ("datagen", "searchgen"):
        deployments = apps_api.list_namespaced_deployment(
            namespace=stack_config["namespace"],
            label_selector="app=%s,test=%s" % (app, test_id),
        ).items
        for deployment in deployments:
            apps_api.delete_namespaced_deployment(
                name=deployment.metadata.name,
                namespace=stack_config["namespace"],
            )
            logging.debug("deleted deployment %s" % deployment.metadata.name)
def up(splunk, stack_id):
    stack_config = stacks.get_stack_config(splunk, stack_id)
    cluster_name = stack_config["cluster"]
    kubernetes = clusters.create_client(splunk, cluster_name)
    cluster_config = clusters.get_cluster(splunk, cluster_name)
    status = stack_config["status"]
    if status == stacks.CREATING:
        stack_deployment.create_deployment(
            splunk, kubernetes, stack_id, stack_config, cluster_config)
        app_deployment.update_apps(splunk, kubernetes, stack_id)
        logging.info("created")
        stacks.update_config(splunk, stack_id, {
            "status": stacks.CREATED,
        })
    elif status == stacks.UPDATING:
        stack_deployment.update_deployment(splunk, kubernetes, stack_id)
        app_deployment.update_apps(splunk, kubernetes, stack_id)
        logging.info("updated")
        stacks.update_config(splunk, stack_id, {
            "status": stacks.CREATED,
        })
    elif status == stacks.CREATED:
        logging.info("everything is up to date")
    else:
        logging.warning("unexpected status: %s", status)
def handle_GET(self):
    path = self.request['path']
    _, stack_id = os.path.split(path)
    stack_config = get_stack_config(self.splunk, stack_id)
    result = {
        "status": stack_config["status"],
        "title": stack_config["title"] if "title" in stack_config else "",
        "deployment_type": stack_config["deployment_type"],
        "license_master_mode": stack_config["license_master_mode"],
        "cluster": stack_config["cluster"],
        "namespace": stack_config["namespace"],
    }
    if stack_config["deployment_type"] == "distributed":
        result["indexer_count"] = stack_config["indexer_count"]
        result["search_head_count"] = stack_config["search_head_count"]
    api_client = clusters.create_client(self.service, stack_config["cluster"])
    from kubernetes import client as kubernetes
    core_api = kubernetes.CoreV1Api(api_client)
    hosts = services.get_load_balancer_hosts(
        core_api, stack_id, services.search_head_role, stack_config["namespace"])
    if hosts:
        admin_password = instances.get_admin_password(
            core_api, stack_id, stack_config, services.search_head_role)
        result.update({
            "search_head_endpoint": ["http://%s" % hostname for hostname in hosts],
            "search_head_password": admin_password,
        })
    if stack_config["license_master_mode"] == "local":
        hosts = services.get_load_balancer_hosts(
            core_api, stack_id, services.license_master_role, stack_config["namespace"])
        if hosts:
            admin_password = instances.get_admin_password(
                core_api, stack_id, stack_config, services.license_master_role)
            result.update({
                "license_master_endpoint": ["http://%s" % hostname for hostname in hosts],
                "license_master_password": admin_password,
            })
    hosts = services.get_load_balancer_hosts(
        core_api, stack_id, services.cluster_master_role, stack_config["namespace"])
    if hosts:
        admin_password = instances.get_admin_password(
            core_api, stack_id, stack_config, services.cluster_master_role)
        result.update({
            "cluster_master_endpoint": ["http://%s" % hostname for hostname in hosts],
            "cluster_master_password": admin_password,
        })
    hosts = services.get_load_balancer_hosts(
        core_api, stack_id, services.deployer_role, stack_config["namespace"])
    if hosts:
        admin_password = instances.get_admin_password(
            core_api, stack_id, stack_config, services.deployer_role)
        result.update({
            "deployer_endpoint": ["http://%s" % hostname for hostname in hosts],
            "deployer_password": admin_password,
        })
    hosts = services.get_load_balancer_hosts(
        core_api, stack_id, services.standalone_role, stack_config["namespace"])
    if hosts:
        admin_password = instances.get_admin_password(
            core_api, stack_id, stack_config, services.standalone_role)
        result.update({
            "standalone_endpoint": ["http://%s" % hostname for hostname in hosts],
            "standalone_password": admin_password,
        })
    hosts = services.get_load_balancer_hosts(
        core_api, stack_id, services.indexer_role, stack_config["namespace"])
    if hosts:
        admin_password = instances.get_admin_password(
            core_api, stack_id, stack_config, services.indexer_role)
        result.update({
            "indexer_endpoint": ["%s:9997" % hostname for hostname in hosts],
            "indexer_password": admin_password,
        })
    self.send_result(result)
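# Sketch of how get_load_balancer_hosts presumably resolves hostnames, reading
# the load-balancer ingress status of the role's services. The label selector
# is hypothetical; the repo's actual implementation lives in the services module.
def _load_balancer_hosts_sketch(core_api, namespace, role):
    hosts = []
    for service in core_api.list_namespaced_service(
            namespace=namespace, label_selector="role=%s" % role).items:
        # For services of type LoadBalancer, the cloud provider fills in
        # status.load_balancer.ingress with a hostname and/or IP per endpoint.
        for ingress in (service.status.load_balancer.ingress or []):
            hosts.append(ingress.hostname or ingress.ip)
    return hosts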
def run_cases(splunk, test_id, test):
    cases_collection = get_performance_test_cases_collection(splunk)
    cases = cases_collection.query(
        query=json.dumps({
            "test_id": test_id,
        }),
        sort="index:1",
    )
    for case in cases:
        case_id = case["_key"]
        status = case["status"]
        if status == CASE_FINISHED:
            continue
        if status == CASE_WAITING:
            result = splunk.post(
                "saas/stacks", **{
                    "deployment_type": case["deployment_type"],
                    "indexer_count": case["indexer_count"],
                    "search_head_count": case["search_head_count"],
                    "cpu_per_instance": case["cpu_per_instance"],
                    "etc_storage_in_gb": case["etc_storage_in_gb"],
                    "other_var_storage_in_gb": case["other_var_storage_in_gb"],
                    "indexer_var_storage_in_gb": case["indexer_var_storage_in_gb"],
                    "memory_per_instance": case["memory_per_instance"],
                    "title": "Performance Test %s and Case %s" % (test_id, case_id),
                    "cluster": test["cluster"],
                })
            response = json.loads(result.body.read())["entry"][0]["content"]
            stack_id = response["stack_id"]
            logging.info("created stack %s for test case %s" % (stack_id, case_id))
            case.update({
                "status": CASE_STARTING,
                "stack_id": stack_id,
            })
            cases_collection.update(case_id, json.dumps(case))
            raise errors.RetryOperation(
                "waiting for stack %s in test case %s to start up ..." %
                (stack_id, case_id))
        elif status == CASE_STARTING:
            stack_id = case["stack_id"]
            stack = splunk.get("saas/stack/%s" % stack_id)
            stack_status = json.loads(
                stack.body.read())["entry"][0]["content"]["status"]
            if stack_status == stacks.CREATING:
                raise errors.RetryOperation()
            if stack_status != stacks.CREATED:
                raise Exception("unexpected stack status: %s" % stack_status)
            logging.info("successfully created stack %s for case %s" %
                         (stack_id, case_id))
            stack_config = stacks.get_stack_config(splunk, stack_id)
            kube_client = clusters.create_client(splunk, stack_config["cluster"])
            cluster_config = clusters.get_cluster(splunk, test["cluster"])
            node_selector_labels = cluster_config["node_selector"].split(",")
            node_selector_for_generators = {}
            for label in node_selector_labels:
                if label:
                    kv = label.split("=")
                    if len(kv) != 2:
                        raise errors.ApplicationError(
                            "invalid node selector format (%s)" %
                            cluster_config["node_selector"])
                    node_selector_for_generators[kv[0]] = kv[1]
            apps_api = kubernetes.AppsV1Api(kube_client)
            core_api = kubernetes.CoreV1Api(kube_client)
            if stack_config["deployment_type"] == "standalone":
                indexer_hosts = services.get_load_balancer_hosts(
                    core_api, stack_id, services.standalone_role,
                    stack_config["namespace"])
            elif stack_config["deployment_type"] == "distributed":
                indexer_hosts = services.get_load_balancer_hosts(
                    core_api, stack_id, services.indexer_role,
                    stack_config["namespace"])
            else:
                raise Exception("unexpected deployment type: %s" %
                                stack_config["deployment_type"])
            data_volume_in_gb_per_day = int(case["data_volume_in_gb_per_day"])
            logging.debug("data_volume_in_gb_per_day=%s" % data_volume_in_gb_per_day)
            data_volume_in_gb_per_second = data_volume_in_gb_per_day / 24 / 60 / 60
            logging.debug("data_volume_in_gb_per_second=%s" % data_volume_in_gb_per_second)
            data_volume_in_kb_per_second = data_volume_in_gb_per_second * 1024 * 1024
            logging.debug("data_volume_in_kb_per_second=%s" % data_volume_in_kb_per_second)
            max_kb_per_second_per_data_generator = 100
            logging.debug("max_kb_per_second_per_data_generator=%s" %
                          max_kb_per_second_per_data_generator)
            number_of_data_generators = max(
                int(data_volume_in_kb_per_second / max_kb_per_second_per_data_generator) + 1, 1)
            logging.debug("number_of_data_generators=%s" % number_of_data_generators)
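            # Worked example of the sizing math above (numbers hypothetical):
            #   data_volume_in_gb_per_day = 500
            #   -> 500 / 24 / 60 / 60   ~ 0.00579 GB/s
            #   -> * 1024 * 1024        ~ 6068 KB/s
            #   -> int(6068 / 100) + 1  =  61 data generators
            #   -> 6068 / 61            ~ 99.5 KB/s per generator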
            data_volume_in_kb_per_second_per_data_generator = \
                data_volume_in_kb_per_second / number_of_data_generators
            logging.debug("data_volume_in_kb_per_second_per_data_generator=%s" %
                          data_volume_in_kb_per_second_per_data_generator)
            deployment_name = "datagen-%s" % stack_id
            try:
                apps_api.read_namespaced_deployment(
                    deployment_name, namespace=stack_config["namespace"])
                data_gen_deployment_already_exists = True
            except kubernetes.rest.ApiException as e:
                if e.status != 404:
                    raise
                data_gen_deployment_already_exists = False
            if not data_gen_deployment_already_exists:
                apps_api.create_namespaced_deployment(
                    namespace=stack_config["namespace"],
                    body=kubernetes.V1Deployment(
                        metadata=kubernetes.V1ObjectMeta(
                            name=deployment_name,
                            namespace=stack_config["namespace"],
                            labels={
                                "app": "datagen",
                                "test": test_id,
                                "case": case_id,
                            },
                        ),
                        spec=kubernetes.V1DeploymentSpec(
                            replicas=number_of_data_generators,
                            selector=kubernetes.V1LabelSelector(
                                match_labels={
                                    "name": "datagen-%s" % stack_id,
                                }),
                            template=kubernetes.V1PodTemplateSpec(
                                metadata=kubernetes.V1ObjectMeta(
                                    labels={
                                        "name": "datagen-%s" % stack_id,
                                        "app": "datagen",
                                        "test": test_id,
                                        "case": case_id,
                                        "stack": stack_id,
                                    },
                                ),
                                spec=kubernetes.V1PodSpec(
                                    containers=[
                                        kubernetes.V1Container(
                                            name="datagen",
                                            image="blackhypothesis/splunkeventgenerator:latest",
                                            resources=kubernetes.V1ResourceRequirements(
                                                requests={
                                                    "memory": "10Mi",
                                                    "cpu": "500m",
                                                },
                                                limits={
                                                    "memory": "50Mi",
                                                    "cpu": "1",
                                                },
                                            ),
                                            env=[
                                                kubernetes.V1EnvVar(
                                                    name="DSTHOST",
                                                    value=";".join(
                                                        host + ":9996"
                                                        for host in indexer_hosts),
                                                ),
                                                kubernetes.V1EnvVar(
                                                    name="KB_S",
                                                    value="%s" % data_volume_in_kb_per_second_per_data_generator,
                                                ),
                                            ],
                                        ),
                                    ],
                                    node_selector=node_selector_for_generators,
                                ),
                            ),
                        ),
                    ),
                )
                logging.info("created %s data generators for case %s" %
                             (number_of_data_generators, case_id))
            if stack_config["deployment_type"] == "standalone":
                search_head_host = services.get_load_balancer_hosts(
                    core_api, stack_id, services.standalone_role,
                    stack_config["namespace"])[0]
            elif stack_config["deployment_type"] == "distributed":
                search_head_host = services.get_load_balancer_hosts(
                    core_api, stack_id, services.search_head_role,
                    stack_config["namespace"])[0]
            else:
                raise Exception("unexpected deployment type: %s" %
                                stack_config["deployment_type"])
            searches_per_day = int(case["searches_per_day"])
            logging.debug("searches_per_day=%s" % searches_per_day)
            searches_per_second = searches_per_day / 24 / 60 / 60
            logging.debug("searches_per_second=%s" % searches_per_second)
            max_searches_per_second_per_generator = 5
            logging.debug("max_searches_per_second_per_generator=%s" %
                          max_searches_per_second_per_generator)
            number_of_search_generators = max(
                int(searches_per_second / max_searches_per_second_per_generator) + 1, 1)
            logging.debug("number_of_search_generators=%s" % number_of_search_generators)
            searches_per_second_per_generator = \
                searches_per_second / number_of_search_generators
            logging.debug("searches_per_second_per_generator=%s" %
                          searches_per_second_per_generator)
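            # Worked example of the search sizing math above (numbers hypothetical):
            #   searches_per_day = 10000
            #   -> 10000 / 24 / 60 / 60  ~ 0.116 searches/s
            #   -> int(0.116 / 5) + 1    =  1 search generator
            #   -> 0.116 / 1             ~ 0.116 searches/s per generator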
            search_template = case["search_template"]
            if searches_per_day > 0 and search_template:
                deployment_name = "searchgen-%s" % stack_id
                try:
                    apps_api.read_namespaced_deployment(
                        deployment_name, namespace=stack_config["namespace"])
                    search_gen_deployment_already_exists = True
                except kubernetes.rest.ApiException as e:
                    if e.status != 404:
                        raise
                    search_gen_deployment_already_exists = False
                if not search_gen_deployment_already_exists:
                    admin_password = instances.get_admin_password(
                        core_api, stack_id, stack_config,
                        services.search_head_role)
                    apps_api.create_namespaced_deployment(
                        namespace=stack_config["namespace"],
                        body=kubernetes.V1Deployment(
                            metadata=kubernetes.V1ObjectMeta(
                                name=deployment_name,
                                namespace=stack_config["namespace"],
                                labels={
                                    "app": "searchgen",
                                    "test": test_id,
                                    "case": case_id,
                                },
                            ),
                            spec=kubernetes.V1DeploymentSpec(
                                replicas=number_of_search_generators,
                                selector=kubernetes.V1LabelSelector(
                                    match_labels={
                                        "name": "searchgen-%s" % stack_id,
                                    }),
                                template=kubernetes.V1PodTemplateSpec(
                                    metadata=kubernetes.V1ObjectMeta(
                                        labels={
                                            "name": "searchgen-%s" % stack_id,
                                            "app": "searchgen",
                                            "test": test_id,
                                            "case": case_id,
                                            "stack": stack_id,
                                        },
                                    ),
                                    spec=kubernetes.V1PodSpec(
                                        containers=[
                                            kubernetes.V1Container(
                                                name="searchgen",
                                                image="hovu96/splunk-searchgen:latest",
                                                resources=kubernetes.V1ResourceRequirements(
                                                    requests={
                                                        "memory": "10Mi",
                                                        "cpu": "500m",
                                                    },
                                                    limits={
                                                        "memory": "50Mi",
                                                        "cpu": "1",
                                                    },
                                                ),
                                                env=[
                                                    kubernetes.V1EnvVar(
                                                        name="SEARCH_GEN_SPL",
                                                        value=search_template,
                                                    ),
                                                    kubernetes.V1EnvVar(
                                                        name="SEARCH_GEN_HOST",
                                                        value=search_head_host,
                                                    ),
                                                    kubernetes.V1EnvVar(
                                                        name="SEARCH_GEN_USER",
                                                        value="admin",
                                                    ),
                                                    kubernetes.V1EnvVar(
                                                        name="SEARCH_GEN_PASSWORD",
                                                        value=admin_password,
                                                    ),
                                                    kubernetes.V1EnvVar(
                                                        name="SEARCH_GEN_SPS",
                                                        value="%s" % searches_per_second_per_generator,
                                                    ),
                                                ],
                                            ),
                                        ],
                                        node_selector=node_selector_for_generators,
                                    ),
                                ),
                            ),
                        ),
                    )
                    logging.info("created %s search generators for case %s" %
                                 (number_of_search_generators, case_id))
            else:
                logging.info("no search generators started")
            case.update({
                "status": CASE_RUNNING,
                "time_started_running": time.time(),
            })
            cases_collection.update(case_id, json.dumps(case))
            raise errors.RetryOperation("running test case %s ..." % case_id)
        elif status == CASE_RUNNING:
            time_started_running = case["time_started_running"]
            time_now = time.time()
            seconds_running_so_far = time_now - time_started_running
            target_run_duration = test["run_duration"]
            logging.debug(
                "time_started_running=%s time_now=%s seconds_running_so_far=%s" %
                (time_started_running, time_now, seconds_running_so_far))
            if seconds_running_so_far < (target_run_duration * 60):
                logging.debug("still waiting")
                raise errors.RetryOperation()
            logging.info("time elapsed for case %s" % case_id)
            case.update({
                "status": CASE_STOPPING,
                "time_finished_running": time.time(),
            })
            cases_collection.update(case_id, json.dumps(case))
            raise errors.RetryOperation("stopping test case %s" % case_id)
        elif status == CASE_STOPPING:
            stop_case(splunk, test_id, case_id, case)
            case.update({
                "status": CASE_FINISHED,
            })
            cases_collection.update(case_id, json.dumps(case))
            logging.info("finished test case %s" % case_id)
        else:
            logging.error("run_cases: unexpected status for test case %s: %s" %
                          (case_id, status))
            raise errors.RetryOperation()
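# Case lifecycle handled by run_cases (summary of the states above):
#   CASE_WAITING  -> stack created via POST saas/stacks  -> CASE_STARTING
#   CASE_STARTING -> stack CREATED, generators deployed  -> CASE_RUNNING
#   CASE_RUNNING  -> run_duration minutes elapsed        -> CASE_STOPPING
#   CASE_STOPPING -> stack deleted, generators removed   -> CASE_FINISHED
# Each transition is persisted via cases_collection.update() and the handler
# re-enters through errors.RetryOperation.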