def test_process_metrics(host):
    """Verify that every agent host publishes the full set of process metrics.

    The check reads the ``sts_multi_metrics`` topic, stores the payload in
    ``./topic-multi-metrics.json`` and compares the value keys of each host's
    ``processMetrics`` message with the expected set.  The check is handed to
    ``util.wait_until`` (arguments 30 and 3 -- presumably timeout and period).
    """
    url = "http://localhost:7070/api/topic/sts_multi_metrics?limit=1000"
    agent_hosts = ("agent-ubuntu", "agent-fedora", "agent-centos", "agent-win")

    def wait_for_metrics():
        topic = json.loads(host.check_output("curl \"%s\"" % url))
        with open("./topic-multi-metrics.json", 'w') as dump_file:
            json.dump(topic, dump_file, indent=4)

        def metric_names(agent_host):
            # Key set of the first processMetrics message sent by agent_host;
            # next() raises StopIteration when no such message is present.
            multi_metrics = (
                entry["message"]["MultiMetric"] for entry in topic["messages"])
            return next(
                set(multi_metric["values"].keys())
                for multi_metric in multi_metrics
                if multi_metric["name"] == "processMetrics"
                and multi_metric["host"] == agent_host)

        # Same metrics we check in the backend e2e tests
        # https://stackvista.githost.io/StackVista/StackState/blob/master/stackstate-pm-test/src/test/scala/com/stackstate/it/e2e/ProcessAgentIntegrationE2E.scala#L17
        expected = {
            "cpu_nice", "cpu_userPct", "cpu_userTime", "cpu_systemPct",
            "cpu_numThreads", "io_writeRate", "io_writeBytesRate",
            "cpu_totalPct", "voluntaryCtxSwitches", "mem_dirty",
            "involuntaryCtxSwitches", "io_readRate", "openFdCount",
            "mem_shared", "cpu_systemTime", "io_readBytesRate", "mem_data",
            "mem_vms", "mem_lib", "mem_text", "mem_swap", "mem_rss"
        }
        for agent_host in agent_hosts:
            assert metric_names(agent_host) == expected

    util.wait_until(wait_for_metrics, 30, 3)
def test_container_metrics(host):
    """Verify that the expected Nagios metrics arrive on ``sts_multi_metrics``.

    The check reads the topic, stores the payload in
    ``./topic-nagios-sts-multi-metrics.json`` and asserts that every expected
    ``nagios.*`` metric name was reported in a ``convertedMetric`` message of
    host ``agent-integrations``.  The check is handed to ``util.wait_until``
    (arguments 180 and 3 -- presumably timeout and period).
    """
    url = "http://localhost:7070/api/topic/sts_multi_metrics?limit=1000"

    def wait_for_metrics():
        data = host.check_output("curl \"%s\"" % url)
        json_data = json.loads(data)
        with open("./topic-nagios-sts-multi-metrics.json", 'w') as f:
            json.dump(json_data, f, indent=4)

        def get_keys(m_host):
            # Collect every metric name found in the convertedMetric messages
            # of the given host.
            keys = set()
            for message in json_data["messages"]:
                multi_metric = message["message"]["MultiMetric"]
                if (multi_metric["name"] == "convertedMetric"
                        and multi_metric["host"] == m_host):
                    keys.update(multi_metric["values"].keys())
            return keys

        expected = {
            'nagios.http.size', 'nagios.ping.pl', 'nagios.http.time',
            'nagios.current_load.load15', 'nagios.swap_usage.swap',
            'nagios.host.pl', 'nagios.root_partition',
            'nagios.current_users.users', 'nagios.current_load.load1',
            'nagios.host.rta', 'nagios.ping.rta', 'nagios.current_load.load5',
            'nagios.total_processes.procs'
        }
        # all() over a list that was already filtered down to the present
        # metrics is always true, so compare the sets explicitly instead.
        missing = expected - get_keys("agent-integrations")
        assert not missing, \
            "missing nagios metrics: %s" % ", ".join(sorted(missing))

    util.wait_until(wait_for_metrics, 180, 3)
def test_stackstate_process_agent_no_log_errors(host, hostname):
    """Check the Windows process-agent log for success markers and errors.

    The log is read through the ``win_shell`` ansible module.  Once the
    success markers are found, the log is copied to ``./<hostname>-process.log``
    and every line is asserted not to contain "error" (case-insensitive).
    """
    process_agent_log_path = "c:\\programdata\\stackstate\\logs\\process-agent.log"

    def read_log():
        # stdout of `cat` on the remote log file
        return host.ansible(
            "win_shell",
            "cat \"{}\"".format(process_agent_log_path),
            check=False)["stdout"]

    # Check for presence of success
    def wait_for_check_successes():
        log_text = read_log()
        print(log_text)
        for marker in ("Finished check #1", "starting network tracer locally"):
            assert re.search(marker, log_text)

    util.wait_until(wait_for_check_successes, 30, 3)

    log_text = read_log()
    with open("./{}-process.log".format(hostname), 'wb') as log_copy:
        log_copy.write(log_text.encode('utf-8'))

    # Check for errors
    for log_line in log_text.splitlines():
        print("Considering: %s" % log_line)
        assert not re.search("error", log_line, re.IGNORECASE)
def test_connection_network_namespaces_relations(host):
    """Check that a localhost connection stays within one network namespace.

    Reads the ``sts_correlate_endpoints`` topic, stores it in
    ``./topic-correlate-endpoint-netns.json`` and compares the namespaces of
    the outgoing and incoming 127.0.0.1 -> 127.0.0.1 connection on port 9091
    of host ``agent-connection-namespaces``.
    """
    url = "http://localhost:7070/api/topic/sts_correlate_endpoints?limit=1500"

    def wait_for_connection():
        topic = json.loads(host.check_output("curl \"%s\"" % url))
        with open("./topic-correlate-endpoint-netns.json", 'w') as dump_file:
            json.dump(topic, dump_file, indent=4)

        # assert that we find a outgoing localhost connection between 127.0.0.1 to 127.0.0.1 to port 9091 on
        # agent-connection-namespaces host within the same network namespace.
        outgoing = _find_outgoing_connection_in_namespace(
            topic, 9091, "agent-connection-namespaces", "127.0.0.1", "127.0.0.1")
        print(outgoing)

        incoming = _find_incoming_connection_in_namespace(
            topic, 9091, "agent-connection-namespaces", "127.0.0.1", "127.0.0.1")
        print(incoming)

        # assert that the connections are in the same namespace
        out_local = outgoing["localEndpoint"]["namespace"]
        out_remote = outgoing["remoteEndpoint"]["namespace"]
        in_local = incoming["localEndpoint"]["namespace"]
        in_remote = incoming["remoteEndpoint"]["namespace"]
        assert (out_local == out_remote
                and in_local == in_remote
                and in_remote == out_local
                and in_local == out_remote)

    util.wait_until(wait_for_connection, 30, 3)
def test_stackstate_trace_agent_log(host, hostname):
    """Check the Windows trace-agent log for success markers and errors.

    The log is read through the ``win_shell`` ansible module.  Once the
    success markers are found, the log is copied to ``./<hostname>-trace.log``
    and every line is asserted not to contain "| error |" (case-insensitive).
    """
    trace_agent_log_path = "c:\\programdata\\stackstate\\logs\\trace-agent.log"

    def read_log():
        # stdout of `cat` on the remote log file
        return host.ansible(
            "win_shell",
            "cat \"{}\"".format(trace_agent_log_path),
            check=False)["stdout"]

    # Check for presence of success
    def wait_for_check_successes():
        log_text = read_log()
        print(log_text)
        for marker in ("Trace agent running on host",
                       "Listening for traces at",
                       "No data received"):
            assert re.search(marker, log_text)

    util.wait_until(wait_for_check_successes, 30, 3)

    log_text = read_log()
    with open("./{}-trace.log".format(hostname), 'wb') as log_copy:
        log_copy.write(log_text.encode('utf-8'))

    # Check for errors
    for log_line in log_text.splitlines():
        print("Considering: %s" % log_line)
        assert not re.search("\\| error \\|", log_line, re.IGNORECASE)
def test_isolate_single_datanode():
    """
    Partition the network so that one datanode cannot reach the other
    datanodes while it can still reach SCM.

    Once the network partition happens, SCM detects it and closes the
    pipeline, which in turn closes the containers.

    The containers on the first two datanodes become CLOSED because they have
    quorum. The container replica on the third datanode becomes QUASI_CLOSED
    because it cannot connect to the other datanodes and does not have the
    latest BCSID.

    Once the network is restored, the stale replica on the third datanode is
    deleted and the latest replica is copied from one of the other datanodes.
    """
    def third_replica_is_quasi_closed():
        states = cluster.get_container_states(cluster.datanodes[2])
        return states.popitem()[1] == 'QUASI_CLOSED'

    def third_replica_is_closed():
        return cluster.container_state_predicate(cluster.datanodes[2],
                                                 'CLOSED')

    cluster.run_freon(1, 1, 1, 10240)
    first_set = [
        cluster.om, cluster.scm, cluster.datanodes[0], cluster.datanodes[1]
    ]
    second_set = [cluster.om, cluster.scm, cluster.datanodes[2]]
    logger.info("Partitioning the network")
    cluster.partition_network(first_set, second_set)
    cluster.run_freon(1, 1, 1, 10240)

    logger.info("Waiting for container to be QUASI_CLOSED")
    util.wait_until(third_replica_is_quasi_closed,
                    int(os.environ["CONTAINER_STATUS_SLEEP"]), 10)

    # (container states of a datanode, state expected for each container)
    observed = [
        (cluster.get_container_states(cluster.datanodes[0]), 'CLOSED'),
        (cluster.get_container_states(cluster.datanodes[1]), 'CLOSED'),
        (cluster.get_container_states(cluster.datanodes[2]), 'QUASI_CLOSED'),
    ]
    for states, _ in observed:
        assert len(states) != 0
    for states, wanted in observed:
        for container in states:
            assert states.get(container) == wanted

    # Since the replica in datanode[2] doesn't have the latest BCSID,
    # ReplicationManager will delete it and copy a closed replica.
    # We will now restore the network and datanode[2] should get a
    # closed replica of the container
    logger.info("Restoring the network")
    cluster.restore_network()

    logger.info("Waiting for the replica to be CLOSED")
    util.wait_until(third_replica_is_closed,
                    int(os.environ["CONTAINER_STATUS_SLEEP"]), 10)

    restored_states = cluster.get_container_states(cluster.datanodes[2])
    assert len(restored_states) != 0
    for container in restored_states:
        assert restored_states.get(container) == 'CLOSED'
def test_docker_swarm_metrics(host):
    """Verify that the Docker Swarm service metrics reach ``sts_multi_metrics``.

    The check reads the topic, stores the payload in
    ``./topic-docker-swarm-sts-multi-metrics.json`` and asserts that every
    expected ``swarm.service.*`` metric was reported in a ``convertedMetric``
    message carrying a ``serviceName`` tag.  The check is handed to
    ``util.wait_until`` (arguments 180 and 10 -- presumably timeout and
    period).
    """
    url = "http://localhost:7070/api/topic/sts_multi_metrics?limit=3000"

    def wait_for_metrics():
        data = host.check_output("curl \"%s\"" % url)
        json_data = json.loads(data)
        with open("./topic-docker-swarm-sts-multi-metrics.json", 'w') as f:
            json.dump(json_data, f, indent=4)

        def get_keys():
            # Collect the metric names of every convertedMetric message that
            # is tagged with a swarm service name.
            keys = set()
            for message in json_data["messages"]:
                multi_metric = message["message"]["MultiMetric"]
                if (multi_metric["name"] == "convertedMetric"
                        and "serviceName" in multi_metric["tags"]):
                    keys.update(multi_metric["values"].keys())
            return keys

        expected = {
            'swarm.service.desired_replicas', 'swarm.service.running_replicas'
        }
        # all() over a list that was already filtered down to the present
        # metrics is always true, so compare the sets explicitly instead.
        missing = expected - get_keys()
        assert not missing, \
            "missing swarm metrics: %s" % ", ".join(sorted(missing))

    util.wait_until(wait_for_metrics, 180, 10)
def test_stackstate_agent_log(host, hostname):
    """Check the agent log for a successful intake post and for errors.

    After the success message is found, every log line that does not match
    one of the ignored-error patterns is asserted not to contain "error"
    (case-insensitive).
    """
    agent_log_path = "/var/log/stackstate-agent/agent.log"
    log_name = "{}-{}".format(hostname, "agent")

    # Check for presence of success
    def wait_for_check_successes():
        log_text = _get_log(host, log_name, agent_log_path)
        assert re.search("Successfully posted payload to.*stsAgent/intake",
                         log_text)

    util.wait_until(wait_for_check_successes, 60, 3)

    ignored_errors_regex = [
        # TODO: Collecting processes snap -> Will be addressed with STAC-3531
        "Error code \"400 Bad Request\" received while "
        "sending transaction to \"https://.*/stsAgent/intake/.*"
        "Failed to deserialize JSON on fields: , "
        "with message: Object is missing required member \'internalHostname\'",
        "net/ntp.go.*There was an error querying the ntp host",
    ]

    def is_ignored(log_line):
        # True when at least one ignored-error pattern matches the line
        return any(
            re.findall(pattern, log_line, re.DOTALL)
            for pattern in ignored_errors_regex)

    # Check for errors
    log_text = _get_log(host, log_name, agent_log_path)
    for log_line in log_text.splitlines():
        if is_ignored(log_line):
            continue
        print("Considering: %s" % log_line)
        assert not re.search("error", log_line, re.IGNORECASE)
def test_no_datadog_metrics(host):
    """Verify that no metric whose name starts with "datadog" is published.

    The check reads the ``sts_multi_metrics`` topic, stores the payload in
    ``./topic-sts-multi-metrics.json``, groups all reported values by metric
    name and asserts that no name starts with ``datadog``.  The check is
    handed to ``util.wait_until`` (arguments 60 and 3 -- presumably timeout
    and period).
    """
    url = "http://localhost:7070/api/topic/sts_multi_metrics?limit=1000"

    def wait_for_metrics():
        data = host.check_output("curl \"%s\"" % url)
        json_data = json.loads(data)
        with open("./topic-sts-multi-metrics.json", 'w') as f:
            json.dump(json_data, f, indent=4)

        # metric name -> list of all values seen for that metric
        metrics = {}
        for message in json_data["messages"]:
            values = message["message"]["MultiMetric"]["values"]
            for m_name, value in values.items():
                metrics.setdefault(m_name, []).append(value)

        # assert that we don't see any datadog metrics
        # (dict.items() instead of the Python 2 only dict.iteritems())
        datadog_metrics = [(key, value) for key, value in metrics.items()
                           if key.startswith("datadog")]
        assert len(
            datadog_metrics
        ) == 0, 'datadog metrics found in sts_multi_metrics: [%s]' % ', '.join(
            map(str, datadog_metrics))

    util.wait_until(wait_for_metrics, 60, 3)
def test_created_connection_after_start_with_metrics(host, common_vars):
    """Check direction, type and throughput of connections made after start.

    Reads the ``sts_correlate_endpoints`` topic, stores it in
    ``./topic-correlate-endpoint-after.json`` and inspects the fedora->ubuntu
    and windows->ubuntu connections in both directions.
    """
    url = "http://localhost:7070/api/topic/sts_correlate_endpoints?limit=1000"

    fedora_conn_port = int(common_vars["connection_port_after_start_fedora"])
    windows_conn_port = int(common_vars["connection_port_after_start_windows"])

    ubuntu_private_ip = _get_instance_config("agent-ubuntu")["private_address"]
    print("ubuntu private: {}".format(ubuntu_private_ip))
    fedora_private_ip = _get_instance_config("agent-fedora")["private_address"]
    print("fedora private: {}".format(fedora_private_ip))
    windows_private_ip = _get_instance_config("agent-win")["private_address"]
    print("windows private: {}".format(windows_private_ip))

    def check_tcp(conn, direction):
        # Print the connection, then check its direction and protocol.
        print(conn)
        assert conn["direction"] == direction
        assert conn["connectionType"] == "TCP"

    def wait_for_connection():
        topic = json.loads(host.check_output("curl \"%s\"" % url))
        with open("./topic-correlate-endpoint-after.json", 'w') as dump_file:
            json.dump(topic, dump_file, indent=4)

        fedora_out = _find_outgoing_connection(
            topic, fedora_conn_port, fedora_private_ip, ubuntu_private_ip)
        check_tcp(fedora_out, "OUTGOING")
        assert fedora_out["bytesSentPerSecond"] > 10.0
        assert fedora_out["bytesReceivedPerSecond"] == 0.0

        fedora_in = _find_incoming_connection(
            topic, fedora_conn_port, fedora_private_ip, ubuntu_private_ip)
        check_tcp(fedora_in, "INCOMING")
        assert fedora_in["bytesSentPerSecond"] == 0.0
        assert fedora_in["bytesReceivedPerSecond"] > 10.0

        windows_out = _find_outgoing_connection(
            topic, windows_conn_port, windows_private_ip, ubuntu_private_ip)
        check_tcp(windows_out, "OUTGOING")
        # We don't collect metrics on Windows
        assert windows_out["bytesSentPerSecond"] == 0.0
        assert windows_out["bytesReceivedPerSecond"] == 0.0

        windows_in = _find_incoming_connection(
            topic, windows_conn_port, windows_private_ip, ubuntu_private_ip)
        check_tcp(windows_in, "INCOMING")
        assert windows_in["bytesSentPerSecond"] == 0.0
        # We don't send data from Windows
        assert windows_in["bytesReceivedPerSecond"] == 0.0

    util.wait_until(wait_for_connection, 30, 3)
def test_cluster_agent_healthy(host, ansible_var):
    """Check that the stackstate-cluster-agent deployment becomes available.

    Runs ``kubectl wait --for=condition=available`` on the host and asserts a
    zero return code; the check is handed to ``util.wait_until``.
    """
    namespace = ansible_var("namespace")
    kubectl_wait = "kubectl wait --for=condition=available --timeout=1s deployment/stackstate-cluster-agent --namespace={}".format(
        namespace)

    def assert_healthy():
        command = kubeconfig_env + kubectl_wait
        result = host.run(command)
        assert result.rc == 0

    util.wait_until(assert_healthy, 30, 5)
def test_node_agent_healthy(host, ansible_var):
    """Check that the pods labelled ``app=stackstate-agent`` become ready.

    Runs ``kubectl wait --for=condition=ready`` on the host and asserts a zero
    return code; the check is handed to ``util.wait_until``.
    """
    namespace = ansible_var("namespace")
    kubectl_wait = "kubectl wait --for=condition=ready --timeout=1s -l app=stackstate-agent pod --namespace={}".format(
        namespace)

    def assert_healthy():
        command = kubeconfig_env + kubectl_wait
        result = host.run(command)
        assert result.rc == 0

    util.wait_until(assert_healthy, 30, 5)
def test_generic_events(host):
    """Fetch the ``sts_generic_events`` topic and dump it to a JSON file.

    The check only requires that the topic can be fetched, parsed as JSON and
    written to ``./topic-sts-generic-events.json``.
    """
    url = "http://localhost:7070/api/topic/sts_generic_events?limit=1000"

    def wait_for_events():
        events = json.loads(host.check_output("curl \"%s\"" % url))
        with open("./topic-sts-generic-events.json", 'w') as dump_file:
            json.dump(events, dump_file, indent=4)

    util.wait_until(wait_for_events, 60, 3)
def test_dnat(host, ansible_var):
    """Check the topology relations created for a DNAT-ed service connection.

    Reads the ``sts_topo_process_agents`` topic, stores it in
    ``./topic-topo-process-agents-dnat.json``, looks up the endpoint component
    of the service IP and asserts the IPs recorded on the three
    ``directional_connection`` relations between client, service and server.
    """
    url = "http://localhost:7070/api/topic/sts_topo_process_agents?limit=1000"

    dnat_service_port = int(ansible_var("dnat_service_port"))
    dnat_server_port = int(ansible_var("dnat_server_port"))
    cluster_name = ansible_var("cluster_name")
    namespace = ansible_var("namespace")

    def wait_for_components():
        topology = json.loads(host.check_output("curl \"%s\"" % url))
        with open("./topic-topo-process-agents-dnat.json", 'w') as dump_file:
            json.dump(topology, dump_file, indent=4)

        def connection_data(id_pattern):
            # Data of the directional_connection relation whose external id
            # matches the compiled pattern.
            return _relation_data(
                json_data=topology,
                type_name="directional_connection",
                external_id_assert_fn=lambda v: id_pattern.findall(v))

        pod_server_ip = _get_pod_ip(host, namespace, "pod-server")
        pod_service_ip = _get_service_ip(host, namespace)
        pod_client = _get_pod_ip(host, namespace, "pod-client")

        endpoint_match = re.compile(
            "urn:endpoint:/.*:{}".format(pod_service_ip))
        endpoint = _find_component(
            json_data=topology,
            type_name="endpoint",
            external_id_assert_fn=lambda v: endpoint_match.findall(v))
        assert json.loads(endpoint["data"])["ip"] == pod_service_ip
        endpoint_component_id = endpoint["externalId"]

        proc_to_proc_id_match = re.compile(
            "TCP:/urn:process:/.*:.*->{}:{}".format(
                endpoint_component_id, dnat_service_port))
        proc_to_service_id_match = re.compile(
            "TCP:/urn:process:/.*->urn:process:/.*:.*:{}:{}:{}".format(
                cluster_name, pod_server_ip, dnat_server_port))
        service_to_proc_id_match = re.compile(
            "TCP:/{}:{}->urn:process:/.*:{}:{}:{}".format(
                endpoint_component_id, dnat_service_port, cluster_name,
                pod_server_ip, dnat_server_port))

        assert connection_data(
            proc_to_proc_id_match)["outgoing"]["ip"] == pod_client
        assert connection_data(
            proc_to_service_id_match)["outgoing"]["ip"] == pod_client
        assert connection_data(
            service_to_proc_id_match)["incoming"]["ip"] == pod_server_ip

    util.wait_until(wait_for_components, 60, 3)
def test_node_agent_healthy(host, ansible_var):
    """Check that the pods labelled as agent component become ready.

    Runs ``kubectl wait --for=condition=ready`` with the configured kubeconfig
    and context and asserts a zero return code; the check is handed to
    ``util.wait_until``.
    """
    namespace = ansible_var("namespace")
    kubeconfig = ansible_var("kubeconfig")
    kubecontext = ansible_var("kubecontext")

    def assert_healthy():
        command = "KUBECONFIG={0} kubectl --context={1} wait --for=condition=ready --timeout=1s -l app.kubernetes.io/component=agent pod --namespace={2}".format(
            kubeconfig, kubecontext, namespace)
        result = host.run(command)
        assert result.rc == 0

    util.wait_until(assert_healthy, 30, 5)
def wait_until_one_replica_is_closed(self):
    """Wait until at least one replica of the container is CLOSED.

    Raises:
        Exception: if no replica is CLOSED after ``util.wait_until`` returns.
    """
    def any_replica_closed():
        datanodes = self.cluster.get_container_datanodes(self.container_id)
        return any(
            self.cluster.get_container_state(self.container_id, datanode) == 'CLOSED'
            for datanode in datanodes)

    util.wait_until(any_replica_closed,
                    int(os.environ["CONTAINER_STATUS_SLEEP"]), 10)
    # Re-check once more so a condition that was never met becomes an error.
    if any_replica_closed():
        return
    raise Exception("None of the container replica is closed!")
def test_cluster_agent_pod_mount_volume_relation(host, ansible_var):
    """Check that the agent pods have a ``mounts`` relation to their token volume.

    Reads the ``sts_topo_kubernetes_<cluster>`` topic, stores it in
    ``./topic-<topic>.json`` and, for two pod URN patterns, looks for a mount
    relation to either a service-account token secret or a projected external
    volume.  The source id of the relation found must start with the pod URN
    prefix.
    """
    cluster_name = ansible_var("cluster_name")
    namespace = ansible_var("namespace")
    topic = "sts_topo_kubernetes_%s" % cluster_name
    url = "http://localhost:7070/api/topic/%s?limit=1000" % topic
    scope = (cluster_name, namespace)

    def mount_source(json_data, pod_regex, volume_mount_regex):
        # Source id of the "mounts" relation whose external id matches
        # "<pod_regex>-><volume_mount_regex>".
        relation_regex = "%s->%s" % (pod_regex, volume_mount_regex)

        def matches(eid):
            return re.compile(relation_regex).findall(eid)

        return _relation_sourceid(
            json_data=json_data,
            type_name="mounts",
            external_id_assert_fn=matches)

    def wait_for_relation():
        json_data = json.loads(host.check_output("curl \"%s\"" % url))
        with open("./topic-" + topic + ".json", 'w') as dump_file:
            json.dump(json_data, dump_file, indent=4)

        def token_mount_source(pod_urn):
            # Both lookups are always performed; the secret mount is preferred
            # when it is found.
            before_1_21 = mount_source(
                json_data, pod_urn,
                "urn:kubernetes:/%s:%s:secret/stackstate-cluster-agent-token-.*"
                % scope)
            from_1_21 = mount_source(
                json_data, pod_urn,
                "urn:kubernetes:external-volume:projected/.*")
            return before_1_21 or from_1_21

        # stackstate-cluster-agent Pod -> Volume mount (secret)
        cluster_agent_urn = \
            "urn:kubernetes:/%s:%s:pod/stackstate-cluster-agent-.*-.*:container/cluster-agent" \
            % scope
        relation = token_mount_source(cluster_agent_urn)
        assert relation.startswith(
            "urn:kubernetes:/%s:%s:pod/stackstate-cluster-agent" % scope)

        # stackstate-agent Pod -> Volume mount (secret)
        agent_urn = "urn:kubernetes:/%s:%s:pod/stackstate-cluster-agent-agent-.*:container/cluster-agent" \
            % scope
        relation = token_mount_source(agent_urn)
        assert relation.startswith(
            "urn:kubernetes:/%s:%s:pod/stackstate-cluster-agent-agent" % scope)

    util.wait_until(wait_for_relation, 120, 3)
def test_dnat(host, common_vars):
    """Check the topology relations created for a DNAT-ed connection.

    Reads the ``sts_topo_process_agents`` topic, stores it in
    ``./topic-topo-process-agents-dnat.json``, looks up the endpoint component
    of the ubuntu private IP and asserts the IPs recorded on the three
    ``directional_connection`` relations between agent-fedora, the endpoint
    and agent-ubuntu.
    """
    url = "http://localhost:7070/api/topic/sts_topo_process_agents?offset=0&limit=1000"

    ubuntu_private_ip = _get_instance_config("agent-ubuntu")["private_address"]
    fedora_private_ip = _get_instance_config("agent-fedora")["private_address"]
    dnat_service_port = int(common_vars["dnat_service_port"])
    dnat_server_port = int(common_vars["dnat_server_port"])

    def wait_for_components():
        topology = json.loads(host.check_output("curl \"%s\"" % url))
        with open("./topic-topo-process-agents-dnat.json", 'w') as dump_file:
            json.dump(topology, dump_file, indent=4)

        def connection_data(id_pattern):
            # Data of the directional_connection relation whose external id
            # matches the compiled pattern.
            return _relation_data(
                json_data=topology,
                type_name="directional_connection",
                external_id_assert_fn=lambda v: id_pattern.findall(v))

        endpoint_match = re.compile(
            "urn:endpoint:/.*:{}".format(ubuntu_private_ip))
        endpoint = _find_component(
            json_data=topology,
            type_name="endpoint",
            external_id_assert_fn=lambda v: endpoint_match.findall(v))
        assert json.loads(endpoint["data"])["ip"] == ubuntu_private_ip
        endpoint_component_id = endpoint["externalId"]

        proc_to_proc_id_match = re.compile(
            "TCP:/urn:process:/agent-fedora:.*:.*->{}:{}".format(
                endpoint_component_id, dnat_service_port))
        proc_to_service_id_match = re.compile(
            "TCP:/urn:process:/agent-fedora:.*:.*->urn:process:/agent-ubuntu:.*:.*:{}:{}"
            .format(ubuntu_private_ip, dnat_server_port))
        service_to_proc_id_match = re.compile(
            "TCP:/{}:{}->urn:process:/agent-ubuntu:.*:.*:{}:{}".format(
                endpoint_component_id, dnat_service_port, ubuntu_private_ip,
                dnat_server_port))

        assert connection_data(
            proc_to_proc_id_match)["outgoing"]["ip"] == fedora_private_ip
        assert connection_data(
            proc_to_service_id_match)["outgoing"]["ip"] == fedora_private_ip
        assert connection_data(
            service_to_proc_id_match)["incoming"]["ip"] == ubuntu_private_ip

    util.wait_until(wait_for_components, 30, 3)
def test_created_connection_before_start(host, common_vars):
    """Check direction and type of connections that existed before start.

    Reads the ``sts_correlate_endpoints`` topic, stores it in
    ``./topic-correlate-endpoint-before.json`` and inspects the fedora->ubuntu
    and windows->ubuntu connections in both directions.
    """
    url = "http://localhost:7070/api/topic/sts_correlate_endpoints?limit=1000"

    fedora_conn_port = int(common_vars["connection_port_before_start_fedora"])
    windows_conn_port = int(
        common_vars["connection_port_before_start_windows"])

    ubuntu_private_ip = _get_instance_config("agent-ubuntu")["private_address"]
    print("ubuntu private: {}".format(ubuntu_private_ip))
    fedora_private_ip = _get_instance_config("agent-fedora")["private_address"]
    print("fedora private: {}".format(fedora_private_ip))
    windows_private_ip = _get_instance_config("agent-win")["private_address"]
    print("windows private: {}".format(windows_private_ip))

    def check_tcp(conn, direction):
        # Print the connection, then check its direction and protocol.
        print(conn)
        assert conn["direction"] == direction
        assert conn["connectionType"] == "TCP"

    def wait_for_connection():
        topic = json.loads(host.check_output("curl \"%s\"" % url))
        with open("./topic-correlate-endpoint-before.json", 'w') as dump_file:
            json.dump(topic, dump_file, indent=4)

        fedora_out = _find_outgoing_connection(
            topic, fedora_conn_port, fedora_private_ip, ubuntu_private_ip)
        # Outgoing gets no direction from Linux /proc scanning
        check_tcp(fedora_out, "NONE")

        fedora_in = _find_incoming_connection(
            topic, fedora_conn_port, fedora_private_ip, ubuntu_private_ip)
        check_tcp(fedora_in, "INCOMING")

        windows_out = _find_outgoing_connection(
            topic, windows_conn_port, windows_private_ip, ubuntu_private_ip)
        check_tcp(windows_out, "OUTGOING")

        windows_in = _find_incoming_connection(
            topic, windows_conn_port, windows_private_ip, ubuntu_private_ip)
        check_tcp(windows_in, "INCOMING")

    util.wait_until(wait_for_connection, 30, 3)
def test_topology_filtering(host, ansible_var):
    """Check which processes and network relations survive topology filtering.

    Reads the ``sts_topo_process_agents`` topic, stores it in
    ``./topic-topo-process-agents-filtering.json`` and asserts that the stress
    process is present, the short-lived python processes are absent, and the
    three python http server/client pairs have the expected relations.
    """
    url = "http://localhost:7070/api/topic/sts_topo_process_agents?offset=0&limit=2000"

    def wait_for_components():
        topology = json.loads(host.check_output("curl \"%s\"" % url))
        with open("./topic-topo-process-agents-filtering.json", 'w') as dump_file:
            json.dump(topology, dump_file, indent=4)

        def process_matching(cmd_pattern):
            # Process component whose command arguments match the pattern.
            return _find_process_by_command_args(
                json_data=topology,
                type_name="process",
                cmd_assert_fn=lambda v: cmd_pattern.findall(v)
            )

        # assert that we get the stress process and that it contains the top resource tags
        stress_process_match = re.compile("/usr/bin/stress --vm .* --vm-bytes .*")
        stress_process = process_matching(stress_process_match)
        assert stress_process["command"]["exe"] == "/usr/bin/stress"

        # assert that we don't get the short-lived python processes
        short_lived_process_match = re.compile("python -c import time; time.sleep(.*);")
        assert process_matching(short_lived_process_match) is None

        # assert that we get the 3 python simple http servers + clients and expected relations
        # single requests server + client and no relation
        single_request = _network_relation(
            json_data=topology,
            server_port=ansible_var("network_relation_test_server_port_single_request"),
            request_process_cmd="python single-request.py"
        )
        assert single_request is None

        # multiple requests server + client and their relation
        multiple_requests = _network_relation(
            json_data=topology,
            server_port=ansible_var("network_relation_test_server_port_multiple_requests"),
            request_process_cmd="python multiple-requests.py"
        )
        assert multiple_requests is not None

        # shared connection requests server + client and their relation
        shared_connection = _network_relation(
            json_data=topology,
            server_port=ansible_var("network_relation_test_server_port_shared_connection"),
            request_process_cmd="python shared-connection-requests.py"
        )
        assert shared_connection is not None

    util.wait_until(wait_for_components, 120, 3)
def test_headless_pod_to_pod(host, ansible_var, topic_api):
    """Check the relation between client and server behind a headless service.

    Reads the ``sts_topo_process_agents`` topic, stores it in
    ``./topic-topo-process-agents-headless.json``, locates the ``ncat`` server
    and ``nc`` client processes and asserts that a ``directional_connection``
    relation exists from the client process to the server process.
    """
    url = "{0}/sts_topo_process_agents?limit=1000".format(topic_api)

    # Server and service port are equal
    server_port = int(ansible_var("headless_service_port"))
    cluster_name = ansible_var("cluster_name")

    def wait_for_components():
        topology = json.loads(host.check_output("curl \"%s\"" % url))
        with open("./topic-topo-process-agents-headless.json", 'w') as dump_file:
            json.dump(topology, dump_file, indent=4)

        def process_matching(cmd_pattern):
            # Process component whose command arguments match the pattern.
            return _find_process_by_command_args(
                json_data=topology,
                type_name="process",
                cmd_assert_fn=lambda v: cmd_pattern.findall(v))

        server_process_match = re.compile(
            "ncat -vv --broker --listen -p {}".format(server_port))
        server = process_matching(server_process_match)
        assert server is not None
        server_create_time = server["createTime"]
        server_pid = server["pid"]
        server_host = server["host"]

        request_process_match = re.compile(
            "nc -vv headless-service {}".format(server_port))
        client = process_matching(request_process_match)
        assert client is not None
        client_create_time = client["createTime"]
        client_pid = client["pid"]
        client_host = client["host"]

        client_to_server_match = re.compile(
            "TCP:/urn:process:/{}:{}:{}->urn:process:/{}:{}:{}:{}:.*:{}".
            format(client_host, client_pid, client_create_time,
                   server_host, server_pid, server_create_time,
                   cluster_name, server_port))
        relation = _relation_data(
            json_data=topology,
            type_name="directional_connection",
            external_id_assert_fn=lambda v: client_to_server_match.findall(v))
        assert relation is not None

    util.wait_until(wait_for_components, 120, 3)
def wait_until_replica_is_closed(self, datanode):
    """Wait until the container replica on ``datanode`` is CLOSED.

    A replica that cannot be found (``ContainerNotFoundError``) counts as not
    closed.

    Raises:
        Exception: if the replica is not CLOSED after ``util.wait_until``
            returns.
    """
    def replica_closed():
        try:
            state = self.cluster.get_container_state(self.container_id,
                                                     datanode)
        except ContainerNotFoundError:
            return False
        return state == 'CLOSED'

    util.wait_until(replica_closed,
                    int(os.environ["CONTAINER_STATUS_SLEEP"]), 10)
    # Re-check once more so a condition that was never met becomes an error.
    if replica_closed():
        return
    raise Exception("Replica is not closed!")
def test_stackstate_trace_agent_no_log_errors(host, hostname):
    """Check the trace-agent log for success markers and for errors.

    After the success markers are found, every log line is asserted not to
    contain "error" (case-insensitive).
    """
    trace_agent_log_path = "/var/log/stackstate-agent/trace-agent.log"

    def read_log():
        return _get_log(host, hostname, trace_agent_log_path)

    # Check for presence of success
    def wait_for_check_successes():
        log_text = read_log()
        for marker in ("total number of tracked services",
                       "trace-agent running on host"):
            assert re.search(marker, log_text)

    util.wait_until(wait_for_check_successes, 30, 3)

    # Check for errors
    for log_line in read_log().splitlines():
        print("Considering: %s" % log_line)
        assert not re.search("error", log_line, re.IGNORECASE)
def wait_until_all_replicas_are_closed(self):
    """Wait until every replica of the container is CLOSED.

    A container that cannot be found (``ContainerNotFoundError``) counts as
    not closed.

    Raises:
        Exception: if some replica is not CLOSED after ``util.wait_until``
            returns.
    """
    def all_replicas_closed():
        try:
            datanodes = self.cluster.get_container_datanodes(self.container_id)
            return not any(
                self.cluster.get_container_state(self.container_id, datanode) != 'CLOSED'
                for datanode in datanodes)
        except ContainerNotFoundError:
            return False

    util.wait_until(all_replicas_closed,
                    int(os.environ["CONTAINER_STATUS_SLEEP"]), 10)
    # Re-check once more so a condition that was never met becomes an error.
    if all_replicas_closed():
        return
    raise Exception("Not all the replicas are closed!")
def test_stackstate_trace_agent_no_log_errors(host, hostname):
    """Check the trace-agent log for success markers and for errors.

    The log is fetched under the name ``<hostname>-trace-agent``.  After the
    success markers are found, every log line is asserted not to contain
    "error" (case-insensitive).
    """
    trace_agent_log_path = "/var/log/stackstate-agent/trace-agent.log"

    def read_log():
        log_name = "{}-{}".format(hostname, "trace-agent")
        return _get_log(host, log_name, trace_agent_log_path)

    # Check for presence of success
    def wait_for_check_successes():
        log_text = read_log()
        for marker in ("Trace agent running on host", "No data received"):
            assert re.search(marker, log_text)

    util.wait_until(wait_for_check_successes, 30, 3)

    # Check for errors
    for log_line in read_log().splitlines():
        print("Considering: %s" % log_line)
        assert not re.search("error", log_line, re.IGNORECASE)
def test_state_events(host):
    """Verify that every agent host reports the expected state events.

    The check reads the ``sts_state_events`` topic, stores the payload in
    ``./topic-state-events.json``, groups the event names per host and asserts
    that each agent host reported all expected events.  The check is handed to
    ``util.wait_until`` (arguments 30 and 3 -- presumably timeout and period).
    """
    url = "http://localhost:7070/api/topic/sts_state_events?offset=0&limit=80"
    expected_events = {
        "stackstate.agent.up", "stackstate.agent.check_status", "ntp.in_sync"
    }
    agent_hosts = (
        "agent-ubuntu", "agent-fedora", "agent-centos",
        "agent-connection-namespaces", "agent-win",
    )

    def wait_for_metrics():
        data = host.check_output("curl \"%s\"" % url)
        json_data = json.loads(data)
        with open("./topic-state-events.json", 'w') as f:
            json.dump(json_data, f, indent=4)

        # host name -> set of state event names reported by that host
        state_events = defaultdict(set)
        for message in json_data["messages"]:
            state_event = message["message"]["StateEvent"]
            state_events[state_event["host"]].add(state_event["name"])

        print(state_events)
        # all() over a list that was already filtered down to the present
        # events is always true, so compare the sets explicitly instead.
        for agent_host in agent_hosts:
            missing = expected_events - state_events[agent_host]
            assert not missing, "%s is missing state events: %s" % (
                agent_host, ", ".join(sorted(missing)))

    util.wait_until(wait_for_metrics, 30, 3)
def test_dnat(host, ansible_var, topic_api):
    """Check the process-to-service relation created for a DNAT-ed connection.

    Reads the ``sts_topo_process_agents`` topic (and, for debugging only, the
    ``sts_correlate_endpoints`` topic), stores both as JSON files, looks up
    the endpoint component of the service IP and asserts that the outgoing IP
    of the matching ``directional_connection`` relation is the client pod IP.
    """
    url = "{0}/sts_topo_process_agents?limit=1000".format(topic_api)
    correlate_url = "{0}/sts_correlate_endpoints?limit=100".format(topic_api)

    dnat_service_port = int(ansible_var("dnat_service_port"))
    namespace = ansible_var("namespace")
    kubeconfig = ansible_var("kubeconfig")
    kubecontext = ansible_var("kubecontext")

    def fetch_and_dump(topic_url, dump_path):
        # Download a topic, keep a pretty-printed copy and return the payload.
        payload = json.loads(host.check_output("curl \"%s\"" % topic_url))
        with open(dump_path, 'w') as dump_file:
            json.dump(payload, dump_file, indent=4)
        return payload

    def wait_for_components():
        topology = fetch_and_dump(
            url, "./topic-topo-process-agents-dnat.json")

        # This is here for debugging
        fetch_and_dump(
            correlate_url, "./topic-topo-process-agents-dnat-correlate.json")

        pod_service_ip = _get_service_ip(kubeconfig, kubecontext, host, namespace)
        pod_client = _get_pod_ip(kubeconfig, kubecontext, host, namespace, "pod-client")

        endpoint_match = re.compile(
            "urn:endpoint:/.*:{}".format(pod_service_ip))
        endpoint = _find_component(
            json_data=topology,
            type_name="endpoint",
            external_id_assert_fn=lambda v: endpoint_match.findall(v))
        assert json.loads(endpoint["data"])["ip"] == pod_service_ip
        endpoint_component_id = endpoint["externalId"]

        proc_to_service_id_match = re.compile(
            "TCP:/urn:process:/.*:.*->{}:{}".format(
                endpoint_component_id, dnat_service_port))
        connection = _relation_data(
            json_data=topology,
            type_name="directional_connection",
            external_id_assert_fn=lambda v: proc_to_service_id_match.findall(v))
        assert connection["outgoing"]["ip"] == pod_client

    util.wait_until(wait_for_components, 120, 3)
def test_datanode_isolation_all():
    """
    Partition the network so that no datanode can communicate with any other
    datanode, while every datanode can still communicate with SCM.

    Once the network partition happens, SCM detects it and closes the
    pipeline, which in turn tries to close the containers. At least one of
    the replicas should end up in the closed state.

    Once the network is restored, there will be three closed replicas.
    """
    def one_replica_closed():
        return cluster.container_state_predicate_one_closed(cluster.datanodes)

    def all_replicas_closed():
        return cluster.container_state_predicate_all_closed(cluster.datanodes)

    cluster.run_freon(1, 1, 1, 10240)

    for index in range(3):
        assert len(cluster.get_container_states(cluster.datanodes[index])) != 0

    logger.info("Partitioning the network")
    first_set = [cluster.om, cluster.scm, cluster.datanodes[0]]
    second_set = [cluster.om, cluster.scm, cluster.datanodes[1]]
    third_set = [cluster.om, cluster.scm, cluster.datanodes[2]]
    cluster.partition_network(first_set, second_set, third_set)

    logger.info("Waiting for the replica to be CLOSED")
    util.wait_until(one_replica_closed,
                    int(os.environ["CONTAINER_STATUS_SLEEP"]), 10)

    # At least one of the replica should be in closed state
    assert one_replica_closed()

    # After restoring the network all the replicas should be in
    # CLOSED state
    logger.info("Restoring the network")
    cluster.restore_network()

    logger.info("Waiting for the container to be replicated")
    util.wait_until(all_replicas_closed,
                    int(os.environ["CONTAINER_STATUS_SLEEP"]), 10)
    assert all_replicas_closed()
def test_stackstate_process_agent_no_log_errors(host, hostname):
    """Check the process-agent log for success markers and for errors.

    The network tracer marker is not required on host ``agent-centos``.
    After the success markers are found, every log line is asserted not to
    contain "error" (case-insensitive).
    """
    process_agent_log_path = "/var/log/stackstate-agent/process-agent.log"

    def read_log():
        return _get_log(host, hostname, process_agent_log_path)

    # Check for presence of success
    def wait_for_check_successes():
        log_text = read_log()
        assert re.search("Finished check #1", log_text)
        if hostname == "agent-centos":
            return
        assert re.search("starting network tracer locally", log_text)

    util.wait_until(wait_for_check_successes, 30, 3)

    # Check for errors
    for log_line in read_log().splitlines():
        print("Considering: %s" % log_line)
        assert not re.search("error", log_line, re.IGNORECASE)
def _check_logs(host, controller_name, success_regex, ignored_errors_regex):
    """Check the logs of all pods of a controller for success and for errors.

    Args:
        host: host object passed on to ``_get_pods`` and ``_get_log``.
        controller_name: controller whose pods are looked up.
        success_regex: pattern that must be found in the log of every pod.
        ignored_errors_regex: patterns; a log line matching any of them is
            skipped by the error check.

    After the success pattern is found, every remaining log line is asserted
    not to contain "error" (case-insensitive).
    """
    def wait_for_successful_post():
        for pod_name in _get_pods(host, controller_name):
            pod_log = _get_log(host, pod_name)
            assert re.search(success_regex, pod_log)

    def is_ignored(log_line):
        # True when at least one ignored-error pattern matches the line
        return any(
            re.findall(pattern, log_line, re.DOTALL)
            for pattern in ignored_errors_regex)

    util.wait_until(wait_for_successful_post, 30, 3)

    for pod_name in _get_pods(host, controller_name):
        pod_log = _get_log(host, pod_name)
        for log_line in pod_log.splitlines():
            if is_ignored(log_line):
                continue
            print("Considering: %s" % log_line)
            assert not re.search("error", log_line, re.IGNORECASE)