def test_process_metrics(host):
    """Check that each agent host publishes the full set of process metrics.

    The ``sts_multi_metrics`` topic is fetched with ``curl`` on ``host``, the
    decoded response is saved to ``./topic-multi-metrics.json`` and, for every
    agent host, the value keys of the first ``processMetrics`` message are
    compared with the expected metric names. The check is handed to
    ``util.wait_until`` with arguments 30 and 3 (presumably timeout and
    interval in seconds -- confirm against ``util``).
    """
    url = "http://localhost:7070/api/topic/sts_multi_metrics?limit=1000"

    def wait_for_metrics():
        raw = host.check_output("curl \"%s\"" % url)
        json_data = json.loads(raw)
        with open("./topic-multi-metrics.json", 'w') as dump_file:
            json.dump(json_data, dump_file, indent=4)

        def get_keys(m_host):
            # Lazily unwrap the payloads; next() stops at the first match and
            # raises StopIteration when the host has no processMetrics message.
            payloads = (entry["message"]["MultiMetric"]
                        for entry in json_data["messages"])
            return next(
                set(payload["values"].keys()) for payload in payloads
                if payload["name"] == "processMetrics"
                and payload["host"] == m_host)

        # Same metrics we check in the backend e2e tests
        # https://stackvista.githost.io/StackVista/StackState/blob/master/stackstate-pm-test/src/test/scala/com/stackstate/it/e2e/ProcessAgentIntegrationE2E.scala#L17
        expected = {
            # cpu
            "cpu_nice", "cpu_userPct", "cpu_userTime", "cpu_systemPct",
            "cpu_systemTime", "cpu_totalPct", "cpu_numThreads",
            # io
            "io_readRate", "io_readBytesRate", "io_writeRate",
            "io_writeBytesRate",
            # memory
            "mem_data", "mem_dirty", "mem_lib", "mem_rss", "mem_shared",
            "mem_swap", "mem_text", "mem_vms",
            # misc
            "openFdCount", "voluntaryCtxSwitches", "involuntaryCtxSwitches",
        }

        agent_hosts = ("agent-ubuntu", "agent-fedora", "agent-centos",
                       "agent-win")
        for agent_host in agent_hosts:
            assert get_keys(agent_host) == expected

    util.wait_until(wait_for_metrics, 30, 3)
Example #2
0
def test_container_metrics(host):
    """Check that agent-integrations reports every expected Nagios metric.

    The ``sts_multi_metrics`` topic is fetched with ``curl`` on ``host``, the
    decoded response is saved to ``./topic-nagios-sts-multi-metrics.json`` and
    the value keys of all ``convertedMetric`` messages sent by the
    ``agent-integrations`` host must contain every name in ``expected``.
    The check is handed to ``util.wait_until`` with arguments 180 and 3
    (presumably timeout and interval in seconds -- confirm against ``util``).
    """
    url = "http://localhost:7070/api/topic/sts_multi_metrics?limit=1000"

    def wait_for_metrics():
        data = host.check_output("curl \"%s\"" % url)
        json_data = json.loads(data)
        with open("./topic-nagios-sts-multi-metrics.json", 'w') as f:
            json.dump(json_data, f, indent=4)

        def get_keys(m_host):
            """Collect the metric names of convertedMetric messages of m_host."""
            keys = set()
            for message in json_data["messages"]:
                multi_metric = message["message"]["MultiMetric"]
                if (multi_metric["name"] == "convertedMetric"
                        and multi_metric["host"] == m_host):
                    # Add every key on its own so a message carrying several
                    # values still contributes each of its metric names.
                    keys.update(multi_metric["values"].keys())
            return keys

        expected = {
            'nagios.http.size', 'nagios.ping.pl', 'nagios.http.time',
            'nagios.current_load.load15', 'nagios.swap_usage.swap',
            'nagios.host.pl', 'nagios.root_partition',
            'nagios.current_users.users', 'nagios.current_load.load1',
            'nagios.host.rta', 'nagios.ping.rta', 'nagios.current_load.load5',
            'nagios.total_processes.procs'
        }

        # all() over a list that was already filtered down to the metrics
        # that are present is always true (even when the list is empty), so
        # check explicitly that nothing from `expected` is missing.
        missing = expected - get_keys("agent-integrations")
        assert not missing, "expected metrics not found: %s" % ", ".join(
            sorted(missing))

    util.wait_until(wait_for_metrics, 180, 3)
def test_stackstate_process_agent_no_log_errors(host, hostname):
    """Check the Windows process-agent log for success markers and no errors.

    The log is read through the ``win_shell`` ansible module. First the
    success markers are awaited via ``util.wait_until`` (arguments 30 and 3),
    then the log is read again, stored as ``./<hostname>-process.log`` and
    every line is required not to contain "error" (case-insensitive).
    """
    log_path = "c:\\programdata\\stackstate\\logs\\process-agent.log"
    cat_command = "cat \"{}\"".format(log_path)

    def read_log():
        # check=False: the stdout is used regardless of how ansible judges
        # the command result.
        return host.ansible("win_shell", cat_command, check=False)["stdout"]

    # Check for presence of success
    def wait_for_check_successes():
        contents = read_log()
        print(contents)

        success_markers = ("Finished check #1",
                           "starting network tracer locally")
        for marker in success_markers:
            assert re.search(marker, contents)

    util.wait_until(wait_for_check_successes, 30, 3)

    final_log = read_log()
    with open("./{}-process.log".format(hostname), 'wb') as log_copy:
        log_copy.write(final_log.encode('utf-8'))

    # Check for errors
    for entry in final_log.splitlines():
        print("Considering: %s" % entry)
        assert not re.search("error", entry, re.IGNORECASE)
def test_connection_network_namespaces_relations(host):
    """Check that a localhost connection stays within one network namespace.

    The ``sts_correlate_endpoints`` topic is fetched with ``curl`` on
    ``host`` and saved to ``./topic-correlate-endpoint-netns.json``. The
    outgoing and incoming 127.0.0.1 -> 127.0.0.1 connections on port 9091 of
    the ``agent-connection-namespaces`` host are looked up and all four
    endpoint namespaces must be equal. The check is handed to
    ``util.wait_until`` with arguments 30 and 3.
    """
    url = "http://localhost:7070/api/topic/sts_correlate_endpoints?limit=1500"

    def wait_for_connection():
        raw = host.check_output("curl \"%s\"" % url)
        json_data = json.loads(raw)
        with open("./topic-correlate-endpoint-netns.json", 'w') as dump_file:
            json.dump(json_data, dump_file, indent=4)

        # Both lookups target the same localhost connection (127.0.0.1 to
        # 127.0.0.1 on port 9091) on the agent-connection-namespaces host.
        lookup_args = (json_data, 9091, "agent-connection-namespaces",
                       "127.0.0.1", "127.0.0.1")

        outgoing_conn = _find_outgoing_connection_in_namespace(*lookup_args)
        print(outgoing_conn)

        incoming_conn = _find_incoming_connection_in_namespace(*lookup_args)
        print(incoming_conn)

        out_local_ns = outgoing_conn["localEndpoint"]["namespace"]
        out_remote_ns = outgoing_conn["remoteEndpoint"]["namespace"]
        in_local_ns = incoming_conn["localEndpoint"]["namespace"]
        in_remote_ns = incoming_conn["remoteEndpoint"]["namespace"]

        # All four endpoints have to live in the same namespace
        assert out_local_ns == out_remote_ns
        assert in_local_ns == in_remote_ns
        assert in_remote_ns == out_local_ns
        assert in_local_ns == out_remote_ns

    util.wait_until(wait_for_connection, 30, 3)
def test_stackstate_trace_agent_log(host, hostname):
    """Check the Windows trace-agent log for success markers and no errors.

    The log is read through the ``win_shell`` ansible module. First the
    start-up messages are awaited via ``util.wait_until`` (arguments 30 and
    3), then the log is read again, stored as ``./<hostname>-trace.log`` and
    no line may contain a ``| error |`` marker (case-insensitive).
    """
    log_path = "c:\\programdata\\stackstate\\logs\\trace-agent.log"
    cat_command = "cat \"{}\"".format(log_path)

    def read_log():
        # check=False: the stdout is used regardless of how ansible judges
        # the command result.
        return host.ansible("win_shell", cat_command, check=False)["stdout"]

    # Check for presence of success
    def wait_for_check_successes():
        contents = read_log()
        print(contents)
        required_messages = (
            "Trace agent running on host",
            "Listening for traces at",
            "No data received",
        )
        for message in required_messages:
            assert re.search(message, contents)

    util.wait_until(wait_for_check_successes, 30, 3)

    final_log = read_log()
    with open("./{}-trace.log".format(hostname), 'wb') as log_copy:
        log_copy.write(final_log.encode('utf-8'))

    # Check for errors
    for entry in final_log.splitlines():
        print("Considering: %s" % entry)
        assert not re.search("\\| error \\|", entry, re.IGNORECASE)
def test_isolate_single_datanode():
    """Isolate one datanode from its peers and verify replica recovery.

    A network partition is created so that the third datanode cannot reach
    the other two datanodes while it can still talk to SCM.

    SCM notices the partition and closes the pipeline, which in turn closes
    the containers. The replicas on the first two datanodes become CLOSED
    because they have quorum; the replica on the third datanode becomes
    QUASI_CLOSED since it cannot reach the others and lacks the latest BCSID.

    After the network is restored, the stale replica on the third datanode
    is deleted and a fresh replica is copied from one of the other datanodes.
    """
    cluster.run_freon(1, 1, 1, 10240)
    connected_side = [
        cluster.om, cluster.scm, cluster.datanodes[0], cluster.datanodes[1]
    ]
    isolated_side = [cluster.om, cluster.scm, cluster.datanodes[2]]
    logger.info("Partitioning the network")
    cluster.partition_network(connected_side, isolated_side)
    cluster.run_freon(1, 1, 1, 10240)
    logger.info("Waiting for container to be QUASI_CLOSED")

    util.wait_until(
        lambda: cluster.get_container_states(cluster.datanodes[2]).popitem()[1]
        == 'QUASI_CLOSED', int(os.environ["CONTAINER_STATUS_SLEEP"]), 10)

    # Snapshot the replica states of all three datanodes, in order.
    states_per_datanode = [
        cluster.get_container_states(cluster.datanodes[index])
        for index in (0, 1, 2)
    ]
    for states in states_per_datanode:
        assert len(states) != 0

    # The quorum side is CLOSED, the isolated replica only QUASI_CLOSED.
    wanted_states = ('CLOSED', 'CLOSED', 'QUASI_CLOSED')
    for states, wanted in zip(states_per_datanode, wanted_states):
        for container in states:
            assert states.get(container) == wanted

    # Since the replica in datanode[2] doesn't have the latest BCSID,
    # ReplicationManager will delete it and copy a closed replica.
    # We will now restore the network and datanode[2] should get a
    # closed replica of the container
    logger.info("Restoring the network")
    cluster.restore_network()

    logger.info("Waiting for the replica to be CLOSED")
    util.wait_until(
        lambda: cluster.container_state_predicate(cluster.datanodes[2],
                                                  'CLOSED'),
        int(os.environ["CONTAINER_STATUS_SLEEP"]), 10)

    recovered_states = cluster.get_container_states(cluster.datanodes[2])
    assert len(recovered_states) != 0
    for container in recovered_states:
        assert recovered_states.get(container) == 'CLOSED'
Example #7
0
def test_docker_swarm_metrics(host):
    """Check that swarm service replica metrics are reported.

    The ``sts_multi_metrics`` topic is fetched with ``curl`` on ``host``, the
    decoded response is saved to
    ``./topic-docker-swarm-sts-multi-metrics.json`` and the value keys of all
    ``convertedMetric`` messages that carry a ``serviceName`` tag must contain
    every name in ``expected``. The check is handed to ``util.wait_until``
    with arguments 180 and 10 (presumably timeout and interval in seconds --
    confirm against ``util``).
    """
    url = "http://localhost:7070/api/topic/sts_multi_metrics?limit=3000"

    def wait_for_metrics():
        data = host.check_output("curl \"%s\"" % url)
        json_data = json.loads(data)
        with open("./topic-docker-swarm-sts-multi-metrics.json", 'w') as f:
            json.dump(json_data, f, indent=4)

        def get_keys():
            """Collect the metric names reported for swarm services."""
            keys = set()
            for message in json_data["messages"]:
                multi_metric = message["message"]["MultiMetric"]
                # Only messages tagged with a serviceName belong to a swarm
                # service.
                if (multi_metric["name"] == "convertedMetric"
                        and "serviceName" in multi_metric["tags"]):
                    # Add every key on its own so a message carrying several
                    # values still contributes each of its metric names.
                    keys.update(multi_metric["values"].keys())
            return keys

        expected = {
            'swarm.service.desired_replicas', 'swarm.service.running_replicas'
        }

        # all() over a list that was already filtered down to the metrics
        # that are present is always true (even when the list is empty), so
        # check explicitly that nothing from `expected` is missing.
        missing = expected - get_keys()
        assert not missing, "expected metrics not found: %s" % ", ".join(
            sorted(missing))

    util.wait_until(wait_for_metrics, 180, 10)
Example #8
0
def test_stackstate_agent_log(host, hostname):
    """Check the agent log for a successful intake post and no stray errors.

    A successful payload post is awaited via ``util.wait_until`` (arguments
    60 and 3). Afterwards the log is fetched again with ``_get_log`` and every
    line that does not match one of the known, ignored error patterns must
    not contain "error" (case-insensitive).
    """
    agent_log_path = "/var/log/stackstate-agent/agent.log"
    log_name = "{}-{}".format(hostname, "agent")

    # Check for presence of success
    def wait_for_check_successes():
        current_log = _get_log(host, log_name, agent_log_path)
        assert re.search("Successfully posted payload to.*stsAgent/intake",
                         current_log)

    util.wait_until(wait_for_check_successes, 60, 3)

    ignored_errors_regex = [
        # TODO: Collecting processes snap -> Will be addressed with STAC-3531
        "Error code \"400 Bad Request\" received while "
        "sending transaction to \"https://.*/stsAgent/intake/.*"
        "Failed to deserialize JSON on fields: , "
        "with message: Object is missing required member \'internalHostname\'",
        "net/ntp.go.*There was an error querying the ntp host",
    ]

    # Check for errors
    final_log = _get_log(host, log_name, agent_log_path)
    for entry in final_log.splitlines():
        # Lines matching a known, tolerated error are not inspected further.
        if any(re.findall(pattern, entry, re.DOTALL)
               for pattern in ignored_errors_regex):
            continue

        print("Considering: %s" % entry)
        assert not re.search("error", entry, re.IGNORECASE)
def test_no_datadog_metrics(host):
    """Check that no metric named ``datadog*`` is present in the topic.

    The ``sts_multi_metrics`` topic is fetched with ``curl`` on ``host``, the
    decoded response is saved to ``./topic-sts-multi-metrics.json`` and all
    values are grouped per metric name; no name may start with "datadog".
    The check is handed to ``util.wait_until`` with arguments 60 and 3
    (presumably timeout and interval in seconds -- confirm against ``util``).
    """
    url = "http://localhost:7070/api/topic/sts_multi_metrics?limit=1000"

    def wait_for_metrics():
        data = host.check_output("curl \"%s\"" % url)
        json_data = json.loads(data)
        with open("./topic-sts-multi-metrics.json", 'w') as f:
            json.dump(json_data, f, indent=4)

        # Group every reported value under its metric name.
        metrics = {}
        for message in json_data["messages"]:
            values = message["message"]["MultiMetric"]["values"]
            for m_name, m_value in values.items():
                metrics.setdefault(m_name, []).append(m_value)

        # assert that we don't see any datadog metrics
        # items() instead of iteritems(): the latter only exists on Python 2
        # and raises AttributeError on Python 3.
        datadog_metrics = [(key, value) for key, value in metrics.items()
                           if key.startswith("datadog")]
        assert len(
            datadog_metrics
        ) == 0, 'datadog metrics found in sts_multi_metrics: [%s]' % ', '.join(
            map(str, datadog_metrics))

    util.wait_until(wait_for_metrics, 60, 3)
def test_created_connection_after_start_with_metrics(host, common_vars):
    """Check connections created after agent start, including throughput.

    The ``sts_correlate_endpoints`` topic is fetched with ``curl`` on
    ``host`` and saved to ``./topic-correlate-endpoint-after.json``. For the
    fedora -> ubuntu and windows -> ubuntu connections (ports taken from
    ``common_vars``) the outgoing and incoming records are looked up and
    their direction, type and byte rates are asserted. The check is handed
    to ``util.wait_until`` with arguments 30 and 3.
    """
    url = "http://localhost:7070/api/topic/sts_correlate_endpoints?limit=1000"

    fedora_conn_port = int(common_vars["connection_port_after_start_fedora"])
    windows_conn_port = int(common_vars["connection_port_after_start_windows"])

    ubuntu_private_ip = _get_instance_config("agent-ubuntu")["private_address"]
    print("ubuntu private: {}".format(ubuntu_private_ip))
    fedora_private_ip = _get_instance_config("agent-fedora")["private_address"]
    print("fedora private: {}".format(fedora_private_ip))
    windows_private_ip = _get_instance_config("agent-win")["private_address"]
    print("windows private: {}".format(windows_private_ip))

    def check_tcp(connection, direction):
        # Shared part of every connection check: log it, then verify the
        # direction and that it is a TCP connection.
        print(connection)
        assert connection["direction"] == direction
        assert connection["connectionType"] == "TCP"

    def wait_for_connection():
        raw = host.check_output("curl \"%s\"" % url)
        json_data = json.loads(raw)
        with open("./topic-correlate-endpoint-after.json", 'w') as dump_file:
            json.dump(json_data, dump_file, indent=4)

        # fedora -> ubuntu: traffic is measured on both ends
        fedora_out = _find_outgoing_connection(
            json_data, fedora_conn_port, fedora_private_ip, ubuntu_private_ip)
        check_tcp(fedora_out, "OUTGOING")
        assert fedora_out["bytesSentPerSecond"] > 10.0
        assert fedora_out["bytesReceivedPerSecond"] == 0.0

        fedora_in = _find_incoming_connection(
            json_data, fedora_conn_port, fedora_private_ip, ubuntu_private_ip)
        check_tcp(fedora_in, "INCOMING")
        assert fedora_in["bytesSentPerSecond"] == 0.0
        assert fedora_in["bytesReceivedPerSecond"] > 10.0

        # windows -> ubuntu
        windows_out = _find_outgoing_connection(
            json_data, windows_conn_port, windows_private_ip,
            ubuntu_private_ip)
        check_tcp(windows_out, "OUTGOING")
        # We don't collect metrics on Windows
        assert windows_out["bytesSentPerSecond"] == 0.0
        assert windows_out["bytesReceivedPerSecond"] == 0.0

        windows_in = _find_incoming_connection(
            json_data, windows_conn_port, windows_private_ip,
            ubuntu_private_ip)
        check_tcp(windows_in, "INCOMING")
        assert windows_in["bytesSentPerSecond"] == 0.0
        # We don't send data from Windows
        assert windows_in["bytesReceivedPerSecond"] == 0.0

    util.wait_until(wait_for_connection, 30, 3)
Example #11
0
def test_cluster_agent_healthy(host, ansible_var):
    """Check that the stackstate-cluster-agent deployment becomes available.

    Runs ``kubectl wait`` (prefixed with the module-level ``kubeconfig_env``)
    on ``host`` and requires exit code 0; the check is handed to
    ``util.wait_until`` with arguments 30 and 5.
    """
    namespace = ansible_var("namespace")
    kubectl_wait = (
        "kubectl wait --for=condition=available --timeout=1s deployment/stackstate-cluster-agent --namespace={}"
    ).format(namespace)

    def assert_healthy():
        result = host.run(kubeconfig_env + kubectl_wait)
        assert result.rc == 0

    util.wait_until(assert_healthy, 30, 5)
Example #12
0
def test_node_agent_healthy(host, ansible_var):
    """Check that the pods labelled ``app=stackstate-agent`` become ready.

    Runs ``kubectl wait`` (prefixed with the module-level ``kubeconfig_env``)
    on ``host`` and requires exit code 0; the check is handed to
    ``util.wait_until`` with arguments 30 and 5.
    """
    namespace = ansible_var("namespace")
    kubectl_wait = (
        "kubectl wait --for=condition=ready --timeout=1s -l app=stackstate-agent pod --namespace={}"
    ).format(namespace)

    def assert_healthy():
        result = host.run(kubeconfig_env + kubectl_wait)
        assert result.rc == 0

    util.wait_until(assert_healthy, 30, 5)
def test_generic_events(host):
    """Fetch the ``sts_generic_events`` topic and store it on disk.

    The callback makes no assertion on the content: it only runs ``curl`` on
    ``host``, decodes the output as JSON and writes it to
    ``./topic-sts-generic-events.json``. It is handed to ``util.wait_until``
    with arguments 60 and 3.
    """
    url = "http://localhost:7070/api/topic/sts_generic_events?limit=1000"

    def wait_for_events():
        raw = host.check_output("curl \"%s\"" % url)
        events = json.loads(raw)
        with open("./topic-sts-generic-events.json", 'w') as dump_file:
            json.dump(events, dump_file, indent=4)

    util.wait_until(wait_for_events, 60, 3)
def test_dnat(host, ansible_var):
    """Check the DNAT topology between the client pod, service and server pod.

    The ``sts_topo_process_agents`` topic is fetched with ``curl`` on
    ``host`` and saved to ``./topic-topo-process-agents-dnat.json``. The
    endpoint component of the service IP must exist, and three
    ``directional_connection`` relations (process -> endpoint, process ->
    server process, endpoint -> server process) must carry the expected
    client / server IPs. The check is handed to ``util.wait_until`` with
    arguments 60 and 3.
    """
    url = "http://localhost:7070/api/topic/sts_topo_process_agents?limit=1000"

    dnat_service_port = int(ansible_var("dnat_service_port"))
    dnat_server_port = int(ansible_var("dnat_server_port"))
    cluster_name = ansible_var("cluster_name")
    namespace = ansible_var("namespace")

    def wait_for_components():
        raw = host.check_output("curl \"%s\"" % url)
        json_data = json.loads(raw)
        with open("./topic-topo-process-agents-dnat.json", 'w') as dump_file:
            json.dump(json_data, dump_file, indent=4)

        def connection_data(id_pattern):
            # Data of the directional_connection relation whose external id
            # matches the given compiled pattern.
            return _relation_data(
                json_data=json_data,
                type_name="directional_connection",
                external_id_assert_fn=lambda ext_id: id_pattern.findall(
                    ext_id))

        pod_server_ip = _get_pod_ip(host, namespace, "pod-server")
        pod_service_ip = _get_service_ip(host, namespace)
        pod_client = _get_pod_ip(host, namespace, "pod-client")

        endpoint_match = re.compile(
            "urn:endpoint:/.*:{}".format(pod_service_ip))
        endpoint = _find_component(
            json_data=json_data,
            type_name="endpoint",
            external_id_assert_fn=lambda ext_id: endpoint_match.findall(
                ext_id))
        assert json.loads(endpoint["data"])["ip"] == pod_service_ip
        endpoint_component_id = endpoint["externalId"]

        # client process -> service endpoint
        proc_to_proc_id_match = re.compile(
            "TCP:/urn:process:/.*:.*->{}:{}".format(endpoint_component_id,
                                                    dnat_service_port))
        # client process -> server process
        proc_to_service_id_match = re.compile(
            "TCP:/urn:process:/.*->urn:process:/.*:.*:{}:{}:{}".format(
                cluster_name, pod_server_ip, dnat_server_port))
        # service endpoint -> server process
        service_to_proc_id_match = re.compile(
            "TCP:/{}:{}->urn:process:/.*:{}:{}:{}".format(
                endpoint_component_id, dnat_service_port, cluster_name,
                pod_server_ip, dnat_server_port))

        proc_to_proc = connection_data(proc_to_proc_id_match)
        assert proc_to_proc["outgoing"]["ip"] == pod_client
        proc_to_service = connection_data(proc_to_service_id_match)
        assert proc_to_service["outgoing"]["ip"] == pod_client
        service_to_proc = connection_data(service_to_proc_id_match)
        assert service_to_proc["incoming"]["ip"] == pod_server_ip

    util.wait_until(wait_for_components, 60, 3)
Example #15
0
def test_node_agent_healthy(host, ansible_var):
    """Check that the agent pods become ready in the configured context.

    Runs ``kubectl wait`` with the kubeconfig, context and namespace taken
    from ``ansible_var`` on ``host`` and requires exit code 0; the check is
    handed to ``util.wait_until`` with arguments 30 and 5.
    """
    namespace = ansible_var("namespace")
    kubeconfig = ansible_var("kubeconfig")
    kubecontext = ansible_var("kubecontext")

    kubectl_wait = (
        "KUBECONFIG={0} kubectl --context={1} wait --for=condition=ready --timeout=1s -l app.kubernetes.io/component=agent pod --namespace={2}"
    ).format(kubeconfig, kubecontext, namespace)

    def assert_healthy():
        result = host.run(kubectl_wait)
        assert result.rc == 0

    util.wait_until(assert_healthy, 30, 5)
Example #16
0
    def wait_until_one_replica_is_closed(self):
        """Wait for at least one replica of the container to be CLOSED.

        The predicate is handed to ``util.wait_until`` together with the
        integer value of the ``CONTAINER_STATUS_SLEEP`` environment variable
        and 10; afterwards it is evaluated once more.

        Raises:
            Exception: if no replica reports the CLOSED state after waiting.
        """
        def predicate():
            datanodes = self.cluster.get_container_datanodes(self.container_id)
            return any(
                self.cluster.get_container_state(self.container_id,
                                                 datanode) == 'CLOSED'
                for datanode in datanodes)

        wait_limit = int(os.environ["CONTAINER_STATUS_SLEEP"])
        util.wait_until(predicate, wait_limit, 10)
        if predicate():
            return
        raise Exception("None of the container replica is closed!")
def test_cluster_agent_pod_mount_volume_relation(host, ansible_var):
    """Check the ``mounts`` relations of the cluster-agent and agent pods.

    The ``sts_topo_kubernetes_<cluster_name>`` topic is fetched with ``curl``
    on ``host`` and saved to ``./topic-<topic>.json``. For both pods a mount
    relation to either the service-account token secret (the ``before_1_21``
    lookup) or a projected external volume (the ``from_1_21`` lookup) is
    searched, and its source id must start with the pod's URN prefix. The
    check is handed to ``util.wait_until`` with arguments 120 and 3.
    """
    cluster_name = ansible_var("cluster_name")
    namespace = ansible_var("namespace")
    topic = "sts_topo_kubernetes_%s" % cluster_name
    url = "http://localhost:7070/api/topic/%s?limit=1000" % topic

    def _find_mount_relation(json_data, pod_regex, volume_mount_regex):
        relation_regex = "%s->%s" % (pod_regex, volume_mount_regex)
        return _relation_sourceid(
            json_data=json_data,
            type_name="mounts",
            external_id_assert_fn=lambda eid: re.findall(relation_regex, eid))

    def wait_for_relation():
        raw = host.check_output("curl \"%s\"" % url)
        json_data = json.loads(raw)
        with open("./topic-" + topic + ".json", 'w') as dump_file:
            json.dump(json_data, dump_file, indent=4)

        token_secret_urn = (
            "urn:kubernetes:/%s:%s:secret/stackstate-cluster-agent-token-.*" %
            (cluster_name, namespace))
        projected_volume_urn = "urn:kubernetes:external-volume:projected/.*"

        def mount_source(pod_urn):
            # Both lookups are always performed; the token secret relation
            # wins when it is found, otherwise the projected volume is used.
            before_1_21 = _find_mount_relation(json_data, pod_urn,
                                               token_secret_urn)
            from_1_21 = _find_mount_relation(json_data, pod_urn,
                                             projected_volume_urn)
            return before_1_21 or from_1_21

        # stackstate-cluster-agent Pod -> Volume mount (secret)
        cluster_agent_urn = (
            "urn:kubernetes:/%s:%s:pod/stackstate-cluster-agent-.*-.*:container/cluster-agent"
            % (cluster_name, namespace))
        relation = mount_source(cluster_agent_urn)
        assert relation.startswith(
            "urn:kubernetes:/%s:%s:pod/stackstate-cluster-agent" %
            (cluster_name, namespace))

        # stackstate-agent Pod -> Volume mount (secret)
        agent_urn = (
            "urn:kubernetes:/%s:%s:pod/stackstate-cluster-agent-agent-.*:container/cluster-agent"
            % (cluster_name, namespace))
        relation = mount_source(agent_urn)
        assert relation.startswith(
            "urn:kubernetes:/%s:%s:pod/stackstate-cluster-agent-agent" %
            (cluster_name, namespace))

    util.wait_until(wait_for_relation, 120, 3)
def test_dnat(host, common_vars):
    """Check the DNAT topology between agent-fedora and agent-ubuntu.

    The ``sts_topo_process_agents`` topic is fetched with ``curl`` on
    ``host`` and saved to ``./topic-topo-process-agents-dnat.json``. The
    endpoint component of the ubuntu private IP must exist, and three
    ``directional_connection`` relations (fedora process -> endpoint, fedora
    process -> ubuntu process, endpoint -> ubuntu process) must carry the
    expected IPs. The check is handed to ``util.wait_until`` with arguments
    30 and 3.
    """
    url = "http://localhost:7070/api/topic/sts_topo_process_agents?offset=0&limit=1000"

    ubuntu_private_ip = _get_instance_config("agent-ubuntu")["private_address"]
    fedora_private_ip = _get_instance_config("agent-fedora")["private_address"]
    dnat_service_port = int(common_vars["dnat_service_port"])
    dnat_server_port = int(common_vars["dnat_server_port"])

    def wait_for_components():
        raw = host.check_output("curl \"%s\"" % url)
        json_data = json.loads(raw)
        with open("./topic-topo-process-agents-dnat.json", 'w') as dump_file:
            json.dump(json_data, dump_file, indent=4)

        def connection_data(id_pattern):
            # Data of the directional_connection relation whose external id
            # matches the given compiled pattern.
            return _relation_data(
                json_data=json_data,
                type_name="directional_connection",
                external_id_assert_fn=lambda ext_id: id_pattern.findall(
                    ext_id))

        endpoint_match = re.compile(
            "urn:endpoint:/.*:{}".format(ubuntu_private_ip))
        endpoint = _find_component(
            json_data=json_data,
            type_name="endpoint",
            external_id_assert_fn=lambda ext_id: endpoint_match.findall(
                ext_id))
        assert json.loads(endpoint["data"])["ip"] == ubuntu_private_ip
        endpoint_component_id = endpoint["externalId"]

        # fedora process -> service endpoint
        proc_to_proc_id_match = re.compile(
            "TCP:/urn:process:/agent-fedora:.*:.*->{}:{}".format(
                endpoint_component_id, dnat_service_port))
        # fedora process -> ubuntu server process
        proc_to_service_id_match = re.compile(
            "TCP:/urn:process:/agent-fedora:.*:.*->urn:process:/agent-ubuntu:.*:.*:{}:{}"
            .format(ubuntu_private_ip, dnat_server_port))
        # service endpoint -> ubuntu server process
        service_to_proc_id_match = re.compile(
            "TCP:/{}:{}->urn:process:/agent-ubuntu:.*:.*:{}:{}".format(
                endpoint_component_id, dnat_service_port, ubuntu_private_ip,
                dnat_server_port))

        proc_to_proc = connection_data(proc_to_proc_id_match)
        assert proc_to_proc["outgoing"]["ip"] == fedora_private_ip
        proc_to_service = connection_data(proc_to_service_id_match)
        assert proc_to_service["outgoing"]["ip"] == fedora_private_ip
        service_to_proc = connection_data(service_to_proc_id_match)
        assert service_to_proc["incoming"]["ip"] == ubuntu_private_ip

    util.wait_until(wait_for_components, 30, 3)
def test_created_connection_before_start(host, common_vars):
    """Check connections that already existed before the agent started.

    The ``sts_correlate_endpoints`` topic is fetched with ``curl`` on
    ``host`` and saved to ``./topic-correlate-endpoint-before.json``. For the
    fedora -> ubuntu and windows -> ubuntu connections (ports taken from
    ``common_vars``) the outgoing and incoming records are looked up and
    their direction and type are asserted. The check is handed to
    ``util.wait_until`` with arguments 30 and 3.
    """
    url = "http://localhost:7070/api/topic/sts_correlate_endpoints?limit=1000"

    fedora_conn_port = int(common_vars["connection_port_before_start_fedora"])
    windows_conn_port = int(
        common_vars["connection_port_before_start_windows"])

    ubuntu_private_ip = _get_instance_config("agent-ubuntu")["private_address"]
    print("ubuntu private: {}".format(ubuntu_private_ip))
    fedora_private_ip = _get_instance_config("agent-fedora")["private_address"]
    print("fedora private: {}".format(fedora_private_ip))
    windows_private_ip = _get_instance_config("agent-win")["private_address"]
    print("windows private: {}".format(windows_private_ip))

    def wait_for_connection():
        raw = host.check_output("curl \"%s\"" % url)
        json_data = json.loads(raw)
        with open("./topic-correlate-endpoint-before.json", 'w') as dump_file:
            json.dump(json_data, dump_file, indent=4)

        # (lookup function, port, source ip, expected direction), in the
        # order the checks are performed.
        checks = (
            # Outgoing gets no direction from Linux /proc scanning
            (_find_outgoing_connection, fedora_conn_port, fedora_private_ip,
             "NONE"),
            (_find_incoming_connection, fedora_conn_port, fedora_private_ip,
             "INCOMING"),
            (_find_outgoing_connection, windows_conn_port,
             windows_private_ip, "OUTGOING"),
            (_find_incoming_connection, windows_conn_port,
             windows_private_ip, "INCOMING"),
        )
        for find_connection, port, source_ip, direction in checks:
            connection = find_connection(json_data, port, source_ip,
                                         ubuntu_private_ip)
            print(connection)
            assert connection["direction"] == direction
            assert connection["connectionType"] == "TCP"

    util.wait_until(wait_for_connection, 30, 3)
Example #20
0
def test_topology_filtering(host, ansible_var):
    """Check which processes and relations survive topology filtering.

    The ``sts_topo_process_agents`` topic is fetched with ``curl`` on
    ``host`` and saved to ``./topic-topo-process-agents-filtering.json``.
    The stress process must be present, the short-lived python processes
    must be absent, and of the three python server/client pairs only the
    multiple-requests and shared-connection ones must have a network
    relation. The check is handed to ``util.wait_until`` with arguments 120
    and 3.
    """
    url = "http://localhost:7070/api/topic/sts_topo_process_agents?offset=0&limit=2000"

    def wait_for_components():
        raw = host.check_output("curl \"%s\"" % url)
        json_data = json.loads(raw)
        with open("./topic-topo-process-agents-filtering.json", 'w') as dump_file:
            json.dump(json_data, dump_file, indent=4)

        def find_process(cmd_pattern):
            # Process component whose command arguments match the pattern.
            return _find_process_by_command_args(
                json_data=json_data,
                type_name="process",
                cmd_assert_fn=lambda cmd: cmd_pattern.findall(cmd)
            )

        # assert that we get the stress process and that it contains the top resource tags
        stress_process_match = re.compile("/usr/bin/stress --vm .* --vm-bytes .*")
        stress_process = find_process(stress_process_match)
        assert stress_process["command"]["exe"] == "/usr/bin/stress"

        # assert that we don't get the short-lived python processes
        short_lived_process_match = re.compile("python -c import time; time.sleep(.*);")
        assert find_process(short_lived_process_match) is None

        # assert that we get the 3 python simple http servers + clients and expected relations
        # (port variable, client command, whether a relation is expected)
        relation_cases = (
            # single requests server + client and no relation
            ("network_relation_test_server_port_single_request",
             "python single-request.py", False),
            # multiple requests server + client and their relation
            ("network_relation_test_server_port_multiple_requests",
             "python multiple-requests.py", True),
            # shared connection requests server + client and their relation
            ("network_relation_test_server_port_shared_connection",
             "python shared-connection-requests.py", True),
        )
        for port_variable, client_command, relation_expected in relation_cases:
            relation = _network_relation(
                json_data=json_data,
                server_port=ansible_var(port_variable),
                request_process_cmd=client_command
            )
            assert (relation is not None) == relation_expected

    util.wait_until(wait_for_components, 120, 3)
def test_headless_pod_to_pod(host, ansible_var, topic_api):
    """Check that the `nc` client -> `ncat` server connection is reported.

    Reads the process-agent topology topic, locates the server and the
    client process by their command lines, and asserts that a
    ``directional_connection`` relation exists whose external id links the
    two processes on the headless service port.
    """
    topo_url = "{0}/sts_topo_process_agents?limit=1000".format(topic_api)

    # The server listens on the same port the headless service exposes.
    port = int(ansible_var("headless_service_port"))
    cluster = ansible_var("cluster_name")

    def wait_for_components():
        topology = json.loads(host.check_output("curl \"%s\"" % topo_url))
        # Dump the raw topic contents to disk so failures can be inspected.
        with open("./topic-topo-process-agents-headless.json", 'w') as dump:
            json.dump(topology, dump, indent=4)

        def find_process(command_regex):
            # Look up a "process" component whose command matches the regex.
            compiled = re.compile(command_regex)
            return _find_process_by_command_args(
                json_data=topology,
                type_name="process",
                cmd_assert_fn=lambda v: compiled.findall(v))

        server = find_process(
            "ncat -vv --broker --listen -p {}".format(port))
        assert server is not None
        server_created = server["createTime"]
        server_pid = server["pid"]
        server_host = server["host"]

        client = find_process("nc -vv headless-service {}".format(port))
        assert client is not None
        client_created = client["createTime"]
        client_pid = client["pid"]
        client_host = client["host"]

        # External id of the expected relation: client process -> server
        # process, qualified by cluster name and server port.
        relation_regex = re.compile(
            "TCP:/urn:process:/{}:{}:{}->urn:process:/{}:{}:{}:{}:.*:{}".format(
                client_host, client_pid, client_created,
                server_host, server_pid, server_created,
                cluster, port))

        relation = _relation_data(
            json_data=topology,
            type_name="directional_connection",
            external_id_assert_fn=lambda v: relation_regex.findall(v))
        assert relation is not None

    util.wait_until(wait_for_components, 120, 3)
Example #22
0
    def wait_until_replica_is_closed(self, datanode):
        """Wait for this container's replica on ``datanode`` to be CLOSED.

        Raises:
            Exception: if the replica is still not CLOSED once the wait
                has finished.
        """
        def replica_closed():
            # A replica that cannot be found yet counts as "not closed".
            try:
                state = self.cluster.get_container_state(self.container_id,
                                                         datanode)
            except ContainerNotFoundError:
                return False
            return state == 'CLOSED'

        status_sleep = int(os.environ["CONTAINER_STATUS_SLEEP"])
        util.wait_until(replica_closed, status_sleep, 10)
        # Re-check after waiting: fail loudly if the state never arrived.
        if replica_closed():
            return
        raise Exception("Replica is not closed!")
def test_stackstate_trace_agent_no_log_errors(host, hostname):
    """Check the trace-agent log for startup markers and for error lines.

    First waits until the expected success messages appear in the log, then
    asserts that no line of the log contains "error" (case-insensitive).
    """
    log_path = "/var/log/stackstate-agent/trace-agent.log"
    success_markers = (
        "total number of tracked services",
        "trace-agent running on host",
    )

    def wait_for_check_successes():
        contents = _get_log(host, hostname, log_path)
        for marker in success_markers:
            assert re.search(marker, contents)

    util.wait_until(wait_for_check_successes, 30, 3)

    # Fetch the log again and scan it line by line for errors.
    for log_line in _get_log(host, hostname, log_path).splitlines():
        print("Considering: %s" % log_line)
        assert not re.search("error", log_line, re.IGNORECASE)
Example #24
0
    def wait_until_all_replicas_are_closed(self):
        """Wait for every replica of this container to be CLOSED.

        Raises:
            Exception: if at least one replica is still not CLOSED once the
                wait has finished.
        """
        def every_replica_closed():
            # A container that cannot be found yet counts as "not closed".
            try:
                return all(
                    self.cluster.get_container_state(self.container_id,
                                                     node) == 'CLOSED'
                    for node in self.cluster.get_container_datanodes(
                        self.container_id))
            except ContainerNotFoundError:
                return False

        status_sleep = int(os.environ["CONTAINER_STATUS_SLEEP"])
        util.wait_until(every_replica_closed, status_sleep, 10)
        # Re-check after waiting: fail loudly if the state never arrived.
        if every_replica_closed():
            return
        raise Exception("Not all the replicas are closed!")
def test_stackstate_trace_agent_no_log_errors(host, hostname):
    """Check the trace-agent log for startup markers and for error lines.

    The log is fetched under the name "<hostname>-trace-agent". The test
    first waits until the expected success messages appear, then asserts
    that no line of the log contains "error" (case-insensitive).
    """
    log_path = "/var/log/stackstate-agent/trace-agent.log"
    log_owner = "{}-{}".format(hostname, "trace-agent")

    def wait_for_check_successes():
        contents = _get_log(host, log_owner, log_path)
        for marker in ("Trace agent running on host", "No data received"):
            assert re.search(marker, contents)

    util.wait_until(wait_for_check_successes, 30, 3)

    # Fetch the log again and scan it line by line for errors.
    for log_line in _get_log(host, log_owner, log_path).splitlines():
        print("Considering: %s" % log_line)
        assert not re.search("error", log_line, re.IGNORECASE)
def test_state_events(host):
    """Check that every agent host reported the expected state events.

    Reads the ``sts_state_events`` topic, groups the event names by the
    host that emitted them, and asserts that each expected host reported
    all of the expected event names.
    """
    url = "http://localhost:7070/api/topic/sts_state_events?offset=0&limit=80"

    expected_events = {
        "stackstate.agent.up", "stackstate.agent.check_status", "ntp.in_sync"
    }
    agent_hosts = (
        "agent-ubuntu", "agent-fedora", "agent-centos",
        "agent-connection-namespaces", "agent-win",
    )

    def wait_for_metrics():
        data = host.check_output("curl \"%s\"" % url)
        json_data = json.loads(data)
        # Dump the raw topic contents to disk so failures can be inspected.
        with open("./topic-state-events.json", 'w') as f:
            json.dump(json_data, f, indent=4)

        # host name -> set of state event names seen for that host
        state_events = defaultdict(set)
        for message in json_data["messages"]:
            state_event = message["message"]["StateEvent"]
            state_events[state_event["host"]].add(state_event["name"])

        print(state_events)
        # The previous form, all([t for t in expected if t in seen]), only
        # checked the truthiness of the tags that were present and therefore
        # could never fail. Compare the sets explicitly instead.
        for agent_host in agent_hosts:
            missing = expected_events - state_events[agent_host]
            assert not missing, "%s is missing state events: %s" % (
                agent_host, sorted(missing))

    util.wait_until(wait_for_metrics, 30, 3)
def test_dnat(host, ansible_var, topic_api):
    """Check that a connection to the DNAT service endpoint is reported.

    Finds the endpoint component for the service IP in the process-agent
    topology topic and asserts that a ``directional_connection`` relation
    from a process to that endpoint (on the service port) carries the
    "pod-client" pod IP as its outgoing IP.
    """
    topo_url = "{0}/sts_topo_process_agents?limit=1000".format(topic_api)
    correlate_url = "{0}/sts_correlate_endpoints?limit=100".format(topic_api)

    service_port = int(ansible_var("dnat_service_port"))
    namespace = ansible_var("namespace")
    kubeconfig = ansible_var("kubeconfig")
    kubecontext = ansible_var("kubecontext")

    def fetch_and_dump(topic_url, dump_path):
        # Fetch a topic via curl on the host and keep a copy on disk.
        payload = json.loads(host.check_output("curl \"%s\"" % topic_url))
        with open(dump_path, 'w') as dump:
            json.dump(payload, dump, indent=4)
        return payload

    def wait_for_components():
        topology = fetch_and_dump(topo_url,
                                  "./topic-topo-process-agents-dnat.json")

        # The correlate topic is only dumped for debugging; nothing below
        # asserts on it.
        fetch_and_dump(correlate_url,
                       "./topic-topo-process-agents-dnat-correlate.json")

        service_ip = _get_service_ip(kubeconfig, kubecontext, host,
                                     namespace)
        client_ip = _get_pod_ip(kubeconfig, kubecontext, host, namespace,
                                "pod-client")

        endpoint_regex = re.compile(
            "urn:endpoint:/.*:{}".format(service_ip))
        endpoint = _find_component(
            json_data=topology,
            type_name="endpoint",
            external_id_assert_fn=lambda v: endpoint_regex.findall(v))
        assert json.loads(endpoint["data"])["ip"] == service_ip

        # Relation external id: any process -> this endpoint on the service
        # port.
        endpoint_id = endpoint["externalId"]
        relation_regex = re.compile(
            "TCP:/urn:process:/.*:.*->{}:{}".format(endpoint_id,
                                                    service_port))

        relation = _relation_data(
            json_data=topology,
            type_name="directional_connection",
            external_id_assert_fn=lambda v: relation_regex.findall(v))
        assert relation["outgoing"]["ip"] == client_ip

    util.wait_until(wait_for_components, 120, 3)
def test_datanode_isolation_all():
    """Isolate every datanode from the others, then heal the network.

    The network is partitioned so that no datanode can reach another
    datanode, while each of them can still reach OM and SCM.

    Once the partition is in place, SCM is expected to detect it and close
    the pipeline, which in turn tries to close the containers; at least one
    replica should end up CLOSED.

    After the network is restored, all three replicas should be CLOSED.
    """
    def wait_for(predicate):
        # The environment variable is read at each wait, as the waits
        # happen at different stages of the test.
        util.wait_until(predicate,
                        int(os.environ["CONTAINER_STATUS_SLEEP"]), 10)

    cluster.run_freon(1, 1, 1, 10240)

    # Every datanode must hold at least one container before partitioning.
    for index in range(3):
        assert len(cluster.get_container_states(
            cluster.datanodes[index])) != 0

    logger.info("Partitioning the network")
    # One partition per datanode; OM and SCM belong to all of them.
    partitions = [[cluster.om, cluster.scm, cluster.datanodes[index]]
                  for index in range(3)]
    cluster.partition_network(*partitions)

    logger.info("Waiting for the replica to be CLOSED")
    wait_for(lambda: cluster.container_state_predicate_one_closed(
        cluster.datanodes))

    # At least one of the replicas should be in CLOSED state.
    assert cluster.container_state_predicate_one_closed(cluster.datanodes)

    # After restoring the network all the replicas should be in
    # CLOSED state.
    logger.info("Restoring the network")
    cluster.restore_network()

    logger.info("Waiting for the container to be replicated")
    wait_for(lambda: cluster.container_state_predicate_all_closed(
        cluster.datanodes))
    assert cluster.container_state_predicate_all_closed(cluster.datanodes)
def test_stackstate_process_agent_no_log_errors(host, hostname):
    """Check the process-agent log for startup markers and for error lines.

    First waits until the first check has finished (and, except on
    "agent-centos", until the network tracer start message appears), then
    asserts that no line of the log contains "error" (case-insensitive).
    """
    log_path = "/var/log/stackstate-agent/process-agent.log"

    def wait_for_check_successes():
        contents = _get_log(host, hostname, log_path)
        assert re.search("Finished check #1", contents)
        # The network tracer message is not required on agent-centos.
        if hostname == "agent-centos":
            return
        assert re.search("starting network tracer locally", contents)

    util.wait_until(wait_for_check_successes, 30, 3)

    # Fetch the log again and scan it line by line for errors.
    for log_line in _get_log(host, hostname, log_path).splitlines():
        print("Considering: %s" % log_line)
        assert not re.search("error", log_line, re.IGNORECASE)
def _check_logs(host, controller_name, success_regex, ignored_errors_regex):
    """Check the logs of all pods of a controller for success and errors.

    Waits until the log of every pod returned for ``controller_name``
    matches ``success_regex``. Afterwards asserts that no log line contains
    "error" (case-insensitive), skipping lines that match any pattern in
    ``ignored_errors_regex``.
    """
    def wait_for_successful_post():
        for pod_name in _get_pods(host, controller_name):
            assert re.search(success_regex, _get_log(host, pod_name))

    def is_ignored(log_line):
        # Every pattern is evaluated against the line (no short-circuit).
        hits = [
            pattern for pattern in ignored_errors_regex
            if len(re.findall(pattern, log_line, re.DOTALL)) > 0
        ]
        return bool(hits)

    util.wait_until(wait_for_successful_post, 30, 3)

    for pod_name in _get_pods(host, controller_name):
        for log_line in _get_log(host, pod_name).splitlines():
            if is_ignored(log_line):
                continue
            print("Considering: %s" % log_line)
            assert not re.search("error", log_line, re.IGNORECASE)