def run(config, metrics): with run_service("spark", command="bin/spark-class org.apache.spark.deploy.master.Master") as spark_master: master_ip = container_ip(spark_master) assert wait_for(p(tcp_socket_open, master_ip, 7077), 60), "master service didn't start" assert wait_for(p(tcp_socket_open, master_ip, 8080), 60), "master webui service didn't start" assert spark_master.exec_run("./sbin/start-history-server.sh").exit_code == 0, "history service didn't start" with run_service( "spark", command=f"bin/spark-class org.apache.spark.deploy.worker.Worker spark://{master_ip}:7077" ) as spark_worker: worker_ip = container_ip(spark_worker) assert wait_for(p(tcp_socket_open, worker_ip, 8081), 60), "worker webui service didn't start" spark_master.exec_run("nc -lk 9999", detach=True) spark_master.exec_run( f"bin/spark-submit --master spark://{master_ip}:7077 --conf spark.driver.host={master_ip} {SPARK_APP}", detach=True, ) assert wait_for(p(tcp_socket_open, master_ip, 4040), 60), "application service didn't start" config = config.format(master_ip=master_ip, worker_ip=worker_ip) with Agent.run(config) as agent: verify(agent, metrics, timeout=60) assert has_datapoint_with_dim( agent.fake_services, "plugin", "apache_spark" ), "Didn't get spark datapoints"
def test_postgresql_defaults():
    with run_container("postgres:10", environment=ENV) as cont:
        host = container_ip(cont)
        assert wait_for(p(tcp_socket_open, host, 5432), 60), "service didn't start"

        with Agent.run(
            f"""
monitors:
  - type: collectd/postgresql
    host: {host}
    port: 5432
    username: "******"
    password: "******"
    queries:
      - name: "exampleQuery"
        minVersion: 60203
        maxVersion: 200203
        statement: |
          SELECT coalesce(sum(n_live_tup), 0) AS live, coalesce(sum(n_dead_tup), 0) AS dead
          FROM pg_stat_user_tables;
        results:
          - type: gauge
            instancePrefix: live
            valuesFrom:
              - live
    databases:
      - name: test
        username: "******"
        password: "******"
        interval: 5
        expireDelay: 10
        sslMode: disable
"""
        ) as agent:
            verify(agent, METADATA.default_metrics)


def test_cgroup_monitor():
    with run_service(
        "nginx", cpu_period=100_000, cpu_quota=10000, cpu_shares=50, mem_limit=20 * 1024 * 1024
    ) as nginx_container:
        with Agent.run(
            """
monitors:
  - type: cgroups
    extraMetrics: ['*']
"""
        ) as agent:
            verify(agent, METADATA.all_metrics)

            expected_cgroup = "/docker/" + nginx_container.id

            assert wait_for(
                p(
                    has_datapoint,
                    agent.fake_services,
                    metric_name="cgroup.cpu_shares",
                    value=50,
                    dimensions={"cgroup": expected_cgroup},
                )
            )

            assert wait_for(
                p(
                    has_datapoint,
                    agent.fake_services,
                    metric_name="cgroup.cpu_cfs_period_us",
                    value=100_000,
                    dimensions={"cgroup": expected_cgroup},
                )
            )


def test_hadoop_default(version):
    """
    Any new versions of hadoop should be manually built, tagged, and pushed to quay.io, i.e.

    docker build \
        -t quay.io/signalfx/hadoop-test:<version> \
        --build-arg HADOOP_VER=<version> \
        <repo_root>/test-services/hadoop
    docker push quay.io/signalfx/hadoop-test:<version>
    """
    with run_container(
        "quay.io/signalfx/hadoop-test:%s" % version, hostname="hadoop-master"
    ) as hadoop_master, run_container(
        "quay.io/signalfx/hadoop-test:%s" % version, hostname="hadoop-worker1"
    ) as hadoop_worker1:
        host = start_hadoop(hadoop_master, hadoop_worker1)

        # start the agent with the hadoop config
        config = f"""
monitors:
  - type: collectd/hadoop
    host: {host}
    port: 8088
    verbose: true
"""
        with Agent.run(config) as agent:
            verify(agent, METADATA.default_metrics - EXCLUDED)
            # Wait explicitly for active_nodes to reach 1; it may be 0 at first
            # while the worker is still joining the cluster.
            assert wait_for(
                p(has_datapoint, agent.fake_services, "gauge.hadoop.cluster.metrics.active_nodes", {}, 1)
            ), "expected 1 hadoop worker node"
            assert has_datapoint_with_dim(
                agent.fake_services, "plugin", "apache_hadoop"
            ), "Didn't get hadoop datapoints"


def test_load_default():
    with Agent.run(
        """
monitors:
  - type: collectd/load
"""
    ) as agent:
        verify(agent, METADATA.default_metrics)
        assert not has_log_message(agent.output.lower(), "error"), "error found in agent output!"


def test_netio_defaults():
    with Agent.run(
        """
monitors:
  - type: net-io
"""
    ) as agent:
        verify(agent, METADATA.included_metrics)
        assert not has_log_message(agent.output.lower(), "error"), "error found in agent output!"


def run(config, metrics): with run_service("apache") as apache_container: host = container_ip(apache_container) config = config.format(host=host) assert wait_for(p(tcp_socket_open, host, 80), 60), "service didn't start" with Agent.run(config) as agent: verify(agent, metrics) assert has_datapoint_with_dim(agent.fake_services, "plugin", "apache"), "Didn't get apache datapoints"
def test_nginx_included():
    with run_nginx() as host, Agent.run(
        f"""
monitors:
  - type: collectd/nginx
    host: {host}
    port: 80
"""
    ) as agent:
        verify(agent, METADATA.included_metrics)
        assert has_datapoint_with_dim(agent.fake_services, "plugin", "nginx"), "Didn't get nginx datapoints"


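# For reference, a minimal, hypothetical sketch of what a helper like run_nginx()
# above might look like, assuming the run_service, container_ip, wait_for,
# tcp_socket_open, and p utilities used throughout these tests; the actual helper
# in the repo may differ.
from contextlib import contextmanager


@contextmanager
def run_nginx():
    with run_service("nginx") as nginx_container:
        host = container_ip(nginx_container)
        # Yield only once nginx is actually accepting connections on port 80.
        assert wait_for(p(tcp_socket_open, host, 80), 60), "nginx didn't start"
        yield host

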
def run(version, node_type, metrics, extra_metrics=""):
    with run_node(node_type, version) as (host, port):
        # start the agent with the hadoopjmx config
        config = HADOOPJMX_CONFIG.format(host=host, port=port, nodeType=node_type, extraMetrics=extra_metrics)
        with Agent.run(config) as agent:
            verify(agent, metrics)
            # Check for the expected dimension.
            assert has_datapoint_with_dim(
                agent.fake_services, "nodeType", node_type
            ), f"Didn't get hadoopjmx datapoints for nodeType {node_type}"


def test_haproxy_default_metrics_from_stats_page(version):
    with run_service("haproxy", buildargs={"HAPROXY_VERSION": version}) as service_container:
        host = container_ip(service_container)
        with Agent.run(
            f"""
monitors:
  - type: haproxy
    url: http://{host}:8080/stats?stats;csv
"""
        ) as agent:
            verify(agent, EXPECTED_DEFAULTS - EXPECTED_DEFAULTS_FROM_SOCKET, 10)


def test_kong_default(kong_version):
    with run_kong(kong_version) as kong_ip:
        config = f"""
monitors:
  - type: collectd/kong
    host: {kong_ip}
    port: 8001
"""
        with Agent.run(config) as agent:
            verify(agent, METADATA.default_metrics)
            assert has_datapoint_with_dim(agent.fake_services, "plugin", "kong"), "Didn't get Kong dimension"


def test_kubernetes_scheduler(k8s_cluster):
    config = """
observers:
  - type: k8s-api
monitors:
  - type: kubernetes-scheduler
    discoveryRule: kubernetes_pod_name =~ "kube-scheduler"
    port: 10251
    extraMetrics: ["*"]
"""
    with k8s_cluster.run_agent(config) as agent:
        verify(agent, METADATA.all_metrics)


def test_supervisor_default():
    with run_supervisor_fpm() as host, Agent.run(
        f"""
monitors:
  - type: supervisor
    host: {host}
    port: {PORT}
"""
    ) as agent:
        verify(agent, METADATA.default_metrics)
        assert has_datapoint_with_dim(
            agent.fake_services, "name", PROCESS
        ), f"Didn't get process name dimension {PROCESS}"


def test_haproxy_default_metrics_from_stats_page_by_discovery_rule(version):
    with run_service("haproxy", buildargs={"HAPROXY_VERSION": version}, name="haproxy"):
        with Agent.run(
            f"""
observers:
  - type: docker
monitors:
  - type: haproxy
    discoveryRule: 'container_name == "haproxy"'
"""
        ) as agent:
            verify(agent, EXPECTED_DEFAULTS - EXPECTED_DEFAULTS_FROM_SOCKET, 10)


def test_php_default():
    with run_php_fpm() as host, Agent.run(
        f"""
monitors:
  - type: collectd/php-fpm
    url: "http://{host}/status?json"
    name: {INSTANCE}
"""
    ) as agent:
        verify(agent, METADATA.default_metrics)
        assert has_datapoint_with_dim(
            agent.fake_services, "plugin", "curl_json"
        ), "Didn't get php-fpm datapoints"
        assert has_datapoint_with_dim(
            agent.fake_services, "plugin_instance", INSTANCE
        ), "Didn't get right instance dimension on datapoints"


def test_process_monitor_process_name_filter():
    proc = psutil.Process(os.getpid())
    self_proc_name = proc.name()
    with Agent.run(
        f"""
monitors:
  - type: process
    processes:
      - {self_proc_name}
"""
    ) as agent:
        verify(agent, METADATA.all_metrics)
        assert has_datapoint(agent.fake_services, dimensions={"command": self_proc_name})


def test_process_monitor_executable_filter():
    proc = psutil.Process(os.getpid())
    self_proc_exec = proc.exe()
    with Agent.run(
        f"""
monitors:
  - type: process
    executables:
      - {self_proc_exec}
"""
    ) as agent:
        verify(agent, METADATA.all_metrics)
        assert has_datapoint(agent.fake_services, dimensions={"executable": self_proc_exec})


def test_consul_defaults():
    with run_container("consul:1.4.4") as consul_cont:
        host = container_ip(consul_cont)
        assert wait_for(p(tcp_socket_open, host, 8500), 60), "consul service didn't start"
        with Agent.run(
            f"""
monitors:
  - type: collectd/consul
    host: {host}
    port: 8500
    enhancedMetrics: false
"""
        ) as agent:
            verify(agent, EXPECTED_DEFAULTS)


def test_memory():
    expected_metrics = {"memory.used", "memory.utilization"}
    if sys.platform == "linux":
        expected_metrics.update(
            {"memory.buffered", "memory.cached", "memory.free", "memory.slab_recl", "memory.slab_unrecl"}
        )
    with Agent.run(
        """
monitors:
  - type: memory
"""
    ) as agent:
        for met in expected_metrics:
            assert met in METADATA.default_metrics
        verify(agent, expected_metrics)


def test_kong_metric_config():
    """Test that enabling a metric via the metrics config flag lets it through the filter"""
    with run_kong(LATEST) as kong_ip:
        config = f"""
monitors:
  - type: collectd/kong
    host: {kong_ip}
    port: 8001
    metrics:
      - metric: connections_accepted
        report: true
"""
        with Agent.run(config) as agent:
            verify(agent, METADATA.default_metrics | {"counter.kong.connections.accepted"})
            assert has_datapoint_with_dim(agent.fake_services, "plugin", "kong"), "Didn't get Kong dimension"


def test_mongo_basic():
    with run_container("mongo:3.6") as mongo_cont:
        host = container_ip(mongo_cont)
        config = dedent(
            f"""
            monitors:
              - type: collectd/mongodb
                host: {host}
                port: 27017
                databases: [admin]
            """
        )
        assert wait_for(p(tcp_socket_open, host, 27017), 60), "service didn't start"
        with Agent.run(config) as agent:
            verify(agent, EXPECTED_DEFAULTS)


def test_haproxy_default_and_status_metrics_from_stats_page(version):
    with run_service("haproxy", buildargs={"HAPROXY_VERSION": version}) as service_container:
        host = container_ip(service_container)
        status_metric = "haproxy_status"
        with Agent.run(
            f"""
monitors:
  - type: haproxy
    url: http://{host}:8080/stats?stats;csv
    extraMetrics: [{status_metric}]
"""
        ) as agent:
            verify(agent, (EXPECTED_DEFAULTS | {status_metric}) - EXPECTED_DEFAULTS_FROM_SOCKET, 10)
            assert not has_log_message(agent.output.lower(), "error"), "error found in agent output!"


def test_kong_extra_metric():
    """Test that adding an extra metric enables the underlying config metric"""
    # counter.kong.connections.handled chosen because it's not reported by default by the monitor
    # and is not a default metric.
    with run_kong(LATEST) as kong_ip:
        config = f"""
monitors:
  - type: collectd/kong
    host: {kong_ip}
    port: 8001
    extraMetrics:
      - counter.kong.connections.handled
"""
        with Agent.run(config) as agent:
            verify(agent, METADATA.default_metrics | {"counter.kong.connections.handled"})
            assert has_datapoint_with_dim(agent.fake_services, "plugin", "kong"), "Didn't get Kong dimension"


def test_haproxy_basic(version):
    with run_service("haproxy", buildargs={"HAPROXY_VERSION": version}) as service_container:
        host = container_ip(service_container)
        assert wait_for(p(tcp_socket_open, host, 9000)), "haproxy not listening on port"
        with Agent.run(
            f"""
monitors:
  - type: collectd/haproxy
    host: {host}
    port: 9000
    enhancedMetrics: false
"""
        ) as agent:
            # Generate some traffic so request/session metrics have data to report.
            requests.get(f"http://{host}:80", timeout=5)
            requests.get(f"http://{host}:80", timeout=5)
            verify(agent, EXPECTED_DEFAULTS, 10)


def test_couchbase_included(tag):
    with run_couchbase(tag) as host, Agent.run(
        f"""
monitors:
  - type: collectd/couchbase
    host: {host}
    port: 8091
    collectTarget: NODE
    username: administrator
    password: password
"""
    ) as agent:
        verify(agent, (METADATA.metrics_by_group["nodes"] & METADATA.included_metrics) - EXCLUDED)
        assert has_datapoint_with_dim(
            agent.fake_services, "plugin", "couchbase"
        ), "Didn't get couchbase datapoints"


def run_all(version, metrics, extra_metrics=""):
    with run_kafka(version) as kafka:
        kafka_ip = container_ip(kafka)
        kafka_host = container_hostname(kafka)
        image = kafka.image.id
        # We add the Kafka broker host:ip as an extra_host because by default the Kafka broker advertises
        # itself with its hostname, and without this the producer and consumer wouldn't be able to resolve
        # the broker hostname.
        with run_producer(image, kafka_host, extra_hosts={kafka_host: kafka_ip}) as kafkaproducerhost, run_consumer(
            image, kafka_host, extra_hosts={kafka_host: kafka_ip}
        ) as kafkaconsumerhost, Agent.run(
            f"""
monitors:
  - type: collectd/kafka
    host: {kafka_ip}
    port: 7099
    clusterName: testCluster
    extraMetrics: {extra_metrics}
  - type: collectd/kafka_producer
    host: {kafkaproducerhost}
    port: 8099
    extraMetrics: {extra_metrics}
  - type: collectd/kafka_consumer
    host: {kafkaconsumerhost}
    port: 9099
    extraMetrics: {extra_metrics}
"""
        ) as agent:
            verify(agent, metrics)
            assert has_datapoint_with_dim(
                agent.fake_services, "cluster", "testCluster"
            ), "Didn't get cluster dimension from kafka datapoints"
            assert has_datapoint_with_dim(
                agent.fake_services, "client-id", "console-producer"
            ), "Didn't get client-id dimension from kafka_producer datapoints"
            assert has_datapoint_with_dim(
                agent.fake_services, "client-id", "consumer-1"
            ), "Didn't get client-id dimension from kafka_consumer datapoints"


def test_extra_metrics_passthrough():
    """
    The specified extraMetrics should be allowed through even though they are not included by default.
    """
    metadata = Metadata.from_package("expvar")
    with run_expvar() as expvar_container_ip:
        with Agent.run(
            f"""
monitors:
  - type: expvar
    host: {expvar_container_ip}
    port: 8080
    intervalSeconds: 1
    extraMetrics:
      - memstats.by_size.mallocs
"""
        ) as agent:
            assert "memstats.by_size.mallocs" in metadata.nonincluded_metrics
            verify(agent, metadata.included_metrics | {"memstats.by_size.mallocs"})


def test_built_in_filtering_disabled_no_whitelist_for_monitor():
    """
    Test a monitor that doesn't have any entries in whitelist.json
    """
    metadata = Metadata.from_package("expvar")
    with run_expvar() as expvar_container_ip:
        with Agent.run(
            f"""
enableBuiltInFiltering: false
monitors:
  - type: expvar
    host: {expvar_container_ip}
    port: 8080
    intervalSeconds: 1
    enhancedMetrics: true
    # This should be ignored
    extraMetrics:
      - memstats.by_size.mallocs
metricsToExclude:
  - {{"#from": "{REPO_ROOT_DIR}/whitelist.json", flatten: true}}
"""
        ) as agent:
            verify(agent, metadata.all_metrics)


def test_mongo_enhanced_metrics():
    with run_container("mongo:3.6") as mongo_cont:
        host = container_ip(mongo_cont)
        config = dedent(
            f"""
            monitors:
              - type: collectd/mongodb
                host: {host}
                port: 27017
                databases: [admin]
                sendCollectionMetrics: true
                sendCollectionTopMetrics: true
            """
        )
        assert wait_for(p(tcp_socket_open, host, 27017), 60), "service didn't start"
        with Agent.run(config) as agent:
            verify(
                agent,
                METADATA.metrics_by_group["collection"]
                | METADATA.metrics_by_group["collection-top"]
                | EXPECTED_DEFAULTS,
            )


def test_elasticsearch_included():
    with run_elasticsearch(environment={"cluster.name": "testCluster"}) as es_container:
        host = container_ip(es_container)
        config = f"""
monitors:
  - type: collectd/elasticsearch
    host: {host}
    port: 9200
    username: elastic
    password: testing123
"""
        with Agent.run(config) as agent:
            verify(agent, METADATA.default_metrics - EXCLUDED)
            assert has_datapoint_with_dim(
                agent.fake_services, "plugin", "elasticsearch"
            ), "Didn't get elasticsearch datapoints"
            assert has_datapoint_with_dim(
                agent.fake_services, "plugin_instance", "testCluster"
            ), "Cluster name not picked up from read callback"
            assert not has_log_message(agent.output.lower(), "error"), "error found in agent output!"
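

# Every test above funnels through verify(). As a rough, hypothetical sketch of
# the pattern (the real helper lives in the repo's shared test utilities and may
# check more, e.g. that no unexpected metrics arrive): wait until the fake
# backend has received every expected metric name, and fail listing the names
# still missing. The fake_services.datapoints_by_metric attribute is an
# assumption here, inferred from the has_datapoint* helpers used above.
def verify(agent, expected_metrics, timeout=60):
    def received_all():
        # True once every expected metric name has shown up at the fake backend.
        return expected_metrics <= frozenset(agent.fake_services.datapoints_by_metric)

    assert wait_for(received_all, timeout), (
        "timed out waiting for metrics: "
        f"{expected_metrics - frozenset(agent.fake_services.datapoints_by_metric)}"
    )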