def run_kafka(version):
    """
    Start a kafka broker backed by a zookeeper container and yield the broker.

    A ``zookeeper:3.5`` container is started first and polled until TCP port
    2181 is open.  The ``kafka`` service is then run twice with
    ``KAFKA_VERSION=version`` as a build argument: once as the broker (JMX on
    port 7099) and once with ``START_AS=create-topic``.  The broker container
    is yielded while all of these contexts are still open.

    This is a generator; callers use it as ``with run_kafka(version) as kafka``.
    NOTE(review): the decorator that makes it a context manager is not visible
    in this block - confirm it is applied where this function is defined.
    """
    with run_container("zookeeper:3.5") as zookeeper:
        zkhost = container_ip(zookeeper)
        assert wait_for(p(tcp_socket_open, zkhost, 2181), 60), "zookeeper didn't start"

        zk_connect = "%s:2181" % (zkhost,)
        broker_env = {
            "JMX_PORT": "7099",
            "KAFKA_ZOOKEEPER_CONNECT": zk_connect,
            "START_AS": "broker",
        }
        topic_env = {
            "START_AS": "create-topic",
            "KAFKA_ZOOKEEPER_CONNECT": zk_connect,
        }

        with run_service("kafka", environment=broker_env, buildargs={"KAFKA_VERSION": version}) as kafka_container:
            # The topic-creation service has to be entered like every other
            # run_service() call; the previous bare call threw away the
            # returned context manager without ever entering it.
            with run_service("kafka", environment=topic_env, buildargs={"KAFKA_VERSION": version}):
                yield kafka_container
def test_all_kafka_monitors(version):
    """
    Check that the kafka broker, producer and consumer monitors all emit data.

    Runs a broker via ``run_kafka``, then a producer (JMX port 8099) and a
    consumer (JMX port 9099) pointed at the broker on port 9092, and finally
    an agent configured with one monitor per container.  Each expected
    datapoint is awaited with ``timeout_seconds=60``.
    """
    with run_kafka(version) as kafka:
        kafkahost = container_ip(kafka)
        broker_addr = "%s:9092" % (kafkahost,)

        producer_env = {"JMX_PORT": "8099", "START_AS": "producer", "KAFKA_BROKER": broker_addr}
        with run_service("kafka", environment=producer_env, buildargs={"KAFKA_VERSION": version}) as kafka_producer:
            kafkaproducerhost = container_ip(kafka_producer)

            consumer_env = {"JMX_PORT": "9099", "START_AS": "consumer", "KAFKA_BROKER": broker_addr}
            with run_service("kafka", environment=consumer_env, buildargs={"KAFKA_VERSION": version}) as kafka_consumer:
                kafkaconsumerhost = container_ip(kafka_consumer)

                # The YAML must be in block form: a sequence entry cannot
                # start on the same line as its parent mapping key.
                config = textwrap.dedent(
                    """
                    monitors:
                      - type: collectd/kafka
                        host: {0}
                        port: 7099
                        clusterName: testCluster
                      - type: collectd/kafka_producer
                        host: {1}
                        port: 8099
                      - type: collectd/kafka_consumer
                        host: {2}
                        port: 9099
                    """.format(kafkahost, kafkaproducerhost, kafkaconsumerhost)
                )

                with run_agent(config) as (backend, _, _):
                    # (predicate, failure message) pairs, checked in order.
                    expectations = [
                        (
                            p(has_datapoint_with_metric_name, backend, "gauge.kafka-active-controllers"),
                            "Didn't get kafka datapoints",
                        ),
                        (
                            p(has_datapoint_with_dim, backend, "cluster", "testCluster"),
                            "Didn't get cluster dimension from kafka datapoints",
                        ),
                        (
                            p(has_datapoint_with_dim, backend, "client-id", "console-producer"),
                            "Didn't get client-id dimension from kafka_producer datapoints",
                        ),
                        (
                            p(has_datapoint_with_dim, backend, "client-id", "consumer-1"),
                            "Didn't get client-id dimension from kafka_consumer datapoints",
                        ),
                    ]
                    for check, failure in expectations:
                        assert wait_for(check, timeout_seconds=60), failure
def test_docker_observer_labels_partial():
    """
    Test that docker observer picks up a partially configured endpoint from
    container labels.

    The monitor type and discovery rule come from the agent config; only the
    extra dimension is supplied through a container label.
    """
    # Block-form YAML: the collapsed single-line form does not parse.
    config = dedent(
        """
        observers:
          - type: docker
        monitors:
          - type: collectd/nginx
            discoveryRule: container_name =~ "nginx-disco-partial" && port == 80
        """
    )
    labels = {"agent.signalfx.com.config.80.extraDimensions": "{mydim: myvalue}"}

    with run_agent(config) as (backend, _, _):
        with run_service("nginx", name="nginx-disco-partial", labels=labels):
            assert wait_for(p(has_datapoint_with_dim, backend, "plugin", "nginx")), "Didn't get nginx datapoints"
            assert wait_for(p(has_datapoint_with_dim, backend, "mydim", "myvalue")), "Didn't get extra dimension"

        # The nginx context has exited at this point.
        # Let nginx be removed by docker observer and collectd restart
        time.sleep(5)
        backend.datapoints.clear()
        assert ensure_always(
            lambda: not has_datapoint_with_dim(backend, "container_name", "nginx-disco-partial"), 10
        )
def test_elasticsearch_without_cluster():
    """
    Check that the elasticsearch monitor only reports once the service runs.

    The container is started with ``sleep inf`` as its entrypoint, so no
    datapoints are expected at first.  The real entrypoint is then exec'd in
    the container and datapoints are expected after the HTTP API answers 200.
    """
    # start the ES container without the service
    with run_service(
        "elasticsearch/6.4.2", environment={"cluster.name": "testCluster"}, entrypoint="sleep inf"
    ) as es_container:
        host = container_ip(es_container)
        # Block-form YAML: the collapsed single-line form does not parse.
        config = dedent(
            f"""
            monitors:
              - type: collectd/elasticsearch
                host: {host}
                port: 9200
                username: elastic
                password: testing123
            """
        )
        with run_agent(config) as (backend, _, _):
            es_datapoint_seen = p(has_datapoint_with_dim, backend, "plugin", "elasticsearch")
            assert not wait_for(es_datapoint_seen), "datapoints found without service"

            # start ES service and make sure it gets discovered
            es_container.exec_run("/usr/local/bin/docker-entrypoint.sh eswrapper", detach=True)
            api_ready = p(http_status, url=f"http://{host}:9200/_nodes/_local", status=[200])
            assert wait_for(api_ready, 180), "service didn't start"
            assert wait_for(es_datapoint_seen), "Didn't get elasticsearch datapoints"
def test_elasticsearch_with_cluster_option():
    """
    Check that the monitor's ``cluster`` option overrides ``plugin_instance``.

    The service runs with cluster name ``testCluster`` while the monitor is
    configured with ``cluster: testCluster1``; datapoints must carry the
    configured value and none may carry the service's own cluster name.
    """
    with run_service("elasticsearch/6.4.2", environment={"cluster.name": "testCluster"}) as es_container:
        host = container_ip(es_container)
        api_ready = p(http_status, url=f"http://{host}:9200/_nodes/_local", status=[200])
        assert wait_for(api_ready, 180), "service didn't start"

        # Block-form YAML: the collapsed single-line form does not parse.
        config = dedent(
            f"""
            monitors:
              - type: collectd/elasticsearch
                host: {host}
                port: 9200
                username: elastic
                password: testing123
                cluster: testCluster1
            """
        )
        with run_agent(config) as (backend, get_output, _):
            assert wait_for(
                p(has_datapoint_with_dim, backend, "plugin", "elasticsearch")
            ), "Didn't get elasticsearch datapoints"
            assert wait_for(
                p(has_datapoint_with_dim, backend, "plugin_instance", "testCluster1")
            ), "Cluster name not picked from read callback"
            # make sure all plugin_instance dimensions were overridden by the cluster option
            assert not wait_for(
                p(has_datapoint_with_dim, backend, "plugin_instance", "testCluster"), 10
            ), "plugin_instance dimension not overridden by cluster option"
            assert not has_log_message(get_output().lower(), "error"), "error found in agent output!"
def test_elasticsearch_with_additional_metrics():
    """
    Check that metrics listed under ``additionalMetrics`` are reported.

    Waits for the elasticsearch HTTP API to answer 200, runs the agent with
    two additional metrics configured, and expects a datapoint for each as
    well as no "error" in the lower-cased agent output.
    """
    with run_service("elasticsearch/6.2.0", environment={"cluster.name": "testCluster"}) as es_container:
        host = container_ip(es_container)
        api_ready = p(http_status, url=f"http://{host}:9200/_nodes/_local", status=[200])
        assert wait_for(api_ready, 180), "service didn't start"

        # Block-form YAML: the collapsed single-line form does not parse.
        config = dedent(
            f"""
            monitors:
              - type: collectd/elasticsearch
                host: {host}
                port: 9200
                username: elastic
                password: testing123
                additionalMetrics:
                  - cluster.initializing-shards
                  - thread_pool.threads
            """
        )
        with run_agent(config) as (backend, get_output, _):
            assert wait_for(
                p(has_datapoint_with_dim, backend, "plugin", "elasticsearch")
            ), "Didn't get elasticsearch datapoints"
            assert wait_for(
                p(has_datapoint_with_metric_name, backend, "gauge.cluster.initializing-shards")
            ), "Didn't get gauge.cluster.initializing-shards metric"
            assert wait_for(
                p(has_datapoint_with_metric_name, backend, "gauge.thread_pool.threads")
            ), "Didn't get gauge.thread_pool.threads metric"
            assert not has_log_message(get_output().lower(), "error"), "error found in agent output!"
def test_elasticsearch_with_threadpool():
    """
    Check that thread pool metrics are reported when ``threadPools`` is set.

    Configures the ``bulk``, ``index`` and ``search`` pools and expects a
    datapoint with dimension ``thread_pool=bulk``, with no "error" in the
    lower-cased agent output.
    """
    with run_service("elasticsearch/6.2.0", environment={"cluster.name": "testCluster"}) as es_container:
        host = container_ip(es_container)
        api_ready = p(http_status, url=f"http://{host}:9200/_nodes/_local", status=[200])
        assert wait_for(api_ready, 180), "service didn't start"

        # Block-form YAML: the collapsed single-line form does not parse.
        config = dedent(
            f"""
            monitors:
              - type: collectd/elasticsearch
                host: {host}
                port: 9200
                username: elastic
                password: testing123
                threadPools:
                  - bulk
                  - index
                  - search
            """
        )
        with run_agent(config) as (backend, get_output, _):
            assert wait_for(
                p(has_datapoint_with_dim, backend, "plugin", "elasticsearch")
            ), "Didn't get elasticsearch datapoints"
            assert wait_for(
                p(has_datapoint_with_dim, backend, "thread_pool", "bulk")
            ), "Didn't get bulk thread pool metrics"
            assert not has_log_message(get_output().lower(), "error"), "error found in agent output!"
def test_jenkins(version):
    """
    Check that the collectd/jenkins monitor reports datapoints.

    Builds the ``jenkins`` service with ``JENKINS_VERSION=version`` on port
    8080, waits for the port and for the metrics ping endpoint (keyed by the
    module-level ``METRICS_KEY``) to answer 200, then runs the agent.
    """
    build_args = {"JENKINS_VERSION": version, "JENKINS_PORT": "8080"}
    with run_service("jenkins", buildargs=build_args) as jenkins_container:
        host = container_ip(jenkins_container)
        # Block-form YAML: the collapsed single-line form does not parse.
        config = dedent(
            f"""
            monitors:
              - type: collectd/jenkins
                host: {host}
                port: 8080
                metricsKey: {METRICS_KEY}
            """
        )
        assert wait_for(p(tcp_socket_open, host, 8080), 60), "service not listening on port"
        ping_ok = p(http_status, url=f"http://{host}:8080/metrics/{METRICS_KEY}/ping/", status=[200])
        assert wait_for(ping_ok, 120), "service didn't start"

        with run_agent(config) as (backend, _, _):
            assert wait_for(p(has_datapoint_with_dim, backend, "plugin", "jenkins")), "Didn't get jenkins datapoints"
def test_docker_observer_labels():
    """
    Test that docker observer picks up a fully configured endpoint from
    container labels.

    The agent config only enables the docker observer; the monitor type and
    interval for port 80 are supplied through container labels.
    """
    # Block-form YAML: the collapsed single-line form does not parse.
    config = dedent(
        """
        observers:
          - type: docker
        """
    )
    labels = {
        "agent.signalfx.com.monitorType.80": "collectd/nginx",
        "agent.signalfx.com.config.80.intervalSeconds": "1",
    }

    with run_agent(config) as (backend, _, _):
        with run_service("nginx", name="nginx-disco-full", labels=labels):
            assert wait_for(p(has_datapoint_with_dim, backend, "plugin", "nginx")), "Didn't get nginx datapoints"

        # The nginx context has exited at this point.
        # Let nginx be removed by docker observer and collectd restart
        time.sleep(5)
        backend.datapoints.clear()
        assert ensure_always(
            lambda: not has_datapoint_with_dim(backend, "container_name", "nginx-disco-full"), 10
        )
def test_health_checker_tcp():
    """
    Check that the health checker monitor reports against a running nginx.

    Waits for nginx to accept TCP connections on port 80, then runs the agent
    with the module-level ``CONFIG`` template filled in with the nginx host.
    """
    with run_service("nginx") as nginx_container:
        nginx_ip = container_ip(nginx_container)
        port_open = p(tcp_socket_open, nginx_ip, 80)
        assert wait_for(port_open, 60), "service didn't start"

        agent_config = CONFIG.substitute(host=nginx_ip)
        with run_agent(agent_config) as (backend, _, _):
            got_datapoint = p(has_datapoint_with_dim, backend, "plugin", "health_checker")
            assert wait_for(got_datapoint), "Didn't get health_checker datapoints"
def test_haproxy(version):
    """
    Check that the haproxy monitor reports datapoints.

    Builds the ``haproxy`` service with ``HAPROXY_VERSION=version``, waits
    for TCP port 9000 to open, then runs the agent with ``MONITOR_CONFIG``
    filled in with the container host.
    """
    build_args = {"HAPROXY_VERSION": version}
    with run_service("haproxy", buildargs=build_args) as service_container:
        haproxy_ip = container_ip(service_container)
        agent_config = MONITOR_CONFIG.substitute(host=haproxy_ip)

        port_open = p(tcp_socket_open, haproxy_ip, 9000)
        assert wait_for(port_open, 120), "haproxy not listening on port"

        with run_agent(agent_config) as (backend, _, _):
            got_datapoint = p(has_datapoint_with_dim, backend, "plugin", "haproxy")
            assert wait_for(got_datapoint), "didn't get datapoints"
def test_apache():
    """
    Check that the apache monitor reports datapoints.

    Waits for the ``apache`` service to accept TCP connections on port 80,
    then runs the agent with ``APACHE_CONFIG`` filled in with its host.
    """
    with run_service("apache") as apache_container:
        apache_ip = container_ip(apache_container)
        agent_config = APACHE_CONFIG.substitute(host=apache_ip)

        port_open = p(tcp_socket_open, apache_ip, 80)
        assert wait_for(port_open, 60), "service didn't start"

        with run_agent(agent_config) as (backend, _, _):
            got_datapoint = p(has_datapoint_with_dim, backend, "plugin", "apache")
            assert wait_for(got_datapoint), "Didn't get apache datapoints"
def test_nginx():
    """
    Check that the nginx monitor reports datapoints.

    Waits for the ``nginx`` service to accept TCP connections on port 80,
    then runs the agent with ``NGINX_CONFIG`` filled in with its host.
    """
    with run_service("nginx") as nginx_container:
        nginx_ip = container_ip(nginx_container)
        agent_config = NGINX_CONFIG.substitute(host=nginx_ip)

        port_open = p(tcp_socket_open, nginx_ip, 80)
        assert wait_for(port_open, 60), "service didn't start"

        with run_agent(agent_config) as (backend, _, _):
            got_datapoint = p(has_datapoint_with_dim, backend, "plugin", "nginx")
            assert wait_for(got_datapoint), "Didn't get nginx datapoints"
def test_docker_observer_use_host_bindings():
    """
    Check that only the container with a host port binding is monitored.

    Two nginx containers are started, one of them with ``80/tcp`` bound to
    ``127.0.0.1`` on port ``0``.  The agent is configured from
    ``HOST_BINDING_CONFIG`` with the host port read back from the container
    attributes.  Datapoints must carry the label of the bound container and
    never that of the unbound one.
    """
    with run_service("nginx", name="nginx-non-host-binding", labels={"mylabel": "non-host-binding"}):
        with run_service(
            "nginx",
            name="nginx-with-host-binding",
            labels={"mylabel": "with-host-binding"},
            ports={"80/tcp": ("127.0.0.1", 0)},
        ) as container_bind:
            port_bindings = container_bind.attrs["NetworkSettings"]["Ports"]
            host_port = port_bindings["80/tcp"][0]["HostPort"]

            with run_agent(HOST_BINDING_CONFIG.substitute(port=host_port)) as (backend, _, _):
                # This assertion fails when the datapoint IS seen, so its
                # message must say that rather than reuse the positive one.
                assert not wait_for(
                    p(has_datapoint_with_dim, backend, "mydim", "non-host-binding")
                ), "Got custom label dimension from the container without a host binding"
                assert wait_for(
                    p(has_datapoint_with_dim, backend, "mydim", "with-host-binding")
                ), "Didn't get custom label dimension"
def test_basic_service_discovery():
    """
    Check that nginx is discovered while running and dropped afterwards.

    With the agent running on the module-level ``CONFIG``, an nginx container
    named ``nginx-discovery`` must produce nginx datapoints; once it is gone
    no further nginx datapoints may arrive, and the agent output must not
    contain "error".
    """
    with run_agent(CONFIG) as (backend, get_output, _):
        with run_service("nginx", name="nginx-discovery"):
            nginx_seen = p(has_datapoint_with_dim, backend, "plugin", "nginx")
            assert wait_for(nginx_seen), "Didn't get nginx datapoints"

        # The nginx context has exited at this point.
        # Let nginx be removed by docker observer and collectd restart
        time.sleep(5)
        backend.datapoints.clear()

        def no_nginx_datapoints():
            return not has_datapoint_with_dim(backend, "plugin", "nginx")

        assert ensure_always(no_nginx_datapoints, 10)
        assert not has_log_message(get_output(), "error")
def test_docker_detects_new_containers():
    """
    Check that docker-container-stats picks up a container started later.

    The agent is started first and given five seconds before the nginx
    container is launched; a datapoint with that container's id is expected.
    """
    # Block-form YAML: the collapsed single-line form does not parse.
    config = """
        monitors:
          - type: docker-container-stats
        """
    with run_agent(config) as (backend, _, _):
        time.sleep(5)
        with run_service("nginx") as nginx_container:
            assert wait_for(
                p(has_datapoint_with_dim, backend, "container_id", nginx_container.id)
            ), "Didn't get nginx datapoints"
def test_docker_image_filtering():
    """
    Check that containers of an excluded image produce no datapoints.

    The nginx container's ``Image`` attribute is listed under
    ``excludedImages``; no datapoint with its container id may ever appear.
    """
    with run_service("nginx") as nginx_container:
        # Block-form YAML: the collapsed single-line form does not parse.
        config = """
            monitors:
              - type: docker-container-stats
                excludedImages:
                  - "%s"
            """ % nginx_container.attrs["Image"]
        with run_agent(config) as (backend, _, _):
            assert ensure_always(lambda: not has_datapoint_with_dim(backend, "container_id", nginx_container.id))
def test_docker_envvar_dimensions():
    """
    Check that ``envToDimensions`` maps a container env var to a dimension.

    nginx runs with ``APP=myserver`` and the monitor maps ``APP`` to ``app``,
    so a datapoint with dimension ``app=myserver`` is expected.
    """
    # Block-form YAML: the collapsed single-line form does not parse.
    config = """
        monitors:
          - type: docker-container-stats
            envToDimensions:
              APP: app
        """
    with run_service("nginx", environment={"APP": "myserver"}):
        with run_agent(config) as (backend, _, _):
            assert wait_for(
                p(has_datapoint_with_dim, backend, "app", "myserver")
            ), "Didn't get datapoint with service app"
def test_docker_label_dimensions():
    """
    Check that ``labelsToDimensions`` maps a container label to a dimension.

    nginx runs with label ``app=myserver`` and the monitor maps ``app`` to
    ``service``, so a datapoint with ``service=myserver`` is expected.
    """
    # Block-form YAML: the collapsed single-line form does not parse.
    config = """
        monitors:
          - type: docker-container-stats
            labelsToDimensions:
              app: service
        """
    with run_service("nginx", labels={"app": "myserver"}):
        with run_agent(config) as (backend, _, _):
            assert wait_for(
                p(has_datapoint_with_dim, backend, "service", "myserver")
            ), "Didn't get datapoint with service dim"
def test_cassandra():
    """
    Check that the cassandra monitor reports the read latency counter.

    The agent is only started once a grep for ``1C1F`` (hex for 7199) in the
    container's ``/proc/net/tcp`` exits with status 0.
    """
    with run_service("cassandra") as cassandra_cont:
        cassandra_ip = container_ip(cassandra_cont)
        agent_config = CASSANDRA_CONFIG.substitute(host=cassandra_ip)

        # Wait for the JMX port to be open in the container
        jmx_listening = p(container_cmd_exit_0, cassandra_cont, "sh -c 'cat /proc/net/tcp | grep 1C1F'")
        assert wait_for(jmx_listening), "Cassandra JMX didn't start"

        with run_agent(agent_config) as (backend, _, _):
            got_read_latency = p(
                has_datapoint_with_metric_name, backend, "counter.cassandra.ClientRequest.Read.Latency.Count"
            )
            assert wait_for(got_read_latency, 30), "Didn't get Cassandra datapoints"
def test_docker_observer():
    """
    Check discovery, label-derived dimensions and removal for nginx.

    With the agent running on the module-level ``CONFIG``, a labelled nginx
    container must yield nginx datapoints and a ``mydim=abc`` dimension; once
    it is gone no datapoints for ``nginx-discovery`` may arrive.
    """
    with run_agent(CONFIG) as (backend, _, _):
        with run_service("nginx", name="nginx-discovery", labels={"mylabel": "abc"}):
            nginx_seen = p(has_datapoint_with_dim, backend, "plugin", "nginx")
            assert wait_for(nginx_seen), "Didn't get nginx datapoints"

            label_dim_seen = p(has_datapoint_with_dim, backend, "mydim", "abc")
            assert wait_for(label_dim_seen), "Didn't get custom label dimension"

        # The nginx context has exited at this point.
        # Let nginx be removed by docker observer and collectd restart
        time.sleep(5)
        backend.datapoints.clear()

        def container_is_gone():
            return not has_datapoint_with_dim(backend, "container_name", "nginx-discovery")

        assert ensure_always(container_is_gone, 10)
def test_docker_stops_watching_paused_containers():
    """
    Check that a paused container stops producing datapoints.

    After datapoints for the nginx container have been seen, the container is
    paused, recorded datapoints are cleared after five seconds, and no new
    datapoint with that container id may appear.
    """
    # Block-form YAML: the collapsed single-line form does not parse.
    config = """
        monitors:
          - type: docker-container-stats
        """
    with run_service("nginx") as nginx_container:
        with run_agent(config) as (backend, _, _):
            assert wait_for(
                p(has_datapoint_with_dim, backend, "container_id", nginx_container.id)
            ), "Didn't get nginx datapoints"

            nginx_container.pause()
            time.sleep(5)
            backend.datapoints.clear()
            assert ensure_always(lambda: not has_datapoint_with_dim(backend, "container_id", nginx_container.id))
def test_couchbase(tag):
    """
    Check that the couchbase monitor reports datapoints.

    Builds the ``couchbase`` service with ``COUCHBASE_VERSION=tag`` and
    hostname ``node1.cluster``, waits for port 8091 and for ``/pools`` to
    answer with status 401, then runs the agent with ``COUCHBASE_CONFIG``.
    """
    with run_service(
        "couchbase", buildargs={"COUCHBASE_VERSION": tag}, hostname="node1.cluster"
    ) as couchbase_container:
        couchbase_ip = container_ip(couchbase_container)
        agent_config = COUCHBASE_CONFIG.substitute(host=couchbase_ip)

        port_open = p(tcp_socket_open, couchbase_ip, 8091)
        assert wait_for(port_open, 60), "service not listening on port"

        pools_url = "http://{0}:8091/pools".format(couchbase_ip)
        pools_answering = p(http_status, url=pools_url, status=[401])
        assert wait_for(pools_answering, 120), "service didn't start"

        with run_agent(agent_config) as (backend, _, _):
            got_datapoint = p(has_datapoint_with_dim, backend, "plugin", "couchbase")
            assert wait_for(got_datapoint), "Didn't get couchbase datapoints"
def test_health_checker_http():
    """
    Check that the health checker reports when probing an HTTP path.

    nginx is probed at ``/nonexistent`` on port 80; the test only requires
    that health_checker datapoints are emitted.
    """
    # Block-form YAML: the collapsed single-line form does not parse.
    config_template = string.Template(
        dedent(
            """
            monitors:
              - type: collectd/health-checker
                host: $host
                port: 80
                path: /nonexistent
            """
        )
    )
    with run_service("nginx") as nginx_container:
        host = container_ip(nginx_container)
        assert wait_for(p(tcp_socket_open, host, 80), 60), "service didn't start"

        with run_agent(config_template.substitute(host=host)) as (backend, _, _):
            assert wait_for(
                p(has_datapoint_with_dim, backend, "plugin", "health_checker")
            ), "Didn't get health_checker datapoints"
def test_docker_container_stats():
    """
    Check that docker-container-stats reports cpu, memory and container id.

    Expects the ``cpu.percent`` and ``memory.percent`` metrics and a
    datapoint carrying the running nginx container's id.
    """
    # Block-form YAML: the collapsed single-line form does not parse.
    config = """
        monitors:
          - type: docker-container-stats
        """
    with run_service("nginx") as nginx_container:
        with run_agent(config) as (backend, _, _):
            assert wait_for(
                p(has_datapoint_with_metric_name, backend, "cpu.percent")
            ), "Didn't get docker cpu datapoints"
            assert wait_for(
                p(has_datapoint_with_metric_name, backend, "memory.percent")
            ), "Didn't get docker memory datapoints"
            assert wait_for(
                p(has_datapoint_with_dim, backend, "container_id", nginx_container.id)
            ), "Didn't get nginx datapoints"
def test_solr_monitor():
    """
    Check that the solr monitor reports datapoints without agent errors.

    Waits for TCP port 8983, runs the agent, and expects solr datapoints, a
    persistent ``counter.solr.http_5xx_responses`` metric, and no "error" in
    the lower-cased agent output.
    """
    with run_service("solr") as solr_container:
        host = container_ip(solr_container)
        # Block-form YAML: the collapsed single-line form does not parse.
        config = dedent(
            f"""
            monitors:
              - type: collectd/solr
                host: {host}
                port: 8983
            """
        )
        assert wait_for(p(tcp_socket_open, host, 8983), 60), "service not listening on port"

        with run_agent(config) as (backend, get_output, _):
            assert wait_for(p(has_datapoint_with_dim, backend, "plugin", "solr")), "Didn't get solr datapoints"
            assert ensure_always(
                lambda: has_datapoint_with_metric_name(backend, "counter.solr.http_5xx_responses")
            )
            assert not has_log_message(get_output().lower(), "error"), "error found in agent output!"
def test_activemq():
    """
    Check that the activemq monitor reports at least one documented metric.

    Waits for TCP port 1099, runs the agent against the container's JMX
    service URL, and expects any metric from the list returned by
    ``get_monitor_metrics_from_selfdescribe("collectd/activemq")``.
    """
    with run_service("activemq") as activemq_container:
        host = container_ip(activemq_container)
        # Block-form YAML: the collapsed single-line form does not parse.
        config = dedent(
            f"""
            monitors:
              - type: collectd/activemq
                host: {host}
                port: 1099
                serviceURL: service:jmx:rmi:///jndi/rmi://{host}:1099/jmxrmi
                username: testuser
                password: testing123
            """
        )
        assert wait_for(p(tcp_socket_open, host, 1099), 60), "service didn't start"

        with run_agent(config) as (backend, _, _):
            metrics = get_monitor_metrics_from_selfdescribe("collectd/activemq")
            assert wait_for(p(any_metric_found, backend, metrics)), "Didn't get activemq datapoints"
def test_hadoop(version):
    """
    Check that the hadoop monitor reports against a master and one worker.

    The master is built from the ``hadoop`` service with
    ``HADOOP_VER=version``; the worker is a second container of the same
    image.  HDFS is formatted, dfs and yarn are started, and the agent runs
    once port 8088 on the master answers HTTP 200.
    """
    with run_service("hadoop", buildargs={"HADOOP_VER": version}, hostname="hadoop-master") as hadoop_master:
        with run_container(hadoop_master.image, hostname="hadoop-worker1") as hadoop_worker1:
            # distribute the ip and hostnames for each container
            distribute_hostnames({"hadoop-master": hadoop_master, "hadoop-worker1": hadoop_worker1})

            # format hdfs
            format_result = hadoop_master.exec_run(["/usr/local/hadoop/bin/hdfs", "namenode", "-format"])
            print_lines(format_result[1])

            # start hadoop and yarn
            for start_script in ("start-dfs.sh", "start-yarn.sh"):
                print_lines(hadoop_master.exec_run(start_script)[1])

            # wait for yarn api to be available
            master_ip = container_ip(hadoop_master)
            port_open = p(tcp_socket_open, master_ip, 8088)
            assert wait_for(port_open, 60), "service not listening on port"
            yarn_ready = p(http_status, url="http://{0}:8088".format(master_ip), status=[200])
            assert wait_for(yarn_ready, 120), "service didn't start"

            # start the agent with hadoop config
            agent_config = HADOOP_CONFIG.substitute(host=master_ip, port=8088)
            with run_agent(agent_config) as (backend, _, _):
                got_plugin = p(has_datapoint_with_dim, backend, "plugin", "apache_hadoop")
                assert wait_for(got_plugin), "Didn't get hadoop datapoints"

                one_active_node = p(has_datapoint, backend, "gauge.hadoop.cluster.metrics.active_nodes", {}, 1)
                assert wait_for(one_active_node), "expected 1 hadoop worker node"
def test_docker_observer_labels_multiple_monitors_per_port():
    """
    Test that we can configure multiple monitors per port using labels.

    Two collectd/nginx monitors are declared for port 80 through labels (the
    second under the ``80-nginx2`` key), each with a different ``app`` extra
    dimension; both dimension values must show up.
    """
    # Block-form YAML: the collapsed single-line form does not parse.
    config = dedent(
        """
        observers:
          - type: docker
        """
    )
    labels = {
        "agent.signalfx.com.monitorType.80": "collectd/nginx",
        "agent.signalfx.com.config.80.intervalSeconds": "1",
        "agent.signalfx.com.config.80.extraDimensions": "{app: nginx}",
        "agent.signalfx.com.monitorType.80-nginx2": "collectd/nginx",
        "agent.signalfx.com.config.80-nginx2.intervalSeconds": "1",
        "agent.signalfx.com.config.80-nginx2.extraDimensions": "{app: other}",
    }

    with run_agent(config) as (backend, _, _):
        with run_service("nginx", name="nginx-multi-monitors", labels=labels):
            assert wait_for(p(has_datapoint_with_dim, backend, "plugin", "nginx")), "Didn't get nginx datapoints"
            assert wait_for(p(has_datapoint_with_dim, backend, "app", "nginx")), "Didn't get extra dims"
            assert wait_for(p(has_datapoint_with_dim, backend, "app", "other")), "Didn't get extra dims"

        # The nginx context has exited at this point.
        # Let nginx be removed by docker observer and collectd restart
        time.sleep(5)
        backend.datapoints.clear()
        assert ensure_always(
            lambda: not has_datapoint_with_dim(backend, "container_name", "nginx-multi-monitors"), 10
        )