def test_all_kafka_monitors(version):
    """Exercise the kafka, kafka_producer and kafka_consumer monitors together.

    Starts the broker from ``run_kafka`` plus two more "kafka" service
    containers (``START_AS=producer`` with ``JMX_PORT=8099`` and
    ``START_AS=consumer`` with ``JMX_PORT=9099``), configures one collectd
    monitor per container and asserts that one characteristic datapoint per
    monitor reaches the backend within 60 seconds each.

    Args:
        version: forwarded to ``run_kafka`` and used as the ``KAFKA_VERSION``
            build argument of the producer and consumer services.
    """
    with run_kafka(version) as broker:
        broker_ip = container_ip(broker)
        producer_env = {
            "JMX_PORT": "8099",
            "START_AS": "producer",
            "KAFKA_BROKER": "%s:9092" % (broker_ip, ),
        }
        with run_service("kafka", environment=producer_env, buildargs={"KAFKA_VERSION": version}) as producer:
            producer_ip = container_ip(producer)
            consumer_env = {
                "JMX_PORT": "9099",
                "START_AS": "consumer",
                "KAFKA_BROKER": "%s:9092" % (broker_ip, ),
            }
            with run_service("kafka", environment=consumer_env, buildargs={"KAFKA_VERSION": version}) as consumer:
                consumer_ip = container_ip(consumer)
                # The placeholders are filled in before dedenting, as in the
                # original call order.
                agent_config = textwrap.dedent(
                    """
                    monitors:
                     - type: collectd/kafka
                       host: {0}
                       port: 7099
                       clusterName: testCluster
                     - type: collectd/kafka_producer
                       host: {1}
                       port: 8099
                     - type: collectd/kafka_consumer
                       host: {2}
                       port: 9099
                    """.format(broker_ip, producer_ip, consumer_ip)
                )
                with run_agent(agent_config) as [backend, _, _]:
                    # (predicate, failure message), checked in this order.
                    expectations = [
                        (
                            p(has_datapoint_with_metric_name, backend, "gauge.kafka-active-controllers"),
                            "Didn't get kafka datapoints",
                        ),
                        (
                            p(has_datapoint_with_dim, backend, "cluster", "testCluster"),
                            "Didn't get cluster dimension from kafka datapoints",
                        ),
                        (
                            p(has_datapoint_with_dim, backend, "client-id", "console-producer"),
                            "Didn't get client-id dimension from kafka_producer datapoints",
                        ),
                        (
                            p(has_datapoint_with_dim, backend, "client-id", "consumer-1"),
                            "Didn't get client-id dimension from kafka_consumer datapoints",
                        ),
                    ]
                    for check, failure_message in expectations:
                        assert wait_for(check, timeout_seconds=60), failure_message
def test_kong(kong_image):  # pylint: disable=redefined-outer-name
    """Check that the collectd/kong monitor reports datapoints.

    Sequence: start postgres and wait for ``pg_isready``; run a throw-away
    Kong container (``sleep inf``) to apply ``kong migrations up``; start the
    real Kong container and wait for its ``/signalfx`` endpoint on port 8001
    to answer 200; finally run the agent and wait for a datapoint with
    ``plugin=kong``.

    Args:
        kong_image: image name used for both the migrations and the Kong
            container.
    """
    kong_env = {
        "KONG_ADMIN_LISTEN": "0.0.0.0:8001",
        "KONG_LOG_LEVEL": "warn",
        "KONG_DATABASE": "postgres",
        "KONG_PG_DATABASE": "kong",
    }
    postgres_env = {"POSTGRES_USER": "******", "POSTGRES_DB": "kong"}

    with run_container("postgres:9.5", environment=postgres_env) as db:
        db_ip = container_ip(db)
        kong_env["KONG_PG_HOST"] = db_ip

        assert wait_for(lambda: db.exec_run("pg_isready -U kong").exit_code == 0)

        with run_container(kong_image, environment=kong_env, command="sleep inf") as migrations:
            psql_probe = "psql -h {} -U kong".format(db_ip)
            # The database must be reachable from the Kong side before migrating.
            assert wait_for(lambda: migrations.exec_run(psql_probe).exit_code == 0)
            migration_result = migrations.exec_run("kong migrations up --v")
            assert migration_result.exit_code == 0

        with run_container(kong_image, environment=kong_env) as kong:
            kong_ip = container_ip(kong)

            def kong_is_listening():
                # Connection errors just mean "not up yet" while polling.
                try:
                    response = get("http://{}:8001/signalfx".format(kong_ip))
                except RequestException:
                    return False
                return response.status_code == 200

            assert wait_for(kong_is_listening)

            template = string.Template(
                dedent(
                    """
                    monitors:
                      - type: collectd/kong
                        host: $host
                        port: 8001
                        metrics:
                          - metric: connections_handled
                            report: true
                    """
                )
            )
            config = template.substitute(host=container_ip(kong))

            with run_agent(config) as [backend, _, _]:
                got_kong_datapoint = p(has_datapoint_with_dim, backend, "plugin", "kong")
                assert wait_for(got_kong_datapoint), "Didn't get Kong data point"
def run_kafka(version):
    """
    Runs a kafka container with zookeeper

    Generator that yields exactly once: it starts a zookeeper container,
    waits for port 2181, starts a "kafka" service with ``START_AS=broker``
    (``JMX_PORT=7099``) and a second "kafka" service with
    ``START_AS=create-topic``, then yields the broker container. Everything
    is torn down when the generator is resumed or closed.

    It is consumed with a ``with`` statement elsewhere in this file, so it is
    presumably wrapped by ``contextlib.contextmanager``; no decorator is
    visible on this definition -- TODO confirm.

    Args:
        version: used as the ``KAFKA_VERSION`` build argument of both kafka
            services.

    Yields:
        The broker container.

    Raises:
        AssertionError: if zookeeper does not open port 2181 within 60 seconds.
    """
    with run_container("zookeeper:3.5") as zookeeper:
        zkhost = container_ip(zookeeper)
        assert wait_for(p(tcp_socket_open, zkhost, 2181), 60), "zookeeper didn't start"
        with run_service(
            "kafka",
            environment={
                "JMX_PORT": "7099",
                "KAFKA_ZOOKEEPER_CONNECT": "%s:2181" % (zkhost, ),
                "START_AS": "broker",
            },
            buildargs={"KAFKA_VERSION": version},
        ) as kafka_container:
            # run_service is used as a context manager everywhere else in this
            # file. The previous bare call only created the manager without
            # entering it, so the create-topic service was never started (or,
            # at best, never cleaned up). Enter it for as long as the broker
            # is handed out.
            with run_service(
                "kafka",
                environment={
                    "START_AS": "create-topic",
                    "KAFKA_ZOOKEEPER_CONNECT": "%s:2181" % (zkhost, ),
                },
                buildargs={"KAFKA_VERSION": version},
            ):
                yield kafka_container
def test_elasticsearch_without_cluster():
    """Check that the elasticsearch monitor picks up a late-starting service.

    The container is started with ``sleep inf`` as its entrypoint, so
    Elasticsearch itself is not running at first. The test asserts that no
    ``plugin=elasticsearch`` datapoints arrive, then launches the service
    inside the container, waits for ``/_nodes/_local`` to return 200 and
    asserts that datapoints now arrive.
    """
    es_env = {"cluster.name": "testCluster"}
    # start the ES container without the service
    with run_service("elasticsearch/6.4.2", environment=es_env, entrypoint="sleep inf") as es_container:
        host = container_ip(es_container)
        config = dedent(
            f"""
            monitors:
            - type: collectd/elasticsearch
              host: {host}
              port: 9200
              username: elastic
              password: testing123
            """
        )
        with run_agent(config) as [backend, _, _]:
            got_es_datapoint = p(has_datapoint_with_dim, backend, "plugin", "elasticsearch")
            assert not wait_for(got_es_datapoint), "datapoints found without service"

            # start ES service and make sure it gets discovered
            es_container.exec_run("/usr/local/bin/docker-entrypoint.sh eswrapper", detach=True)
            es_is_up = p(http_status, url=f"http://{host}:9200/_nodes/_local", status=[200])
            assert wait_for(es_is_up, 180), "service didn't start"
            assert wait_for(got_es_datapoint), "Didn't get elasticsearch datapoints"
def test_elasticsearch_with_cluster_option():
    """Check that the monitor's ``cluster`` option overrides plugin_instance.

    Elasticsearch runs with ``cluster.name=testCluster`` while the monitor is
    configured with ``cluster: testCluster1``. Datapoints must carry
    ``plugin_instance=testCluster1``, none may carry ``testCluster`` (checked
    for 10 seconds), and the lower-cased agent output must not contain
    "error".
    """
    with run_service("elasticsearch/6.4.2", environment={"cluster.name": "testCluster"}) as es_container:
        host = container_ip(es_container)
        es_is_up = p(http_status, url=f"http://{host}:9200/_nodes/_local", status=[200])
        assert wait_for(es_is_up, 180), "service didn't start"

        config = dedent(
            f"""
            monitors:
            - type: collectd/elasticsearch
              host: {host}
              port: 9200
              username: elastic
              password: testing123
              cluster: testCluster1
            """
        )
        with run_agent(config) as [backend, get_output, _]:
            got_es_datapoint = p(has_datapoint_with_dim, backend, "plugin", "elasticsearch")
            assert wait_for(got_es_datapoint), "Didn't get elasticsearch datapoints"

            got_override = p(has_datapoint_with_dim, backend, "plugin_instance", "testCluster1")
            assert wait_for(got_override), "Cluster name not picked from read callback"

            # make sure all plugin_instance dimensions were overridden by the cluster option
            got_real_name = p(has_datapoint_with_dim, backend, "plugin_instance", "testCluster")
            assert not wait_for(got_real_name, 10), "plugin_instance dimension not overridden by cluster option"

            agent_log = get_output().lower()
            assert not has_log_message(agent_log, "error"), "error found in agent output!"
def test_elasticsearch_with_additional_metrics():
    """Check that metrics listed under ``additionalMetrics`` are reported.

    Runs Elasticsearch 6.2.0, configures the monitor with two additional
    metrics and asserts that both ``gauge.cluster.initializing-shards`` and
    ``gauge.thread_pool.threads`` arrive, and that the lower-cased agent
    output does not contain "error".
    """
    with run_service("elasticsearch/6.2.0", environment={"cluster.name": "testCluster"}) as es_container:
        host = container_ip(es_container)
        es_is_up = p(http_status, url=f"http://{host}:9200/_nodes/_local", status=[200])
        assert wait_for(es_is_up, 180), "service didn't start"

        config = dedent(
            f"""
            monitors:
            - type: collectd/elasticsearch
              host: {host}
              port: 9200
              username: elastic
              password: testing123
              additionalMetrics:
               - cluster.initializing-shards
               - thread_pool.threads
            """
        )
        with run_agent(config) as [backend, get_output, _]:
            got_es_datapoint = p(has_datapoint_with_dim, backend, "plugin", "elasticsearch")
            assert wait_for(got_es_datapoint), "Didn't get elasticsearch datapoints"

            got_shards = p(has_datapoint_with_metric_name, backend, "gauge.cluster.initializing-shards")
            assert wait_for(got_shards), "Didn't get gauge.cluster.initializing-shards metric"

            got_threads = p(has_datapoint_with_metric_name, backend, "gauge.thread_pool.threads")
            assert wait_for(got_threads), "Didn't get gauge.thread_pool.threads metric"

            agent_log = get_output().lower()
            assert not has_log_message(agent_log, "error"), "error found in agent output!"
def test_elasticsearch_with_threadpool():
    """Check that configured thread pools show up as a datapoint dimension.

    Runs Elasticsearch 6.2.0 with ``threadPools`` set to bulk, index and
    search, then asserts that a datapoint with ``thread_pool=bulk`` arrives
    and that the lower-cased agent output does not contain "error".
    """
    with run_service("elasticsearch/6.2.0", environment={"cluster.name": "testCluster"}) as es_container:
        host = container_ip(es_container)
        es_is_up = p(http_status, url=f"http://{host}:9200/_nodes/_local", status=[200])
        assert wait_for(es_is_up, 180), "service didn't start"

        config = dedent(
            f"""
            monitors:
            - type: collectd/elasticsearch
              host: {host}
              port: 9200
              username: elastic
              password: testing123
              threadPools:
               - bulk
               - index
               - search
            """
        )
        with run_agent(config) as [backend, get_output, _]:
            got_es_datapoint = p(has_datapoint_with_dim, backend, "plugin", "elasticsearch")
            assert wait_for(got_es_datapoint), "Didn't get elasticsearch datapoints"

            got_bulk_pool = p(has_datapoint_with_dim, backend, "thread_pool", "bulk")
            assert wait_for(got_bulk_pool), "Didn't get bulk thread pool metrics"

            agent_log = get_output().lower()
            assert not has_log_message(agent_log, "error"), "error found in agent output!"
def test_jenkins(version):
    """Check that the collectd/jenkins monitor reports datapoints.

    Builds and runs the "jenkins" service, waits for port 8080 to open and
    for the metrics ping URL (keyed by ``METRICS_KEY``) to return 200, then
    asserts that a datapoint with ``plugin=jenkins`` arrives.

    Args:
        version: used as the ``JENKINS_VERSION`` build argument.
    """
    build_args = {"JENKINS_VERSION": version, "JENKINS_PORT": "8080"}
    with run_service("jenkins", buildargs=build_args) as jenkins_container:
        host = container_ip(jenkins_container)
        config = dedent(
            f"""
            monitors:
            - type: collectd/jenkins
              host: {host}
              port: 8080
              metricsKey: {METRICS_KEY}
            """
        )

        port_is_open = p(tcp_socket_open, host, 8080)
        assert wait_for(port_is_open, 60), "service not listening on port"

        ping_url = f"http://{host}:8080/metrics/{METRICS_KEY}/ping/"
        assert wait_for(p(http_status, url=ping_url, status=[200]), 120), "service didn't start"

        with run_agent(config) as [backend, _, _]:
            got_jenkins_datapoint = p(has_datapoint_with_dim, backend, "plugin", "jenkins")
            assert wait_for(got_jenkins_datapoint), "Didn't get jenkins datapoints"
def test_hadoopjmx(version, nodeType):
    """Check that the hadoopjmx monitor reports datapoints for one node type.

    Any new versions of hadoop should be manually built, tagged, and pushed
    to quay.io, i.e.

        docker build -t quay.io/signalfx/hadoop-test:<version> --build-arg HADOOP_VER=<version> <repo_root>/test-services/hadoop
        docker push quay.io/signalfx/hadoop-test:<version>

    Two containers are started ("hadoop-master" and "hadoop-worker1"). The
    master is monitored for ``nameNode`` and ``resourceManager``, the worker
    for every other node type. For the two YARN node types an ``export``
    line built from ``YARN_OPTS`` is appended to ``YARN_ENV_PATH`` before
    hadoop is started.

    Args:
        version: tag of the ``quay.io/signalfx/hadoop-test`` image.
        nodeType: key into ``NODETYPE_PORT`` (and ``YARN_VAR`` for YARN
            nodes); also the expected ``nodeType`` dimension value.
    """
    image = "quay.io/signalfx/hadoop-test:%s" % version
    with run_container(image, hostname="hadoop-master") as hadoop_master:
        with run_container(image, hostname="hadoop-worker1") as hadoop_worker1:
            runs_on_master = nodeType in ["nameNode", "resourceManager"]
            container = hadoop_master if runs_on_master else hadoop_worker1
            host = container_ip(container)
            port = NODETYPE_PORT[nodeType]

            if nodeType in ["resourceManager", "nodeManager"]:
                yarn_var = YARN_VAR[nodeType]
                yarn_opts = YARN_OPTS % (yarn_var, port, yarn_var)
                export_line = "echo 'export %s' >> %s" % (yarn_opts, YARN_ENV_PATH)
                container.exec_run(["/bin/bash", "-c", export_line])

            start_hadoop(hadoop_master, hadoop_worker1)

            # wait for jmx to be available
            jmx_is_open = p(tcp_socket_open, host, port)
            assert wait_for(jmx_is_open, 60), "jmx service not listening on port %d" % port

            # start the agent with hadoopjmx config
            config = HADOOPJMX_CONFIG.substitute(host=host, port=port, nodeType=nodeType)
            with run_agent(config) as [backend, _, _]:
                got_node_datapoint = p(has_datapoint_with_dim, backend, "nodeType", nodeType)
                assert wait_for(got_node_datapoint), (
                    "Didn't get hadoopjmx datapoints for nodeType %s" % nodeType
                )
def test_python_runner_with_redis():
    """Check that redis datapoints resume after the runner process is killed.

    After the first ``plugin=redis_info`` datapoint arrives, the PID matched
    by ``PID_RE`` in the agent output is sent SIGTERM. The test sleeps three
    seconds, clears the backend's stored datapoints and asserts that
    ``redis_info`` datapoints arrive again. It finally checks that
    ``counter.lru_clock`` is reported as a cumulative counter.
    """
    with run_container("redis:4-alpine") as test_container:
        host = container_ip(test_container)
        config = MONITOR_CONFIG.substitute(host=host, bundle_root=BUNDLE_DIR)

        port_is_open = p(tcp_socket_open, host, 6379)
        assert wait_for(port_is_open, 60), "redis is not listening on port"

        redis_client = redis.StrictRedis(host=host, port=6379, db=0)
        assert wait_for(redis_client.ping, 60), "service didn't start"

        with run_agent(config) as [backend, get_output, _]:
            got_redis_datapoint = p(has_datapoint_with_dim, backend, "plugin", "redis_info")
            assert wait_for(got_redis_datapoint), "didn't get datapoints"

            # The PID only becomes available once it has shown up in the output.
            assert wait_for(p(regex_search_matches_output, get_output, PID_RE.search))
            pid_match = PID_RE.search(get_output())
            pid = int(pid_match.groups()[0])

            os.kill(pid, signal.SIGTERM)
            time.sleep(3)
            # Drop everything received so far so that only new datapoints count.
            backend.datapoints.clear()

            assert wait_for(got_redis_datapoint), "didn't get datapoints after Python process was killed"

            lru_clock_is_counter = p(
                has_datapoint,
                backend,
                metric_name="counter.lru_clock",
                metric_type=sf_pbuf.CUMULATIVE_COUNTER,
            )
            assert wait_for(lru_clock_is_counter, timeout_seconds=3), "metric type was wrong"
def test_etcd_monitor():
    """Check that the etcd monitor reports datapoints.

    Runs etcd v2.3.8 with ``ETCD_COMMAND``, waits for port 2379 to open and
    asserts that a datapoint with ``plugin=etcd`` arrives.
    """
    with run_container("quay.io/coreos/etcd:v2.3.8", command=ETCD_COMMAND) as etcd_cont:
        host = container_ip(etcd_cont)
        config = ETCD_CONFIG.substitute(host=host)

        port_is_open = p(tcp_socket_open, host, 2379)
        assert wait_for(port_is_open, 60), "service didn't start"

        with run_agent(config) as [backend, _, _]:
            got_etcd_datapoint = p(has_datapoint_with_dim, backend, "plugin", "etcd")
            assert wait_for(got_etcd_datapoint), "Didn't get etcd datapoints"
def get_client(self):
    """Return the Docker client associated with this object.

    When ``self.container`` is truthy, the container is reloaded and
    ``self.client`` is replaced by a new ``docker.DockerClient`` pointing at
    TCP port 2375 on the container's IP. Otherwise the current
    ``self.client`` is returned untouched.
    """
    if not self.container:
        return self.client

    self.container.reload()
    base_url = "tcp://%s:2375" % container_ip(self.container)
    self.client = docker.DockerClient(base_url=base_url, version="auto")
    return self.client
def test_bad_globbing():
    """Check that the agent reports an unsupported zookeeper glob.

    Starts zookeeper 3.4, creates the ``/env`` znode, runs the agent with
    ``BAD_GLOB_CONFIG`` and waits for "Zookeeper only supports globs" to
    appear in the agent output.
    """
    with run_container("zookeeper:3.4") as zk_cont:
        zkhost = container_ip(zk_cont)
        assert wait_for(p(tcp_socket_open, zkhost, 2181), 30)
        create_znode(zk_cont, "/env", "prod")

        endpoint = "%s:2181" % zkhost
        final_conf = BAD_GLOB_CONFIG.substitute(zk_endpoint=endpoint)
        with run_agent(final_conf) as [_, get_output, _]:

            def glob_error_logged():
                return "Zookeeper only supports globs" in get_output()

            assert wait_for(glob_error_logged)
def test_health_checker_tcp():
    """Check that the health_checker monitor reports datapoints for nginx.

    Runs the "nginx" service, waits for port 80 to open and asserts that a
    datapoint with ``plugin=health_checker`` arrives.
    """
    with run_service("nginx") as nginx_container:
        host = container_ip(nginx_container)
        port_is_open = p(tcp_socket_open, host, 80)
        assert wait_for(port_is_open, 60), "service didn't start"

        agent_config = CONFIG.substitute(host=host)
        with run_agent(agent_config) as [backend, _, _]:
            got_health_datapoint = p(has_datapoint_with_dim, backend, "plugin", "health_checker")
            assert wait_for(got_health_datapoint), "Didn't get health_checker datapoints"
def run_redis(image="redis:4-alpine"):
    """Start a redis container and yield its address and a client.

    Generator that yields exactly once, after port 6379 is open and the
    client's ``ping`` succeeds. The container lives until the generator is
    resumed or closed.

    Args:
        image: container image to run.

    Yields:
        A two-element list ``[host, redis_client]``.

    Raises:
        AssertionError: if the port does not open or ``ping`` does not
            succeed within 60 seconds each.
    """
    with run_container(image) as redis_container:
        host = container_ip(redis_container)
        port_is_open = p(tcp_socket_open, host, 6379)
        assert wait_for(port_is_open, 60), "service not listening on port"

        redis_client = redis.StrictRedis(host=host, port=6379, db=0)
        assert wait_for(redis_client.ping, 60), "service didn't start"

        yield [host, redis_client]
def test_nginx():
    """Check that the nginx monitor reports datapoints.

    Runs the "nginx" service, waits for port 80 to open and asserts that a
    datapoint with ``plugin=nginx`` arrives.
    """
    with run_service("nginx") as nginx_container:
        host = container_ip(nginx_container)
        config = NGINX_CONFIG.substitute(host=host)

        port_is_open = p(tcp_socket_open, host, 80)
        assert wait_for(port_is_open, 60), "service didn't start"

        with run_agent(config) as [backend, _, _]:
            got_nginx_datapoint = p(has_datapoint_with_dim, backend, "plugin", "nginx")
            assert wait_for(got_nginx_datapoint), "Didn't get nginx datapoints"
def test_haproxy(version):
    """Check that the haproxy monitor reports datapoints.

    Builds and runs the "haproxy" service, waits up to 120 seconds for port
    9000 to open and asserts that a datapoint with ``plugin=haproxy``
    arrives.

    Args:
        version: used as the ``HAPROXY_VERSION`` build argument.
    """
    build_args = {"HAPROXY_VERSION": version}
    with run_service("haproxy", buildargs=build_args) as service_container:
        host = container_ip(service_container)
        config = MONITOR_CONFIG.substitute(host=host)

        port_is_open = p(tcp_socket_open, host, 9000)
        assert wait_for(port_is_open, 120), "haproxy not listening on port"

        with run_agent(config) as [backend, _, _]:
            got_haproxy_datapoint = p(has_datapoint_with_dim, backend, "plugin", "haproxy")
            assert wait_for(got_haproxy_datapoint), "didn't get datapoints"
def test_apache():
    """Check that the apache monitor reports datapoints.

    Runs the "apache" service, waits for port 80 to open and asserts that a
    datapoint with ``plugin=apache`` arrives.
    """
    with run_service("apache") as apache_container:
        host = container_ip(apache_container)
        config = APACHE_CONFIG.substitute(host=host)

        port_is_open = p(tcp_socket_open, host, 80)
        assert wait_for(port_is_open, 60), "service didn't start"

        with run_agent(config) as [backend, _, _]:
            got_apache_datapoint = p(has_datapoint_with_dim, backend, "plugin", "apache")
            assert wait_for(got_apache_datapoint), "Didn't get apache datapoints"
def test_bad_globbing():
    """Check that the agent reports an unsupported zookeeper glob.

    Readiness is probed from inside the container with
    ``nc -z localhost 2181`` (5 second timeout). The ``/env`` znode is then
    created, the agent is run with ``BAD_GLOB_CONFIG`` and the test waits for
    "Zookeeper only supports globs" to appear in the agent output.
    """
    with run_container("zookeeper:3.4") as zk_cont:
        zk_is_listening = p(container_cmd_exit_0, zk_cont, "nc -z localhost 2181")
        assert wait_for(zk_is_listening, 5)
        create_znode(zk_cont, "/env", "prod")

        endpoint = "%s:2181" % container_ip(zk_cont)
        final_conf = BAD_GLOB_CONFIG.substitute(zk_endpoint=endpoint)
        with run_agent(final_conf) as [_, get_output, _]:

            def glob_error_logged():
                return "Zookeeper only supports globs" in get_output()

            assert wait_for(glob_error_logged)
def test_postgresql():
    """Check that the postgresql monitor reports datapoints.

    Runs postgres 10 with the ``ENV`` environment, waits for port 5432 to
    open and asserts that a datapoint with ``plugin=postgresql`` and a
    datapoint named ``pg_blks.toast_hit`` both arrive.
    """
    with run_container("postgres:10", environment=ENV) as cont:
        host = container_ip(cont)
        config = CONFIG_TEMP.substitute(host=host)

        port_is_open = p(tcp_socket_open, host, 5432)
        assert wait_for(port_is_open, 60), "service didn't start"

        with run_agent(config) as [backend, _, _]:
            got_pg_datapoint = p(has_datapoint_with_dim, backend, "plugin", "postgresql")
            assert wait_for(got_pg_datapoint), "Didn't get postgresql datapoints"

            got_toast_hit = p(has_datapoint_with_metric_name, backend, "pg_blks.toast_hit")
            assert wait_for(got_toast_hit)
def test_redis(image):
    """Check that the redis monitor reports datapoints for the given image.

    Waits for port 6379 to open and for the client's ``ping`` to succeed,
    then asserts that a datapoint with ``plugin=redis_info`` arrives.

    Args:
        image: redis container image to run.
    """
    with run_container(image) as test_container:
        host = container_ip(test_container)
        config = MONITOR_CONFIG.substitute(host=host)

        port_is_open = p(tcp_socket_open, host, 6379)
        assert wait_for(port_is_open, 60), "service not listening on port"

        redis_client = redis.StrictRedis(host=host, port=6379, db=0)
        assert wait_for(redis_client.ping, 60), "service didn't start"

        with run_agent(config) as [backend, _, _]:
            got_redis_datapoint = p(has_datapoint_with_dim, backend, "plugin", "redis_info")
            assert wait_for(got_redis_datapoint), "didn't get datapoints"
def container_is_running(client, name):
    """Report whether the named container is running.

    Args:
        client: object whose ``containers.get(name)`` returns the container.
        name: name passed to ``client.containers.get``.

    Returns:
        The result of ``container_ip`` for the container when its status is
        "running" (case-insensitive); ``False`` when the status is anything
        else, the container is not found, or the API error text contains
        "is not running".

    Raises:
        docker.errors.APIError: any other API error is re-raised.
    """
    try:
        cont = client.containers.get(name)
        cont.reload()
        is_up = cont.status.lower() == "running"
        return container_ip(cont) if is_up else False
    except docker.errors.NotFound:
        return False
    except docker.errors.APIError as err:
        if "is not running" not in str(err):
            raise
        return False
def distribute_hostnames(containers):
    """
    Append every container's "<ip> <hostname>" line to /etc/hosts on each of
    the other containers in the dictionary.

    Args:
        containers: mapping of hostname to container. Each container must
            support ``exec_run``; a container's own entry is not written to
            itself.
    """
    for hostname, container in containers.items():
        ip_addr = container_ip(container)
        append_entry = "echo '{0} {1}' >> /etc/hosts".format(ip_addr, hostname)
        for other_hostname, other_container in containers.items():
            if other_hostname == hostname:
                continue
            other_container.exec_run(["/bin/bash", "-c", append_entry])
def test_basic_etcd2_config():
    """Check that the agent reads monitor config and a dimension from etcd2.

    Seeds etcd with ``/env`` and two monitor definitions, runs the agent with
    ``CONFIG`` pointing at port 2379 and asserts that a
    ``plugin=signalfx-metadata`` datapoint and a datapoint with ``env=prod``
    arrive.
    """
    with run_container(ETCD2_IMAGE, command=ETCD_COMMAND) as etcd:
        etcd_is_up = p(container_cmd_exit_0, etcd, "/etcdctl ls")
        assert wait_for(etcd_is_up, 5), "etcd didn't start"

        seed_entries = [
            ("/env", "prod"),
            ("/monitors/cpu", "- type: collectd/cpu"),
            ("/monitors/signalfx-metadata", "- type: collectd/signalfx-metadata"),
        ]
        for path, value in seed_entries:
            create_path(etcd, path, value)

        endpoint = "%s:2379" % container_ip(etcd)
        final_conf = CONFIG.substitute(endpoint=endpoint)
        with run_agent(final_conf) as [backend, _, _]:
            got_metadata = p(has_datapoint_with_dim, backend, "plugin", "signalfx-metadata")
            assert wait_for(got_metadata), "Datapoints didn't come through"

            got_env_dim = p(has_datapoint_with_dim, backend, "env", "prod")
            assert wait_for(got_env_dim), "dimension wasn't set"
def test_openstack(devstack):
    """Check that the collectd/openstack monitor reports a known metric.

    Points the monitor at the identity endpoint of the ``devstack`` container
    and waits up to 60 seconds for any metric returned by
    ``get_monitor_metrics_from_selfdescribe("collectd/openstack")``.

    Args:
        devstack: container whose IP hosts the identity service.
    """
    host = container_ip(devstack)
    config = dedent(
        f"""
        monitors:
          - type: collectd/openstack
            authURL: http://{host}/identity/v3
            username: admin
            password: testing123
        """
    )
    with run_agent(config) as [backend, _, _]:
        expected_metrics = get_monitor_metrics_from_selfdescribe("collectd/openstack")
        got_any_metric = p(any_metric_found, backend, expected_metrics)
        assert wait_for(got_any_metric, 60), "Timed out waiting for openstack metrics"
def run_vault():
    """Start a vault 1.0.2 container and yield a client for it.

    Generator that yields exactly once. It waits for port 8200 and for
    "Root Token:" to appear in the container logs, extracts the token from
    the logs, creates an ``hvac.Client`` with it and enables a "file" audit
    device writing to stdout with the ``AUDIT_PREFIX`` prefix.

    Yields:
        A two-element list ``[client, get_audit_events]`` where
        ``get_audit_events()`` returns
        ``parse_audit_events_from_logs(vault_cont)``.

    Raises:
        AssertionError: if the port or the token line does not show up in
            time, or the extracted token is empty.
    """
    with run_container("vault:1.0.2") as vault_cont:
        vault_ip = container_ip(vault_cont)
        assert wait_for(p(tcp_socket_open, vault_ip, 8200), 30)

        def root_token_logged():
            return "Root Token:" in vault_cont.logs().decode("utf-8")

        assert wait_for(root_token_logged, 10)

        log_text = vault_cont.logs().decode("utf-8")
        token_match = re.search(r"Root Token: (.*)$", log_text, re.MULTILINE)
        token = token_match.group(1)
        assert token, "Could not get root token of vault server"

        client = hvac.Client(url=f"http://{vault_ip}:8200", token=token)
        audit_options = {"log_raw": True, "prefix": AUDIT_PREFIX, "file_path": "stdout"}
        client.sys.enable_audit_device(device_type="file", options=audit_options)

        def get_audit_events():
            return parse_audit_events_from_logs(vault_cont)

        yield [client, get_audit_events]
def test_cassandra():
    """Check that the cassandra monitor reports datapoints.

    Runs the "cassandra" service, waits for a listening-socket entry to
    appear in the container's ``/proc/net/tcp`` and asserts that
    ``counter.cassandra.ClientRequest.Read.Latency.Count`` arrives within 30
    seconds.
    """
    with run_service("cassandra") as cassandra_cont:
        config = CASSANDRA_CONFIG.substitute(host=container_ip(cassandra_cont))

        # Wait for the JMX port to be open in the container
        # (1C1F is hexadecimal for 7199).
        jmx_is_open = p(container_cmd_exit_0, cassandra_cont, "sh -c 'cat /proc/net/tcp | grep 1C1F'")
        assert wait_for(jmx_is_open), "Cassandra JMX didn't start"

        with run_agent(config) as [backend, _, _]:
            got_read_latency = p(
                has_datapoint_with_metric_name,
                backend,
                "counter.cassandra.ClientRequest.Read.Latency.Count",
            )
            assert wait_for(got_read_latency, 30), "Didn't get Cassandra datapoints"
def test_marathon(marathon_image):
    """Check that the collectd/marathon monitor reports datapoints.

    Starts zookeeper 3.5, then a marathon container configured to use it,
    waits for port 8080 and for ``/v2/info`` to return 200, and asserts that
    a datapoint with ``plugin=marathon`` arrives.

    Args:
        marathon_image: marathon container image to run.
    """
    with run_container("zookeeper:3.5") as zookeeper:
        zkhost = container_ip(zookeeper)
        assert wait_for(p(tcp_socket_open, zkhost, 2181), 60), "zookeeper didn't start"

        marathon_args = ["--master", "localhost:5050", "--zk", "zk://{0}:2181/marathon".format(zkhost)]
        with run_container(marathon_image, command=marathon_args) as service_container:
            host = container_ip(service_container)
            config = dedent(
                f"""
                monitors:
                - type: collectd/marathon
                  host: {host}
                  port: 8080
                """
            )

            port_is_open = p(tcp_socket_open, host, 8080)
            assert wait_for(port_is_open, 120), "marathon not listening on port"

            info_url = "http://{0}:8080/v2/info".format(host)
            assert wait_for(p(http_status, url=info_url, status=[200]), 120), "service didn't start"

            with run_agent(config) as [backend, _, _]:
                got_marathon_datapoint = p(has_datapoint_with_dim, backend, "plugin", "marathon")
                assert wait_for(got_marathon_datapoint), "didn't get datapoints"
def test_interior_globbing():
    """Check that monitors under a globbed etcd path are picked up, even late.

    Seeds etcd with ``/env`` and two ``/services/*/monitor`` entries, runs
    the agent with ``INTERNAL_GLOB_CONFIG`` and waits for output from the
    signalfx-metadata plugin. A third monitor (uptime) is then added while
    the agent is running and must start producing datapoints.
    """
    with run_container(ETCD2_IMAGE, command=ETCD_COMMAND) as etcd:
        etcd_is_up = p(container_cmd_exit_0, etcd, "/etcdctl ls")
        assert wait_for(etcd_is_up, 5), "etcd didn't start"

        seed_entries = [
            ("/env", "prod"),
            ("/services/cpu/monitor", "- type: collectd/cpu"),
            ("/services/signalfx/monitor", "- type: collectd/signalfx-metadata"),
        ]
        for path, value in seed_entries:
            create_path(etcd, path, value)

        endpoint = "%s:2379" % container_ip(etcd)
        final_conf = INTERNAL_GLOB_CONFIG.substitute(endpoint=endpoint)
        with run_agent(final_conf) as [backend, _, _]:
            # NOTE(review): this checks for an *event* while the failure
            # message talks about datapoints -- confirm which is intended.
            got_metadata = p(has_event_with_dim, backend, "plugin", "signalfx-metadata")
            assert wait_for(got_metadata), "Datapoints didn't come through"

            # Added while the agent is already running.
            create_path(etcd, "/services/uptime/monitor", "- type: collectd/uptime")
            got_uptime = p(has_datapoint_with_dim, backend, "plugin", "uptime")
            assert wait_for(got_uptime), "didn't get uptime datapoints"
def test_basic_zk_config():
    """Check that the agent reads monitor config and a dimension from zookeeper.

    Seeds zookeeper 3.4 with ``/env``, an empty ``/monitors`` node and two
    monitor definitions, runs the agent with ``CONFIG`` pointing at port 2181
    and asserts that a ``plugin=signalfx-metadata`` datapoint and a datapoint
    with ``env=prod`` arrive.
    """
    with run_container("zookeeper:3.4") as zk_cont:
        zkhost = container_ip(zk_cont)
        assert wait_for(p(tcp_socket_open, zkhost, 2181), 30)

        # Order matters: /monitors is created before its children.
        seed_znodes = [
            ("/env", "prod"),
            ("/monitors", ""),
            ("/monitors/cpu", "- type: collectd/cpu"),
            ("/monitors/signalfx-metadata", "- type: collectd/signalfx-metadata"),
        ]
        for path, value in seed_znodes:
            create_znode(zk_cont, path, value)

        endpoint = "%s:2181" % zkhost
        final_conf = CONFIG.substitute(zk_endpoint=endpoint)
        with run_agent(final_conf) as [backend, _, _]:
            got_metadata = p(has_datapoint_with_dim, backend, "plugin", "signalfx-metadata")
            assert wait_for(got_metadata)

            got_env_dim = p(has_datapoint_with_dim, backend, "env", "prod")
            assert wait_for(got_env_dim)