def test_mongo():
    with run_container("mongo:3.6") as mongo_cont:
        host = container_ip(mongo_cont)
        config = dedent(
            f"""
            monitors:
              - type: collectd/mongodb
                host: {host}
                port: 27017
                databases: [admin]
            """
        )
        assert wait_for(p(tcp_socket_open, host, 27017), 60), "service didn't start"

        with run_agent(config) as [backend, _, _]:
            assert wait_for(p(has_datapoint_with_dim, backend, "plugin", "mongo")), "Didn't get mongo datapoints"


def test_chrony():
    """
    Unfortunately, chronyd is very hard to run in a test environment without
    giving it the ability to change the time, which we don't want, so just
    check for an error message ensuring that the monitor actually did
    configure it, even if it doesn't emit any metrics.
    """
    with run_agent(chrony_config) as [_, get_output, _]:

        def has_error():
            return has_log_message(
                get_output(), level="error", message="chrony plugin: chrony_query (REQ_TRACKING) failed"
            )

        assert wait_for(has_error), "Didn't get chrony error message"


def test_docker_observer():
    with run_agent(CONFIG) as [backend, _, _]:
        with run_service("nginx", name="nginx-discovery", labels={"mylabel": "abc"}):
            assert wait_for(
                p(has_datapoint_with_dim, backend, "plugin", "nginx")
            ), "Didn't get nginx datapoints"
            assert wait_for(
                p(has_datapoint_with_dim, backend, "mydim", "abc")
            ), "Didn't get custom label dimension"

        # Let nginx be removed by docker observer and collectd restart
        time.sleep(5)
        backend.reset_datapoints()

        assert ensure_always(
            lambda: not has_datapoint_with_dim(backend, "container_name", "nginx-discovery"), 10
        )


def test_basic_etcd2_config():
    with run_container(ETCD2_IMAGE, command=ETCD_COMMAND) as etcd:
        assert wait_for(p(container_cmd_exit_0, etcd, "/etcdctl ls"), 5), "etcd didn't start"

        create_path(etcd, "/env", "prod")
        create_path(etcd, "/monitors/cpu", "- type: collectd/cpu")
        create_path(etcd, "/monitors/signalfx-metadata", "- type: collectd/signalfx-metadata")

        final_conf = CONFIG.substitute(endpoint="%s:2379" % container_ip(etcd))
        with run_agent(final_conf) as [backend, _, _]:
            assert wait_for(
                p(has_datapoint_with_dim, backend, "plugin", "signalfx-metadata")
            ), "Datapoints didn't come through"
            assert wait_for(p(has_datapoint_with_dim, backend, "env", "prod")), "dimension wasn't set"


def test_protocols():
    """
    Test that we get any datapoints without any errors
    """
    expected_metrics = get_monitor_metrics_from_selfdescribe("collectd/protocols")
    expected_dims = get_monitor_dims_from_selfdescribe("collectd/protocols")
    with run_agent(
        """
monitors:
  - type: collectd/protocols
"""
    ) as [backend, get_output, _]:
        assert wait_for(
            p(has_any_metric_or_dim, backend, expected_metrics, expected_dims), timeout_seconds=60
        ), "timed out waiting for metrics and/or dimensions!"
        assert not has_log_message(get_output().lower(), "error"), "error found in agent output!"


def test_docker_container_stats():
    with run_service("nginx") as nginx_container:
        with run_agent(
            """
monitors:
  - type: docker-container-stats
"""
        ) as [backend, _, _]:
            assert wait_for(
                p(has_datapoint_with_metric_name, backend, "cpu.percent")
            ), "Didn't get docker cpu datapoints"
            assert wait_for(
                p(has_datapoint_with_metric_name, backend, "memory.percent")
            ), "Didn't get docker memory datapoints"
            assert wait_for(
                p(has_datapoint_with_dim, backend, "container_id", nginx_container.id)
            ), "Didn't get nginx datapoints"


def test_does_not_set_hostname_if_not_host_specific():
    with run_agent(
        """
hostname: acmeinc.com
disableHostDimensions: true
monitors:
  - type: collectd/signalfx-metadata
    persistencePath: /dev/null
  - type: collectd/cpu
  - type: collectd/uptime
"""
    ) as [backend, _, _]:
        assert ensure_always(
            lambda: not has_datapoint_with_dim(backend, "host", "acmeinc.com")
        ), "Got overridden hostname in datapoint"
        assert ensure_always(
            lambda: not has_event_with_dim(backend, "host", "acmeinc.com")
        ), "Got overridden hostname in event"


def test_win_perf_counters(monitor_config):
    measurement, config, include_total, metrics = monitor_config
    with run_agent(config) as [backend, get_output, _]:
        assert wait_for(
            p(has_datapoint_with_dim, backend, "plugin", "telegraf-win_perf_counters")
        ), "Didn't get %s datapoints" % measurement
        if include_total:
            assert wait_for(
                p(has_datapoint_with_dim, backend, "instance", "_Total")
            ), "Didn't get _Total datapoints"
        for metric in metrics:
            assert wait_for(p(has_datapoint_with_metric_name, backend, metric)), "Didn't get metric %s" % metric
        assert not has_log_message(get_output().lower(), "error"), "error found in agent output!"


def test_health_checker_http():
    with run_service("nginx") as nginx_container:
        host = container_ip(nginx_container)
        assert wait_for(p(tcp_socket_open, host, 80), 60), "service didn't start"

        with run_agent(
            string.Template(
                dedent(
                    """
                    monitors:
                      - type: collectd/health-checker
                        host: $host
                        port: 80
                        path: /nonexistent
                    """
                )
            ).substitute(host=host)
        ) as [backend, _, _]:
            assert wait_for(
                p(has_datapoint_with_dim, backend, "plugin", "health_checker")
            ), "Didn't get health_checker datapoints"


def test_health_checker_http_windows():
    with run_agent(
        string.Template(
            dedent(
                """
                monitors:
                  - type: collectd/health-checker
                    host: $host
                    port: 80
                    path: /
                """
            )
        ).substitute(host="localhost")
    ) as [backend, _, _]:
        assert wait_for(
            p(has_datapoint_with_dim, backend, "plugin", "health_checker")
        ), "Didn't get health_checker datapoints"


def test_couchbase(tag):
    with run_service(
        "couchbase", buildargs={"COUCHBASE_VERSION": tag}, hostname="node1.cluster"
    ) as couchbase_container:
        host = container_ip(couchbase_container)
        config = COUCHBASE_CONFIG.substitute(host=host)
        assert wait_for(p(tcp_socket_open, host, 8091), 60), "service not listening on port"
        assert wait_for(
            p(http_status, url="http://{0}:8091/pools".format(host), status=[401]), 120
        ), "service didn't start"

        with run_agent(config) as [backend, _, _]:
            assert wait_for(
                p(has_datapoint_with_dim, backend, "plugin", "couchbase")
            ), "Didn't get couchbase datapoints"


def test_conviva_single_metric():
    with run_agent(
        dedent(
            f"""
            monitors:
              - type: conviva
                pulseUsername: {{"#from": "env:CONVIVA_PULSE_USERNAME"}}
                pulsePassword: {{"#from": "env:CONVIVA_PULSE_PASSWORD"}}
                metricConfigs:
                  - metricParameter: concurrent_plays
            """
        )
    ) as [backend, _, _]:
        assert wait_for(lambda: len(backend.datapoints) > 0), "Didn't get conviva datapoints"
        assert ensure_always(
            p(all_datapoints_have_metric_name, backend, "conviva.concurrent_plays")
        ), "Received conviva datapoints for other metrics"


def test_omitting_kafka_metrics(version="1.0.1"):
    with run_kafka(version) as kafka:
        kafkahost = container_ip(kafka)
        with run_agent(
            textwrap.dedent(
                """
                monitors:
                  - type: collectd/kafka
                    host: {0}
                    port: 7099
                    clusterName: testCluster
                    mBeansToOmit:
                      - kafka-active-controllers
                """.format(kafkahost)
            )
        ) as [backend, _, _]:
            assert not wait_for(
                p(has_datapoint_with_metric_name, backend, "gauge.kafka-active-controllers"),
                timeout_seconds=60,
            ), "Got kafka-active-controllers datapoints despite the mBean being omitted"


def test_all_kafka_monitors(version):
    with run_kafka(version) as kafka:
        kafka_host = container_ip(kafka)
        with run_container(
            kafka.image.id,
            environment={"JMX_PORT": "8099", "START_AS": "producer", "KAFKA_BROKER": "%s:9092" % (kafka_host,)},
        ) as kafka_producer:
            kafkaproducerhost = container_ip(kafka_producer)
            assert wait_for(p(tcp_socket_open, kafkaproducerhost, 8099), 60), "kafka producer jmx didn't start"
            with run_container(
                kafka.image.id,
                environment={"JMX_PORT": "9099", "START_AS": "consumer", "KAFKA_BROKER": "%s:9092" % (kafka_host,)},
            ) as kafka_consumer:
                kafkaconsumerhost = container_ip(kafka_consumer)
                assert wait_for(p(tcp_socket_open, kafkaconsumerhost, 9099), 60), "kafka consumer jmx didn't start"
                with run_agent(
                    textwrap.dedent(
                        """
                        monitors:
                          - type: collectd/kafka
                            host: {0}
                            port: 7099
                            clusterName: testCluster
                          - type: collectd/kafka_producer
                            host: {1}
                            port: 8099
                          - type: collectd/kafka_consumer
                            host: {2}
                            port: 9099
                        """.format(kafka_host, kafkaproducerhost, kafkaconsumerhost)
                    )
                ) as [backend, _, _]:
                    assert wait_for(
                        p(has_datapoint_with_metric_name, backend, "gauge.kafka-active-controllers"),
                        timeout_seconds=60,
                    ), "Didn't get kafka datapoints"
                    assert wait_for(
                        p(has_datapoint_with_dim, backend, "cluster", "testCluster"), timeout_seconds=60
                    ), "Didn't get cluster dimension from kafka datapoints"
                    assert wait_for(
                        p(has_datapoint_with_dim, backend, "client-id", "console-producer"), timeout_seconds=60
                    ), "Didn't get client-id dimension from kafka_producer datapoints"
                    assert wait_for(
                        p(has_datapoint_with_dim, backend, "client-id", "consumer-1"), timeout_seconds=60
                    ), "Didn't get client-id dimension from kafka_consumer datapoints"


def test_ecs_container_label_dimension():
    with run_service("ecsmeta") as ecsmeta, run_container("redis:4-alpine") as redis:
        ecsmeta_ip = container_ip(ecsmeta)
        redis_ip = container_ip(redis)
        with run_agent(
            """
monitors:
  - type: ecs-metadata
    metadataEndpoint: http://%s/metadata_single?redis_ip=%s
    statsEndpoint: http://%s/stats
    labelsToDimensions:
      container_name: container_title
"""
            % (ecsmeta_ip, redis_ip, ecsmeta_ip)
        ) as [backend, _, _]:
            assert ensure_always(
                lambda: not has_datapoint_with_dim(
                    backend, "container_title", "ecs-seon-fargate-test-3-redis-baf2cfda88f8d8ee4900"
                )
            )


def test_tail():
    with tempfile.NamedTemporaryFile("w+b") as f:
        config = monitor_config.substitute(file=f.name)
        f.write(b"disk,customtag1=foo bytes=1024\n")
        f.flush()
        with run_agent(config) as [backend, _, _]:
            assert wait_for(
                p(has_datapoint_with_dim, backend, "customtag1", "foo")
            ), "didn't get datapoint written before startup"

            f.write(b"mem,customtag2=foo2 bytes=1024\n")
            f.flush()
            assert wait_for(
                p(has_datapoint_with_dim, backend, "customtag2", "foo2")
            ), "didn't get datapoint written after startup"
            assert wait_for(
                p(has_datapoint_with_dim, backend, "plugin", "telegraf-tail")
            ), "didn't get datapoint with expected plugin dimension"


def test_signalfx_metadata():
    expected_metrics = get_monitor_metrics_from_selfdescribe("collectd/signalfx-metadata")
    expected_dims = get_monitor_dims_from_selfdescribe("collectd/signalfx-metadata")
    with run_agent(
        """
monitors:
  - type: collectd/signalfx-metadata
    procFSPath: /proc
    etcPath: /etc
    persistencePath: /var/run/signalfx-agent
  - type: collectd/cpu
"""
    ) as [backend, get_output, _]:
        assert has_any_metric_or_dim(
            backend, expected_metrics, expected_dims, timeout=60
        ), "timed out waiting for metrics and/or dimensions!"
        assert not has_log_message(get_output().lower(), "error"), "error found in agent output!"


def test_expvar():
    with run_service("expvar") as expvar_container:
        host = container_ip(expvar_container)
        assert wait_for(p(tcp_socket_open, host, 8080), 60), "service didn't start"

        with run_agent(
            dedent(
                f"""
                monitors:
                  - type: expvar
                    host: {host}
                    port: 8080
                """
            )
        ) as [backend, _, _]:
            for metric in METADATA.included_metrics:
                print("Waiting for %s" % metric)
                assert wait_for(p(has_datapoint, backend, metric_name=metric)), "Didn't get included datapoints"


def test_endpoint_config_mapping():
    with run_agent(CONFIG) as [backend, _, _]:
        with run_kafka(
            "1.1.1",
            name="kafka-discovery",
            labels={"com.signalfx.extraDimensions": "{a: 1}", "com.signalfx.cluster": "prod"},
        ):
            assert wait_for(
                p(has_datapoint, backend, dimensions={"a": "1", "cluster": "prod"})
            ), "Didn't get kafka datapoints with properly mapped config"


def test_conviva_multi_filter(conviva_filters):
    with run_agent(
        dedent(
            f"""
            monitors:
              - type: conviva
                pulseUsername: {{"#from": "env:CONVIVA_PULSE_USERNAME"}}
                pulsePassword: {{"#from": "env:CONVIVA_PULSE_PASSWORD"}}
                metricConfigs:
                  - metricParameter: concurrent_plays
                    filters: {conviva_filters}
            """
        )
    ) as [backend, _, _]:
        for cf in conviva_filters:
            assert wait_for(
                p(has_datapoint, backend, "conviva.concurrent_plays", {"filter": cf})
            ), "Didn't get conviva datapoints for metric concurrent_plays with dimension {filter: %s}" % cf


def test_conviva_extra_dimensions():
    with run_agent(
        dedent(
            f"""
            monitors:
              - type: conviva
                pulseUsername: {{"#from": "env:CONVIVA_PULSE_USERNAME"}}
                pulsePassword: {{"#from": "env:CONVIVA_PULSE_PASSWORD"}}
                extraDimensions:
                  metric_source: conviva
                  mydim: foo
            """
        )
    ) as [backend, _, _]:
        assert wait_for(lambda: len(backend.datapoints) > 0), "Didn't get conviva datapoints"
        assert ensure_always(
            p(all_datapoints_have_dims, backend, {"metric_source": "conviva", "mydim": "foo"})
        ), "Received conviva datapoints without extra dimensions"


def test_does_not_set_hostname_on_monitor_if_not_host_specific():
    with run_agent(
        """
hostname: acmeinc.com
monitors:
  - type: collectd/signalfx-metadata
    persistencePath: /dev/null
  - type: collectd/cpu
  - type: collectd/uptime
    disableHostDimensions: true
"""
    ) as [backend, _, _]:
        assert wait_for(
            p(has_datapoint_with_all_dims, backend, dict(host="acmeinc.com", plugin="signalfx-metadata"))
        ), "Didn't get overridden hostname in datapoint"
        assert ensure_always(
            lambda: not has_datapoint_with_all_dims(backend, dict(host="acmeinc.com", plugin="uptime"))
        ), "Got overridden hostname in datapoint from uptime monitor"


def test_ecs_container_image_filtering():
    with run_service("ecsmeta") as ecsmeta, run_container("redis:4-alpine") as redis:
        ecsmeta_ip = container_ip(ecsmeta)
        redis_ip = container_ip(redis)
        with run_agent(
            """
monitors:
  - type: ecs-metadata
    metadataEndpoint: http://%s/metadata_single?redis_ip=%s
    statsEndpoint: http://%s/stats
    excludedImages:
      - redis:latest
"""
            % (ecsmeta_ip, redis_ip, ecsmeta_ip)
        ) as [backend, _, _]:
            assert ensure_always(
                lambda: not has_datapoint_with_dim(
                    backend, "container_id", "c42fa5a73634bcb6e301dfb7b13ac7ead2af473210be6a15da75a290c283b66c"
                )
            )


def test_custom_collectd_shutdown():
    with run_agent(
        dedent(
            """
            monitors:
              - type: collectd/df
              - type: collectd/custom
                template: |
                  LoadPlugin "ping"
                  <Plugin ping>
                    Host "google.com"
                  </Plugin>
            """
        )
    ) as [backend, _, configure]:
        assert wait_for(p(has_datapoint_with_dim, backend, "plugin", "ping")), "Didn't get ping datapoints"
        assert wait_for(p(has_datapoint_with_dim, backend, "plugin", "df")), "Didn't get df datapoints"

        configure(
            dedent(
                """
                monitors:
                  - type: collectd/df
                """
            )
        )

        time.sleep(3)
        backend.reset_datapoints()

        assert ensure_always(
            lambda: not has_datapoint_with_dim(backend, "plugin", "ping")
        ), "Got ping datapoint when we shouldn't have"

        configure(
            dedent(
                """
                monitors:
                  - type: collectd/df
                  - type: collectd/custom
                    template: |
                      LoadPlugin "ping"
                      <Plugin ping>
                        Host "google.com"
                      </Plugin>
                """
            )
        )

        assert wait_for(p(has_datapoint_with_dim, backend, "plugin", "ping")), "Didn't get ping datapoints"


def test_kong(kong_image):
    kong_env = dict(
        KONG_ADMIN_LISTEN='0.0.0.0:8001', KONG_LOG_LEVEL='warn', KONG_DATABASE='postgres', KONG_PG_DATABASE='kong'
    )

    with run_container('postgres:9.5', environment=dict(POSTGRES_USER='postgres', POSTGRES_DB='kong')) as db:
        db_ip = container_ip(db)
        kong_env['KONG_PG_HOST'] = db_ip

        def db_is_ready():
            return db.exec_run('pg_isready -U postgres').exit_code == 0

        assert wait_for(db_is_ready)

        with run_container(kong_image, environment=kong_env, command='sleep inf') as migrations:

            def db_is_reachable():
                return migrations.exec_run('psql -h {} -U postgres'.format(db_ip)).exit_code == 0

            assert wait_for(db_is_reachable)
            assert migrations.exec_run('kong migrations up --v').exit_code == 0

        with run_container(kong_image, environment=kong_env) as kong:
            kong_ip = container_ip(kong)

            def kong_is_listening():
                try:
                    return get('http://{}:8001/signalfx'.format(kong_ip)).status_code == 200
                except RequestException:
                    return False

            assert wait_for(kong_is_listening)

            config = string.Template(
                dedent('''
                monitors:
                  - type: collectd/kong
                    host: $host
                    port: 8001
                    metrics:
                      - metric: connections_handled
                        report: true
                ''')
            ).substitute(host=container_ip(kong))

            with run_agent(config) as [backend, _, _]:
                assert wait_for(p(has_datapoint_with_dim, backend, 'plugin', 'kong')), "Didn't get Kong data point"


def test_hadoopjmx(version, nodeType):
    """
    Any new versions of hadoop should be manually built, tagged, and pushed to quay.io, i.e.

    docker build \
        -t quay.io/signalfx/hadoop-test:<version> \
        --build-arg HADOOP_VER=<version> \
        <repo_root>/test-services/hadoop

    docker push quay.io/signalfx/hadoop-test:<version>
    """
    with run_container("quay.io/signalfx/hadoop-test:%s" % version, hostname="hadoop-master") as hadoop_master:
        with run_container("quay.io/signalfx/hadoop-test:%s" % version, hostname="hadoop-worker1") as hadoop_worker1:
            if nodeType in ["nameNode", "resourceManager"]:
                container = hadoop_master
            else:
                container = hadoop_worker1
            host = container_ip(container)
            port = NODETYPE_PORT[nodeType]
            if nodeType in ["resourceManager", "nodeManager"]:
                yarn_var = YARN_VAR[nodeType]
                yarn_opts = YARN_OPTS % (yarn_var, port, yarn_var)
                cmd = ["/bin/bash", "-c", "echo 'export %s' >> %s" % (yarn_opts, YARN_ENV_PATH)]
                container.exec_run(cmd)

            start_hadoop(hadoop_master, hadoop_worker1)

            # wait for jmx to be available
            assert wait_for(p(tcp_socket_open, host, port), 60), "jmx service not listening on port %d" % port

            # start the agent with hadoopjmx config
            config = HADOOPJMX_CONFIG.substitute(host=host, port=port, nodeType=nodeType)
            with run_agent(config) as [backend, _, _]:
                assert wait_for(
                    p(has_datapoint_with_dim, backend, "nodeType", nodeType)
                ), "Didn't get hadoopjmx datapoints for nodeType %s" % nodeType


def test_cpu_utilization_per_core():
    with run_agent(
        """
monitors:
  - type: collectd/signalfx-metadata
    procFSPath: /proc
    etcPath: /etc
    persistencePath: /var/run/signalfx-agent
    perCoreCPUUtil: true
  - type: collectd/cpu
metricsToInclude:
  - metricNames:
      - cpu.utilization_per_core
    monitorType: collectd/signalfx-metadata
"""
    ) as [backend, get_output, _]:
        assert wait_for(
            p(has_datapoint, backend, "cpu.utilization_per_core", {"plugin": "signalfx-metadata"})
        )
        assert not has_log_message(get_output().lower(), "error"), "error found in agent output!"


def test_rabbitmq_broker_name():
    with run_container("rabbitmq:3.6-management") as rabbitmq_cont:
        host = rabbitmq_cont.attrs["NetworkSettings"]["IPAddress"]
        config = rabbitmq_config.substitute(host=host)
        wait_for_rabbit_to_start(rabbitmq_cont)

        with run_agent(
            """
monitors:
  - type: collectd/rabbitmq
    host: %s
    brokerName: '{{.host}}-{{.username}}'
    port: 15672
    username: guest
    password: guest
    collectNodes: true
    collectChannels: true
"""
            % (host,)
        ) as [backend, _, _]:
            assert wait_for(
                p(has_datapoint_with_dim, backend, "plugin_instance", "%s-guest" % host)
            ), "Didn't get expected plugin_instance dimension"


def test_vault_token_renewal():
    """
    Test the token renewal feature
    """
    with run_vault() as [vault_client, get_audit_events]:
        new_token = vault_client.create_token(policies=["root"], renewable=True, ttl="12s")
        vault_client.write("secret/data/appinfo", data={"env": "prod"})
        with run_agent(
            dedent(
                f"""
                intervalSeconds: 2
                globalDimensions:
                  env: {{"#from": "vault:secret/data/appinfo[data.env]"}}
                configSources:
                  vault:
                    vaultToken: {new_token['auth']['client_token']}
                    vaultAddr: {vault_client.url}
                monitors:
                  - type: collectd/uptime
                """
            )
        ) as [backend, _, _]:
            assert wait_for(p(has_datapoint, backend, dimensions={"env": "prod"}))
            assert audit_read_paths(get_audit_events()) == ["secret/data/appinfo"], "expected one read"
            assert audit_token_renewals(get_audit_events()) == [
                new_token["auth"]["accessor"]
            ], "token immediately renews"

            time.sleep(10)
            assert audit_token_renewals(get_audit_events()) == [
                new_token["auth"]["accessor"],
                new_token["auth"]["accessor"],
            ], "token has renewed twice now"

            time.sleep(10)
            assert len(audit_token_renewals(get_audit_events())) >= 3, "token has renewed three times now"


def test_solr_monitor():
    with run_service("solr") as solr_container:
        host = container_ip(solr_container)
        config = dedent(
            f"""
            monitors:
              - type: collectd/solr
                host: {host}
                port: 8983
            """
        )
        assert wait_for(p(tcp_socket_open, host, 8983), 60), "service not listening on port"

        with run_agent(config) as [backend, get_output, _]:
            assert wait_for(
                p(has_datapoint_with_dim, backend, "plugin", "solr")
            ), "Didn't get solr datapoints"
            assert ensure_always(
                lambda: has_datapoint_with_metric_name(backend, "counter.solr.http_5xx_responses")
            )
            assert not has_log_message(get_output().lower(), "error"), "error found in agent output!"