def test_dogstream_events_validation(self):
    """Events lacking both msg_title and msg_text must be dropped by the parser."""
    log_data = [
        {"msg_title": "title", "timestamp": 1336999561},
        {"msg_text": "body", "timestamp": 1336999561},
        # Has neither msg_title nor msg_text -- expected to be filtered out.
        {"none of the above": "should get filtered out", "timestamp": 1336999561},
    ]
    expected_output = {
        "dogstreamEvents": [
            {
                "timestamp": 1336999561,
                "msg_title": "title",
                "event_type": EventDefaults.EVENT_TYPE,
                "aggregation_key": EventDefaults.EVENT_OBJECT,
                "event_object": EventDefaults.EVENT_OBJECT,
            },
            {
                "timestamp": 1336999561,
                "msg_text": "body",
                "event_type": EventDefaults.EVENT_TYPE,
                "aggregation_key": EventDefaults.EVENT_OBJECT,
                "event_object": EventDefaults.EVENT_OBJECT,
            },
        ]
    }
    # Each dict is written as its repr(); repr_event_parser (defined in this
    # module, not visible here) presumably evaluates the lines back -- confirm.
    self._write_log([repr(d) for d in log_data])
    dogstream = Dogstreams.init(
        self.logger,
        {'dogstreams': '{0}:{1}:repr_event_parser'.format(self.log_file.name, __name__)})
    actual_output = dogstream.check(self.config, move_end=False)
    self.assertEquals(expected_output, actual_output)
def test_alt_host_perfdata(self):
    """Host perfdata is parsed when the template lives in a separate file."""
    from checks.datadog import NagiosHostPerfData
    self._write_nagios_config([
        "host_perfdata_file=%s" % NAGIOS_TEST_HOST,
        "host_perfdata_file_template=%s" % NAGIOS_TEST_HOST_TEMPLATE,
    ])
    dogstream = Dogstreams.init(self.logger, self.agent_config)
    # Only the host-perfdata stream should be registered for this config.
    self.assertEquals([NagiosHostPerfData], [d.__class__ for d in dogstream.dogstreams])
    actual_output = dogstream.check(self.agent_config, move_end=False)
    expected_output = {
        'dogstream': [('nagios.host.pl', 1339511440, 0.0, {
            'warn': '80',
            'metric_type': 'gauge',
            'host_name': 'localhost',
            'min': '0',
            'crit': '100',
            'unit': '%'
        }), ('nagios.host.rta', 1339511440, 0.048, {
            'warn': '3000.000000',
            'metric_type': 'gauge',
            'host_name': 'localhost',
            'min': '0.000000',
            'crit': '5000.000000',
            'unit': 'ms'
        })]
    }
    self.assertEquals(expected_output, actual_output)
def test_dogstream_events(self):
    """parse_events (defined in this module) turns log lines into dogstream events."""
    log_data = [
        '2012-05-14 12:46:01 [ERROR] - host0 is down (broke its collarbone)',
        '2012-05-14 12:48:07 [ERROR] - host1 is down (got a bloody nose)',
        '2012-05-14 12:52:03 [RECOVERY] - host0 is up (collarbone healed)',
        '2012-05-14 12:59:09 [RECOVERY] - host1 is up (nose stopped bleeding)',
    ]
    # ERROR lines map to alert_type "error", RECOVERY to "success"; timestamps
    # are the epoch equivalents of the dates above.
    expected_output = {
        "dogstreamEvents": [
            {
                "timestamp": 1336999561,
                "alert_type": "error",
                "host": "host0",
                "msg_title": "host0 is down (broke its collarbone)",
                "msg_text": "2012-05-14 12:46:01 [ERROR] - host0 is down (broke its collarbone)",
                "event_type": EventDefaults.EVENT_TYPE,
                "aggregation_key": EventDefaults.EVENT_OBJECT,
                "event_object": EventDefaults.EVENT_OBJECT,
            },
            {
                "timestamp": 1336999687,
                "alert_type": "error",
                "host": "host1",
                "msg_title": "host1 is down (got a bloody nose)",
                "msg_text": "2012-05-14 12:48:07 [ERROR] - host1 is down (got a bloody nose)",
                "event_type": EventDefaults.EVENT_TYPE,
                "aggregation_key": EventDefaults.EVENT_OBJECT,
                "event_object": EventDefaults.EVENT_OBJECT,
            },
            {
                "timestamp": 1336999923,
                "alert_type": "success",
                "host": "host0",
                "msg_title": "host0 is up (collarbone healed)",
                "msg_text": "2012-05-14 12:52:03 [RECOVERY] - host0 is up (collarbone healed)",
                "event_type": EventDefaults.EVENT_TYPE,
                "aggregation_key": EventDefaults.EVENT_OBJECT,
                "event_object": EventDefaults.EVENT_OBJECT,
            },
            {
                "timestamp": 1337000349,
                "alert_type": "success",
                "host": "host1",
                "msg_title": "host1 is up (nose stopped bleeding)",
                "msg_text": "2012-05-14 12:59:09 [RECOVERY] - host1 is up (nose stopped bleeding)",
                "event_type": EventDefaults.EVENT_TYPE,
                "aggregation_key": EventDefaults.EVENT_OBJECT,
                "event_object": EventDefaults.EVENT_OBJECT,
            },
        ]
    }
    self._write_log(log_data)
    dogstream = Dogstreams.init(
        self.logger,
        {'dogstreams': '{0}:{1}:parse_events'.format(self.log_file.name, __name__)})
    actual_output = dogstream.check(self.config, move_end=False)
    self.assertEquals(expected_output, actual_output)
def __init__(self, agentConfig, emitters, systemStats):
    """Build the collector: system checks, legacy metric streams, custom checks.

    agentConfig: agent settings dict; systemStats is stashed into it so
    checks can read it during collection. emitters: callables that ship
    the collected payload.
    """
    self.emit_duration = None
    self.agentConfig = agentConfig
    # system stats is generated by config.get_system_stats
    self.agentConfig["system_stats"] = systemStats
    # agent config is used during checks, system_stats can be accessed through the config
    self.os = get_os()
    self.plugins = None
    self.emitters = emitters
    # How often (seconds) to resend host metadata; default 10 minutes.
    self.metadata_interval = int(agentConfig.get("metadata_interval", 10 * 60))
    self.metadata_start = time.time()
    # Global socket timeout so a hung check endpoint can't block collection.
    socket.setdefaulttimeout(15)
    self.run_count = 0
    self.continue_running = True
    self.metadata_cache = None
    self.initialized_checks_d = []
    self.init_failed_checks_d = []

    # Unix System Checks
    self._unix_system_checks = {
        "disk": u.Disk(log),
        "io": u.IO(log),
        "load": u.Load(log),
        "memory": u.Memory(log),
        "processes": u.Processes(log),
        "cpu": u.Cpu(log),
    }

    # Win32 System Checks
    self._win32_system_checks = {
        "disk": w32.Disk(log),
        "io": w32.IO(log),
        "proc": w32.Processes(log),
        "memory": w32.Memory(log),
        "network": w32.Network(log),
        "cpu": w32.Cpu(log),
    }

    # Old-style metric checks
    self._ganglia = Ganglia(log)
    self._dogstream = Dogstreams.init(log, self.agentConfig)
    self._ddforwarder = DdForwarder(log, self.agentConfig)

    # Agent Metrics
    self._agent_metrics = CollectorMetrics(log)

    self._metrics_checks = []

    # Custom metric checks (comma-separated module specs in "custom_checks").
    for module_spec in [s.strip() for s in self.agentConfig.get("custom_checks", "").split(",")]:
        if len(module_spec) == 0:
            continue
        try:
            self._metrics_checks.append(modules.load(module_spec, "Check")(log))
            log.info("Registered custom check %s" % module_spec)
            log.warning(
                "Old format custom checks are deprecated. They should be moved to the checks.d interface as old custom checks will be removed in a next version"
            )
        except Exception, e:
            # Best-effort: a broken custom check must not stop the agent.
            log.exception("Unable to load custom check module %s" % module_spec)
def __init__(self, agentConfig, emitters, systemStats):
    """Build the collector (older variant: Cassandra/Memcache still wired in)."""
    self.emit_duration = None
    self.agentConfig = agentConfig
    # system stats is generated by config.get_system_stats
    self.agentConfig['system_stats'] = systemStats
    # agent config is used during checks, system_stats can be accessed through the config
    self.os = get_os()
    self.plugins = None
    self.emitters = emitters
    self.metadata_interval = int(agentConfig.get('metadata_interval', 10 * 60))
    self.metadata_start = time.time()
    # Global socket timeout so a hung check endpoint can't block collection.
    socket.setdefaulttimeout(15)
    self.run_count = 0
    self.continue_running = True
    self.metadata_cache = None
    self.checks_d = []

    # Unix System Checks
    self._unix_system_checks = {
        'disk': u.Disk(log),
        'io': u.IO(log),
        'load': u.Load(log),
        'memory': u.Memory(log),
        'processes': u.Processes(log),
        'cpu': u.Cpu(log)
    }

    # Win32 System Checks
    self._win32_system_checks = {
        'disk': w32.Disk(log),
        'io': w32.IO(log),
        'proc': w32.Processes(log),
        'memory': w32.Memory(log),
        'network': w32.Network(log),
        'cpu': w32.Cpu(log)
    }

    # Old-style metric checks
    self._ganglia = Ganglia(log)
    self._cassandra = Cassandra()
    self._dogstream = Dogstreams.init(log, self.agentConfig)
    self._ddforwarder = DdForwarder(log, self.agentConfig)

    # Agent Metrics
    self._agent_metrics = CollectorMetrics(log)

    # Metric Checks
    self._metrics_checks = [
        Memcache(log),
    ]

    # Custom metric checks (comma-separated module specs in 'custom_checks').
    for module_spec in [s.strip() for s in self.agentConfig.get('custom_checks', '').split(',')]:
        if len(module_spec) == 0:
            continue
        try:
            self._metrics_checks.append(modules.load(module_spec, 'Check')(log))
            log.info("Registered custom check %s" % module_spec)
        except Exception, e:
            # Best-effort: a broken custom check must not stop the agent.
            log.exception('Unable to load custom check module %s' % module_spec)
def __init__(self, agentConfig, emitters, systemStats):
    """Build the collector: system checks, legacy metric streams, custom checks."""
    self.emit_duration = None
    self.agentConfig = agentConfig
    # system stats is generated by config.get_system_stats
    self.agentConfig['system_stats'] = systemStats
    # agent config is used during checks, system_stats can be accessed through the config
    self.os = get_os()
    self.plugins = None
    self.emitters = emitters
    self.metadata_interval = int(agentConfig.get('metadata_interval', 10 * 60))
    self.metadata_start = time.time()
    # Global socket timeout so a hung check endpoint can't block collection.
    socket.setdefaulttimeout(15)
    self.run_count = 0
    self.continue_running = True
    self.metadata_cache = None
    self.initialized_checks_d = []
    self.init_failed_checks_d = []

    # Unix System Checks
    self._unix_system_checks = {
        'disk': u.Disk(log),
        'io': u.IO(log),
        'load': u.Load(log),
        'memory': u.Memory(log),
        'processes': u.Processes(log),
        'cpu': u.Cpu(log)
    }

    # Win32 System Checks
    self._win32_system_checks = {
        'disk': w32.Disk(log),
        'io': w32.IO(log),
        'proc': w32.Processes(log),
        'memory': w32.Memory(log),
        'network': w32.Network(log),
        'cpu': w32.Cpu(log)
    }

    # Old-style metric checks
    self._ganglia = Ganglia(log)
    self._dogstream = Dogstreams.init(log, self.agentConfig)
    self._ddforwarder = DdForwarder(log, self.agentConfig)

    # Agent Metrics
    self._agent_metrics = CollectorMetrics(log)

    self._metrics_checks = []

    # Custom metric checks (comma-separated module specs in 'custom_checks').
    for module_spec in [s.strip() for s in self.agentConfig.get('custom_checks', '').split(',')]:
        if len(module_spec) == 0:
            continue
        try:
            self._metrics_checks.append(modules.load(module_spec, 'Check')(log))
            log.info("Registered custom check %s" % module_spec)
            log.warning("Old format custom checks are deprecated. They should be moved to the checks.d interface as old custom checks will be removed in a next version")
        except Exception, e:
            # Best-effort: a broken custom check must not stop the agent.
            log.exception('Unable to load custom check module %s' % module_spec)
def setUp(self):
    """Point a dogstream at the tail log file created by TailTestCase."""
    TailTestCase.setUp(self)
    # Show full diffs when large expected/actual dicts mismatch.
    self.maxDiff = None
    self.config = {
        'dogstreams': self.log_file.name,
        'check_freq': 5,
    }
    log.info("Test config: %s" % self.config)
    self.dogstream = Dogstreams.init(self.logger, self.config)
def test_host_perfdata(self):
    """Inline host perfdata template: rta/pl points are extracted and tagged."""
    from checks.datadog import NagiosHostPerfData
    self._write_nagios_config([
        "host_perfdata_file=%s" % self.log_file.name,
        "host_perfdata_file_template=DATATYPE::HOSTPERFDATA\tTIMET::$TIMET$\tHOSTNAME::$HOSTNAME$\tHOSTPERFDATA::$HOSTPERFDATA$\tHOSTCHECKCOMMAND::$HOSTCHECKCOMMAND$\tHOSTSTATE::$HOSTSTATE$\tHOSTSTATETYPE::$HOSTSTATETYPE$",
    ])
    dogstream = Dogstreams.init(self.logger, self.agent_config)
    # Only the host-perfdata stream should be registered for this config.
    self.assertEquals([NagiosHostPerfData], [d.__class__ for d in dogstream.dogstreams])
    log_data = [
        (
            "DATATYPE::HOSTPERFDATA",
            "TIMET::1000000010",
            "HOSTNAME::myhost1",
            "HOSTPERFDATA::" + " ".join([
                "rta=0.978000ms;5000.000000;5000.000000;0.000000",
                "pl=0%;100;100;0",
            ]),
            "HOSTCHECKCOMMAND::check-host-alive",
            "HOSTSTATE::UP",
            "HOSTSTATETYPE::HARD",
        ),
    ]
    expected_output = [
        ('nagios.host.rta', 1000000010, 0.978, {
            'metric_type': 'gauge',
            'host_name': 'myhost1',
            'unit': 'ms',
            'warn': '5000.000000',
            'crit': '5000.000000',
            'min': '0.000000'
        }),
        ('nagios.host.pl', 1000000010, 0., {
            'metric_type': 'gauge',
            'host_name': 'myhost1',
            'unit': '%',
            'warn': '100',
            'crit': '100',
            'min': '0'
        }),
    ]
    # Sort both sides so the comparison is order-independent.
    expected_output.sort(key=point_sorter)
    self._write_log(('\t'.join(data) for data in log_data))
    actual_output = dogstream.check(self.agent_config, move_end=False)['dogstream']
    actual_output.sort(key=point_sorter)
    self.assertEquals(expected_output, actual_output)
def test_host_perfdata(self):
    """Inline host perfdata template: rta/pl points are extracted and tagged."""
    from checks.datadog import NagiosHostPerfData
    self._write_nagios_config([
        "host_perfdata_file=%s" % self.log_file.name,
        "host_perfdata_file_template=DATATYPE::HOSTPERFDATA\tTIMET::$TIMET$\tHOSTNAME::$HOSTNAME$\tHOSTPERFDATA::$HOSTPERFDATA$\tHOSTCHECKCOMMAND::$HOSTCHECKCOMMAND$\tHOSTSTATE::$HOSTSTATE$\tHOSTSTATETYPE::$HOSTSTATETYPE$",
    ])
    dogstream = Dogstreams.init(self.logger, self.agent_config)
    # Only the host-perfdata stream should be registered for this config.
    self.assertEquals([NagiosHostPerfData], [d.__class__ for d in dogstream.dogstreams])
    log_data = [
        ("DATATYPE::HOSTPERFDATA",
         "TIMET::1000000010",
         "HOSTNAME::myhost1",
         "HOSTPERFDATA::" + " ".join([
             "rta=0.978000ms;5000.000000;5000.000000;0.000000",
             "pl=0%;100;100;0",
         ]),
         "HOSTCHECKCOMMAND::check-host-alive",
         "HOSTSTATE::UP",
         "HOSTSTATETYPE::HARD",
        ),
    ]
    expected_output = [
        ('nagios.host.rta', 1000000010, 0.978, {
            'metric_type': 'gauge',
            'host_name': 'myhost1',
            'unit': 'ms',
            'warn': '5000.000000',
            'crit': '5000.000000',
            'min': '0.000000'
        }),
        ('nagios.host.pl', 1000000010, 0., {
            'metric_type': 'gauge',
            'host_name': 'myhost1',
            'unit': '%',
            'warn': '100',
            'crit': '100',
            'min': '0'
        }),
    ]
    # Sort both sides so the comparison is order-independent.
    expected_output.sort(key=point_sorter)
    self._write_log(('\t'.join(data) for data in log_data))
    actual_output = dogstream.check(self.agent_config, move_end=False)['dogstream']
    actual_output.sort(key=point_sorter)
    self.assertEquals(expected_output, actual_output)
def __init__(self, agentConfig, emitters):
    """Build the (legacy) checks runner: one attribute per integration check."""
    self.agentConfig = agentConfig
    self.plugins = None
    self.emitters = emitters
    self.os = None

    self.checksLogger = logging.getLogger("checks")
    # Global socket timeout so a hung check endpoint can't block collection.
    socket.setdefaulttimeout(15)

    self._apache = Apache(self.checksLogger)
    self._nginx = Nginx(self.checksLogger)
    self._disk = Disk(self.checksLogger)
    self._io = IO()
    self._load = Load(self.checksLogger)
    self._memory = Memory(self.checksLogger)
    self._network = Network(self.checksLogger)
    self._processes = Processes()
    self._cpu = Cpu()
    self._couchdb = CouchDb(self.checksLogger)
    self._mongodb = MongoDb(self.checksLogger)
    self._mysql = MySql(self.checksLogger)
    self._pgsql = PostgreSql(self.checksLogger)
    self._rabbitmq = RabbitMq()
    self._ganglia = Ganglia(self.checksLogger)
    self._cassandra = Cassandra()
    self._redis = Redis(self.checksLogger)
    self._jvm = Jvm(self.checksLogger)
    self._tomcat = Tomcat(self.checksLogger)
    self._activemq = ActiveMQ(self.checksLogger)
    self._solr = Solr(self.checksLogger)
    self._memcache = Memcache(self.checksLogger)
    self._dogstream = Dogstreams.init(self.checksLogger, self.agentConfig)
    self._ddforwarder = DdForwarder(self.checksLogger, self.agentConfig)

    # All new checks should be metrics checks:
    self._metrics_checks = [
        Cacti(self.checksLogger),
        Redis(self.checksLogger),
        Varnish(self.checksLogger),
        ElasticSearch(self.checksLogger),
    ]

    # Custom metric checks (comma-separated module specs in "custom_checks").
    for module_spec in [s.strip() for s in self.agentConfig.get("custom_checks", "").split(",")]:
        if len(module_spec) == 0:
            continue
        try:
            self._metrics_checks.append(modules.load(module_spec, "Check")(self.checksLogger))
            self.checksLogger.info("Registered custom check %s" % module_spec)
        except Exception, e:
            # Best-effort: a broken custom check must not stop the agent.
            self.checksLogger.exception("Unable to load custom check module %s" % module_spec)
def test_alt_service_perfdata(self):
    """Service perfdata is parsed when the template lives in a separate file."""
    from checks.datadog import NagiosServicePerfData
    self._write_nagios_config([
        "service_perfdata_file=%s" % NAGIOS_TEST_SVC,
        "service_perfdata_file_template=%s" % NAGIOS_TEST_SVC_TEMPLATE,
    ])
    dogstream = Dogstreams.init(self.logger, self.agent_config)
    # Only the service-perfdata stream should be registered for this config.
    self.assertEquals([NagiosServicePerfData], [d.__class__ for d in dogstream.dogstreams])
    actual_output = dogstream.check(self.agent_config, move_end=False)
    expected_output = {'dogstream': [
        ('nagios.current_users.users', 1339511440, 1.0,
         {'metric_type': 'gauge', 'warn': '20', 'host_name': 'localhost', 'crit': '50', 'min': '0'}),
        ('nagios.ping.pl', 1339511500, 0.0,
         {'warn': '20', 'metric_type': 'gauge', 'host_name': 'localhost', 'min': '0', 'crit': '60', 'unit': '%'}),
        ('nagios.ping.rta', 1339511500, 0.065,
         {'warn': '100.000000', 'metric_type': 'gauge', 'host_name': 'localhost', 'min': '0.000000', 'crit': '500.000000', 'unit': 'ms'}),
        ('nagios.root_partition', 1339511560, 2470.0,
         {'min': '0', 'max': '7315', 'device_name': '/', 'warn': '5852', 'metric_type': 'gauge', 'host_name': 'localhost', 'crit': '6583', 'unit': 'MB'})]}
    self.assertEquals(expected_output, actual_output)
def test_alt_host_perfdata(self):
    """Host perfdata is parsed when the template lives in a separate file."""
    from checks.datadog import NagiosHostPerfData
    self._write_nagios_config([
        "host_perfdata_file=%s" % NAGIOS_TEST_HOST,
        "host_perfdata_file_template=%s" % NAGIOS_TEST_HOST_TEMPLATE,
    ])
    dogstream = Dogstreams.init(self.logger, self.agent_config)
    # Only the host-perfdata stream should be registered for this config.
    self.assertEquals([NagiosHostPerfData], [d.__class__ for d in dogstream.dogstreams])
    actual_output = dogstream.check(self.agent_config, move_end=False)
    expected_output = {'dogstream': [
        ('nagios.host.pl', 1339511440, 0.0,
         {'warn': '80', 'metric_type': 'gauge', 'host_name': 'localhost', 'min': '0', 'crit': '100', 'unit': '%'}),
        ('nagios.host.rta', 1339511440, 0.048,
         {'warn': '3000.000000', 'metric_type': 'gauge', 'host_name': 'localhost', 'min': '0.000000', 'crit': '5000.000000', 'unit': 'ms'})]}
    self.assertEquals(expected_output, actual_output)
def test_dogstream_ancient_function_plugin(self):
    """Ensure that pre-stateful plugins still work.

    An 'ancient' plugin is a bare parse function (no state argument); each
    gauge point should come through unchanged.
    """
    log_data = [
        'test.metric.simple 1000000000 1 metric_type=gauge',
        'test.metric.simple 1100000000 1 metric_type=gauge'
    ]
    expected_output = {
        "dogstream": [
            ('test.metric.simple', 1000000000, 1, self.gauge),
            ('test.metric.simple', 1100000000, 1, self.gauge)]
    }
    self._write_log(log_data)
    plugdog = Dogstreams.init(
        self.logger,
        {'dogstreams': '{0}:{1}:parse_ancient_function_plugin'.format(self.log_file.name, __name__)})
    actual_output = plugdog.check(self.config, move_end=False)
    # BUG FIX: the parsed output was computed but never checked, so this
    # test could never fail. Assert it matches the expected points, mirroring
    # the other dogstream tests in this file.
    self.assertEquals(expected_output, actual_output)
def test_alt_service_perfdata(self):
    """Service perfdata is parsed when the template lives in a separate file."""
    from checks.datadog import NagiosServicePerfData
    self._write_nagios_config([
        "service_perfdata_file=%s" % NAGIOS_TEST_SVC,
        "service_perfdata_file_template=%s" % NAGIOS_TEST_SVC_TEMPLATE,
    ])
    dogstream = Dogstreams.init(self.logger, self.agent_config)
    # Only the service-perfdata stream should be registered for this config.
    self.assertEquals([NagiosServicePerfData], [d.__class__ for d in dogstream.dogstreams])
    actual_output = dogstream.check(self.agent_config, move_end=False)
    expected_output = {
        'dogstream': [('nagios.current_users.users', 1339511440, 1.0, {
            'metric_type': 'gauge',
            'warn': '20',
            'host_name': 'localhost',
            'crit': '50',
            'min': '0'
        }), ('nagios.ping.pl', 1339511500, 0.0, {
            'warn': '20',
            'metric_type': 'gauge',
            'host_name': 'localhost',
            'min': '0',
            'crit': '60',
            'unit': '%'
        }), ('nagios.ping.rta', 1339511500, 0.065, {
            'warn': '100.000000',
            'metric_type': 'gauge',
            'host_name': 'localhost',
            'min': '0.000000',
            'crit': '500.000000',
            'unit': 'ms'
        }), ('nagios.root_partition', 1339511560, 2470.0, {
            'min': '0',
            'max': '7315',
            'device_name': '/',
            'warn': '5852',
            'metric_type': 'gauge',
            'host_name': 'localhost',
            'crit': '6583',
            'unit': 'MB'
        })]
    }
    self.assertEquals(expected_output, actual_output)
def test_dogstream_stateful(self):
    """A stateful parser accumulates counter values across log lines."""
    lines = [
        'test.metric.accumulator 1000000000 1 metric_type=counter',
        'test.metric.accumulator 1100000000 1 metric_type=counter',
    ]
    self._write_log(lines)
    spec = '%s:tests.test_datadog:parse_stateful' % self.log_file.name
    stream = Dogstreams.init(self.logger, {'dogstreams': spec})
    result = stream.check(self.config, move_end=False)
    # Second point reflects carried state: 1 + 1 == 2.
    self.assertEquals({
        "dogstream": [
            ('test.metric.accumulator', 1000000000, 1, self.counter),
            ('test.metric.accumulator', 1100000000, 2, self.counter),
        ]
    }, result)
def test_dogstream_new_plugin(self):
    """Ensure that class-based stateful plugins work"""
    lines = [
        'test.metric.accumulator 1000000000 1 metric_type=counter',
        'test.metric.accumulator 1100000000 1 metric_type=counter',
    ]
    self._write_log(lines)
    # Plugin spec passes 'foo' and 'bar' as constructor args to the class.
    spec = '{0}:{1}:ParseClassPlugin:foo:bar'.format(self.log_file.name, __name__)
    stream = Dogstreams.init(self.logger, {'dogstreams': spec})
    result = stream.check(self.config, move_end=False)
    # Metric names get the 'foo.bar:' prefix and counter state accumulates.
    self.assertEquals({
        "dogstream": [
            ('foo.bar:test.metric.accumulator', 1000000000, 1, self.counter),
            ('foo.bar:test.metric.accumulator', 1100000000, 2, self.counter),
        ]
    }, result)
def test_dogstream_function_plugin(self):
    """Ensure that non-class-based stateful plugins work"""
    lines = [
        'test.metric.accumulator 1000000000 1 metric_type=counter',
        'test.metric.accumulator 1100000000 1 metric_type=counter',
    ]
    self._write_log(lines)
    spec = '%s:tests.test_datadog:parse_function_plugin' % self.log_file.name
    stream = Dogstreams.init(self.logger, {'dogstreams': spec})
    result = stream.check(self.config, move_end=False)
    # Second point reflects carried state: 1 + 1 == 2.
    self.assertEquals({
        "dogstream": [
            ('test.metric.accumulator', 1000000000, 1, self.counter),
            ('test.metric.accumulator', 1100000000, 2, self.counter),
        ]
    }, result)
def __init__(self, agentConfig, emitters):
    """Build the (legacy) checks runner, plus event/resource checks and EC2 metadata."""
    self.agentConfig = agentConfig
    self.plugins = None
    self.emitters = emitters
    self.os = None

    self.checksLogger = logging.getLogger('checks')
    # Global socket timeout so a hung check endpoint can't block collection.
    socket.setdefaulttimeout(15)

    self._apache = Apache(self.checksLogger)
    self._nginx = Nginx(self.checksLogger)
    self._disk = Disk(self.checksLogger)
    self._io = IO()
    self._load = Load(self.checksLogger)
    self._memory = Memory(self.checksLogger)
    self._network = Network(self.checksLogger)
    self._processes = Processes()
    self._cpu = Cpu()
    self._couchdb = CouchDb(self.checksLogger)
    self._mongodb = MongoDb(self.checksLogger)
    self._mysql = MySql(self.checksLogger)
    self._pgsql = PostgreSql(self.checksLogger)
    self._rabbitmq = RabbitMq()
    self._ganglia = Ganglia(self.checksLogger)
    self._cassandra = Cassandra()
    self._redis = Redis(self.checksLogger)
    self._jvm = Jvm(self.checksLogger)
    self._tomcat = Tomcat(self.checksLogger)
    self._activemq = ActiveMQ(self.checksLogger)
    self._solr = Solr(self.checksLogger)
    self._memcache = Memcache(self.checksLogger)
    self._dogstream = Dogstreams.init(self.checksLogger, self.agentConfig)
    self._ddforwarder = DdForwarder(self.checksLogger, self.agentConfig)

    # All new checks should be metrics checks:
    self._metrics_checks = [
        Cacti(self.checksLogger),
        Redis(self.checksLogger),
        Varnish(self.checksLogger),
        ElasticSearch(self.checksLogger),
    ]

    # Event checks report events (build results, nagios alerts) rather than metrics.
    self._event_checks = [Hudson(), Nagios(socket.gethostname())]
    self._resources_checks = [ResProcesses(self.checksLogger,self.agentConfig)]

    # EC2 metadata lookup (presumably no-op off EC2 -- behavior defined elsewhere).
    self._ec2 = EC2(self.checksLogger)
def test_supervisord_parser(self):
    """supervisord log levels map to alert types (INFO/CRIT/WARN -> info|success/error/warning)."""
    from dogstream import supervisord_log
    # NOTE(review): log_data must keep one entry per line -- it is split on "\n" below.
    log_data = """2012-07-16 22:30:48,335 INFO spawned: 'monitor' with pid 20216
2012-07-14 03:02:47,325 INFO success: foo_bar entered RUNNING state, process has stayed up for > than 2 seconds (startsecs)
2012-07-17 02:53:04,600 CRIT Server 'inet_http_server' running without any HTTP authentication checking
2012-07-14 04:54:34,193 WARN received SIGTERM indicating exit request
"""
    event_type = supervisord_log.EVENT_TYPE
    expected_output = {
        "dogstreamEvents": [
            {
                "alert_type": "info",
                "event_type": event_type,
                "aggregation_key": "monitor",
                "event_object": "monitor",
                "msg_title": "spawned: 'monitor' with pid 20216",
                "timestamp": int(time.mktime(datetime(2012, 7, 16, 22, 30, 48).timetuple())),
            },
            {
                # INFO lines starting with "success:" are promoted to alert_type "success".
                "alert_type": "success",
                "event_type": event_type,
                "aggregation_key": "foo_bar",
                "event_object": "foo_bar",
                "msg_title": "success: foo_bar entered RUNNING state, "
                             "process has stayed up for > than 2 seconds (startsecs)",
                "timestamp": int(time.mktime(datetime(2012, 7, 14, 3, 2, 47).timetuple())),
            },
            {
                "alert_type": "error",
                "event_type": event_type,
                "aggregation_key": "inet_http_server",
                "event_object": "inet_http_server",
                "msg_title": "Server 'inet_http_server' running without any HTTP authentication checking",
                "timestamp": int(time.mktime(datetime(2012, 7, 17, 2, 53, 4).timetuple())),
            },
            {
                "alert_type": "warning",
                "event_type": event_type,
                "aggregation_key": "SIGTERM",
                "event_object": "SIGTERM",
                "msg_title": "received SIGTERM indicating exit request",
                "timestamp": int(time.mktime(datetime(2012, 7, 14, 4, 54, 34).timetuple())),
            },
        ]
    }
    self._write_log(log_data.split("\n"))
    dogstream = Dogstreams.init(
        self.logger,
        {'dogstreams': '%s:dogstream.supervisord_log:parse_supervisord' % self.log_file.name})
    actual_output = dogstream.check(self.config, move_end=False)
    self.assertEquals(expected_output, actual_output)
def test_supervisord_parser(self):
    """supervisord log levels map to alert types (INFO/CRIT/WARN -> info|success/error/warning)."""
    from dogstream import supervisord_log
    # NOTE(review): log_data must keep one entry per line -- it is split on "\n" below.
    log_data = """2012-07-16 22:30:48,335 INFO spawned: 'monitor' with pid 20216
2012-07-14 03:02:47,325 INFO success: foo_bar entered RUNNING state, process has stayed up for > than 2 seconds (startsecs)
2012-07-17 02:53:04,600 CRIT Server 'inet_http_server' running without any HTTP authentication checking
2012-07-14 04:54:34,193 WARN received SIGTERM indicating exit request
"""
    event_type = supervisord_log.EVENT_TYPE
    expected_output = {
        "dogstreamEvents":[
            {
                "alert_type": "info",
                "event_type": event_type,
                "aggregation_key": "monitor",
                "event_object": "monitor",
                "msg_title": "spawned: 'monitor' with pid 20216",
                "timestamp": int(time.mktime(datetime(2012, 7, 16, 22, 30, 48).timetuple())),
            },
            {
                # INFO lines starting with "success:" are promoted to alert_type "success".
                "alert_type": "success",
                "event_type": event_type,
                "aggregation_key": "foo_bar",
                "event_object": "foo_bar",
                "msg_title": "success: foo_bar entered RUNNING state, "
                             "process has stayed up for > than 2 seconds (startsecs)",
                "timestamp": int(time.mktime(datetime(2012, 7, 14, 3, 2, 47).timetuple())),
            },
            {
                "alert_type": "error",
                "event_type": event_type,
                "aggregation_key": "inet_http_server",
                "event_object": "inet_http_server",
                "msg_title": "Server 'inet_http_server' running without any HTTP authentication checking",
                "timestamp": int(time.mktime(datetime(2012, 7, 17, 2, 53, 4).timetuple())),
            },
            {
                "alert_type": "warning",
                "event_type": event_type,
                "aggregation_key": "SIGTERM",
                "event_object": "SIGTERM",
                "msg_title": "received SIGTERM indicating exit request",
                "timestamp": int(time.mktime(datetime(2012, 7, 14, 4, 54, 34).timetuple())),
            },
        ]}
    self._write_log(log_data.split("\n"))
    dogstream = Dogstreams.init(
        self.logger,
        {'dogstreams': '%s:dogstream.supervisord_log:parse_supervisord' % self.log_file.name})
    actual_output = dogstream.check(self.config, move_end=False)
    self.assertEquals(expected_output, actual_output)
def test_cassandra_parser(self):
    """Cassandra system.log lines become dogstream events; 'Saved ...' lines are ignored.

    NOTE(review): the line breaks inside log_data were mangled in this copy;
    reconstructed with one entry per line since the text is split on "\\n"
    before being written to the tailed log -- verify against upstream.
    """
    from dogstream import cassandra, common
    log_data = """ INFO [CompactionExecutor:1594] 2012-05-12 21:05:12,924 Saved test_data-Encodings-KeyCache (86400 items) in 85 ms
 INFO [CompactionExecutor:1595] 2012-05-12 21:05:15,144 Saved test_data-Metrics-KeyCache (86400 items) in 96 ms
 INFO [CompactionExecutor:1596] 2012-05-12 21:10:48,058 Compacting [SSTableReader(path='/var/cassandra/data/test_data/series-hc-6528-Data.db'), SSTableReader(path='/var/cassandra/data/test_data/series-hc-6531-Data.db'), SSTableReader(path='/var/cassandra/data/test_data/series-hc-6529-Data.db'), SSTableReader(path='/var/cassandra/data/test_data/series-hc-6530-Data.db')]
 INFO [CompactionExecutor:1596] 2012-05-12 21:10:54,851 Compacted to [/var/cassandra/a-hc-65-Data.db,]. 102,079,134 to 101,546,397
 INFO [CompactionExecutor:1598] 2012-05-12 22:05:04,313 Saved test_data-ResourcesMetadata-KeyCache (1 items) in 10 ms
 INFO [CompactionExecutor:1599] 2012-05-12 22:05:14,813 Saved test_data-Encodings-KeyCache (86400 items) in 83 ms
 INFO [CompactionExecutor:1630] 2012-05-13 13:05:44,963 Saved test_data-Metrics-KeyCache (86400 items) in 77 ms
 INFO [CompactionExecutor:1631] 2012-05-13 13:15:01,923 Nothing to compact in data_log. Use forceUserDefinedCompaction if you wish to force compaction of single sstables (e.g. 
for tombstone collection)
 INFO [CompactionExecutor:1632] 2012-05-13 13:15:01,927 Compacting [SSTableReader(path='/var/cassandra/data/test_data/series-hc-6527-Data.db'), SSTableReader(path='/var/cassandra/data/test_data/series-hc-6522-Data.db'), SSTableReader(path='/var/cassandra/data/test_data/series-hc-6532-Data.db'), SSTableReader(path='/var/cassandra/data/test_data/series-hc-6517-Data.db')]
 INFO [CompactionExecutor:1632] 2012-05-13 13:27:17,685 Compacting large row test_data/series:6c6f677c32 (782001077 bytes) incrementally
 INFO [CompactionExecutor:34] 2012-05-14 18:00:41,281 Saved test_data-Encodings-KeyCache (86400 items) in 78 ms
 INFO 13:27:17,685 Compacting large row test_data/series:6c6f677c32 (782001077 bytes) incrementally
"""
    alert_type = cassandra.ALERT_TYPES["INFO"]
    event_type = cassandra.EVENT_TYPE
    event_object = EventDefaults.EVENT_OBJECT
    expected_output = {
        "dogstreamEvents": [
            {
                "timestamp": cassandra.parse_date("2012-05-12 21:10:48,058"),
                # Titles are truncated to the shared maximum title length.
                "msg_title": "Compacting [SSTableReader(path='/var/cassandra/data/test_data/series-hc-6528-Data.db'), SSTableReader(path='/var/cassandra/data/test_data/series-hc-6531-Data.db'), SSTableReader(path='/var/cassandra/data/test_data/series-hc-6529-Data.db'), SSTableReader(path='/var/cassandra/data/test_data/series-hc-6530-Data.db')]"[
                    0:common.MAX_TITLE_LEN],
                "msg_text": "Compacting [SSTableReader(path='/var/cassandra/data/test_data/series-hc-6528-Data.db'), SSTableReader(path='/var/cassandra/data/test_data/series-hc-6531-Data.db'), SSTableReader(path='/var/cassandra/data/test_data/series-hc-6529-Data.db'), SSTableReader(path='/var/cassandra/data/test_data/series-hc-6530-Data.db')]",
                "alert_type": alert_type,
                "auto_priority": 0,
                "event_type": event_type,
                "aggregation_key": event_object,
                "event_object": event_object,
            },
            {
                "timestamp": cassandra.parse_date("2012-05-12 21:10:54,851"),
                "msg_title": "Compacted to [/var/cassandra/a-hc-65-Data.db,]. 102,079,134 to 101,546,397",
                "alert_type": alert_type,
                "auto_priority": 0,
                "event_type": event_type,
                "aggregation_key": event_object,
                "event_object": event_object,
            },
            {
                "timestamp": cassandra.parse_date("2012-05-13 13:15:01,927"),
                "msg_title": "Compacting [SSTableReader(path='/var/cassandra/data/test_data/series-hc-6527-Data.db'), SSTableReader(path='/var/cassandra/data/test_data/series-hc-6522-Data.db'), SSTableReader(path='/var/cassandra/data/test_data/series-hc-6532-Data.db'), SSTableReader(path='/var/cassandra/data/test_data/series-hc-6517-Data.db')]"[
                    0:common.MAX_TITLE_LEN],
                "msg_text": "Compacting [SSTableReader(path='/var/cassandra/data/test_data/series-hc-6527-Data.db'), SSTableReader(path='/var/cassandra/data/test_data/series-hc-6522-Data.db'), SSTableReader(path='/var/cassandra/data/test_data/series-hc-6532-Data.db'), SSTableReader(path='/var/cassandra/data/test_data/series-hc-6517-Data.db')]",
                "alert_type": alert_type,
                "event_type": event_type,
                "auto_priority": 0,
                "aggregation_key": event_object,
                "event_object": event_object,
            },
            {
                "timestamp": cassandra.parse_date("2012-05-13 13:27:17,685"),
                "msg_title": "Compacting large row test_data/series:6c6f677c32 (782001077 bytes) incrementally",
                "alert_type": alert_type,
                "event_type": event_type,
                "auto_priority": 0,
                "aggregation_key": event_object,
                "event_object": event_object,
            },
            {
                # A line with no date gets today's (UTC) date prepended by the parser.
                "timestamp": cassandra.parse_date(
                    datetime.utcnow().strftime("%Y-%m-%d") + " 13:27:17,685"),
                "msg_title": "Compacting large row test_data/series:6c6f677c32 (782001077 bytes) incrementally",
                "alert_type": alert_type,
                "event_type": event_type,
                "auto_priority": 0,
                "aggregation_key": event_object,
                "event_object": event_object,
            },
        ]
    }
    self._write_log(log_data.split("\n"))
    dogstream = Dogstreams.init(
        self.logger, {
            'dogstreams': '%s:dogstream.cassandra:parse_cassandra' % self.log_file.name
        })
    actual_output = dogstream.check(self.config, move_end=False)
    self.assertEquals(expected_output, actual_output)
def test_cassandra_parser(self):
    """Cassandra system.log lines become dogstream events; 'Saved ...' lines are ignored.

    NOTE(review): the line breaks inside log_data were mangled in this copy;
    reconstructed with one entry per line since the text is split on "\\n"
    before being written to the tailed log -- verify against upstream.
    """
    from dogstream import cassandra, common
    log_data = """ INFO [CompactionExecutor:1594] 2012-05-12 21:05:12,924 Saved test_data-Encodings-KeyCache (86400 items) in 85 ms
 INFO [CompactionExecutor:1595] 2012-05-12 21:05:15,144 Saved test_data-Metrics-KeyCache (86400 items) in 96 ms
 INFO [CompactionExecutor:1596] 2012-05-12 21:10:48,058 Compacting [SSTableReader(path='/var/cassandra/data/test_data/series-hc-6528-Data.db'), SSTableReader(path='/var/cassandra/data/test_data/series-hc-6531-Data.db'), SSTableReader(path='/var/cassandra/data/test_data/series-hc-6529-Data.db'), SSTableReader(path='/var/cassandra/data/test_data/series-hc-6530-Data.db')]
 INFO [CompactionExecutor:1596] 2012-05-12 21:10:54,851 Compacted to [/var/cassandra/a-hc-65-Data.db,]. 102,079,134 to 101,546,397
 INFO [CompactionExecutor:1598] 2012-05-12 22:05:04,313 Saved test_data-ResourcesMetadata-KeyCache (1 items) in 10 ms
 INFO [CompactionExecutor:1599] 2012-05-12 22:05:14,813 Saved test_data-Encodings-KeyCache (86400 items) in 83 ms
 INFO [CompactionExecutor:1630] 2012-05-13 13:05:44,963 Saved test_data-Metrics-KeyCache (86400 items) in 77 ms
 INFO [CompactionExecutor:1631] 2012-05-13 13:15:01,923 Nothing to compact in data_log. Use forceUserDefinedCompaction if you wish to force compaction of single sstables (e.g. 
for tombstone collection)
 INFO [CompactionExecutor:1632] 2012-05-13 13:15:01,927 Compacting [SSTableReader(path='/var/cassandra/data/test_data/series-hc-6527-Data.db'), SSTableReader(path='/var/cassandra/data/test_data/series-hc-6522-Data.db'), SSTableReader(path='/var/cassandra/data/test_data/series-hc-6532-Data.db'), SSTableReader(path='/var/cassandra/data/test_data/series-hc-6517-Data.db')]
 INFO [CompactionExecutor:1632] 2012-05-13 13:27:17,685 Compacting large row test_data/series:6c6f677c32 (782001077 bytes) incrementally
 INFO [CompactionExecutor:34] 2012-05-14 18:00:41,281 Saved test_data-Encodings-KeyCache (86400 items) in 78 ms
 INFO 13:27:17,685 Compacting large row test_data/series:6c6f677c32 (782001077 bytes) incrementally
"""
    alert_type = cassandra.ALERT_TYPES["INFO"]
    event_type = cassandra.EVENT_TYPE
    event_object = EventDefaults.EVENT_OBJECT
    expected_output = {
        "dogstreamEvents": [
            {
                "timestamp": cassandra.parse_date("2012-05-12 21:10:48,058"),
                # Titles are truncated to the shared maximum title length.
                "msg_title": "Compacting [SSTableReader(path='/var/cassandra/data/test_data/series-hc-6528-Data.db'), SSTableReader(path='/var/cassandra/data/test_data/series-hc-6531-Data.db'), SSTableReader(path='/var/cassandra/data/test_data/series-hc-6529-Data.db'), SSTableReader(path='/var/cassandra/data/test_data/series-hc-6530-Data.db')]"[0:common.MAX_TITLE_LEN],
                "msg_text": "Compacting [SSTableReader(path='/var/cassandra/data/test_data/series-hc-6528-Data.db'), SSTableReader(path='/var/cassandra/data/test_data/series-hc-6531-Data.db'), SSTableReader(path='/var/cassandra/data/test_data/series-hc-6529-Data.db'), SSTableReader(path='/var/cassandra/data/test_data/series-hc-6530-Data.db')]",
                "alert_type": alert_type,
                "auto_priority": 0,
                "event_type": event_type,
                "aggregation_key": event_object,
                "event_object": event_object,
            },
            {
                "timestamp": cassandra.parse_date("2012-05-12 21:10:54,851"),
                "msg_title": "Compacted to [/var/cassandra/a-hc-65-Data.db,]. 102,079,134 to 101,546,397",
                "alert_type": alert_type,
                "auto_priority": 0,
                "event_type": event_type,
                "aggregation_key": event_object,
                "event_object": event_object,
            },
            {
                "timestamp": cassandra.parse_date("2012-05-13 13:15:01,927"),
                "msg_title": "Compacting [SSTableReader(path='/var/cassandra/data/test_data/series-hc-6527-Data.db'), SSTableReader(path='/var/cassandra/data/test_data/series-hc-6522-Data.db'), SSTableReader(path='/var/cassandra/data/test_data/series-hc-6532-Data.db'), SSTableReader(path='/var/cassandra/data/test_data/series-hc-6517-Data.db')]"[0:common.MAX_TITLE_LEN],
                "msg_text": "Compacting [SSTableReader(path='/var/cassandra/data/test_data/series-hc-6527-Data.db'), SSTableReader(path='/var/cassandra/data/test_data/series-hc-6522-Data.db'), SSTableReader(path='/var/cassandra/data/test_data/series-hc-6532-Data.db'), SSTableReader(path='/var/cassandra/data/test_data/series-hc-6517-Data.db')]",
                "alert_type": alert_type,
                "event_type": event_type,
                "auto_priority": 0,
                "aggregation_key": event_object,
                "event_object": event_object,
            },
            {
                "timestamp": cassandra.parse_date("2012-05-13 13:27:17,685"),
                "msg_title": "Compacting large row test_data/series:6c6f677c32 (782001077 bytes) incrementally",
                "alert_type": alert_type,
                "event_type": event_type,
                "auto_priority": 0,
                "aggregation_key": event_object,
                "event_object": event_object,
            },
            {
                # A line with no date gets today's (UTC) date prepended by the parser.
                "timestamp": cassandra.parse_date(datetime.utcnow().strftime("%Y-%m-%d") + " 13:27:17,685"),
                "msg_title": "Compacting large row test_data/series:6c6f677c32 (782001077 bytes) incrementally",
                "alert_type": alert_type,
                "event_type": event_type,
                "auto_priority": 0,
                "aggregation_key": event_object,
                "event_object": event_object,
            },
        ]
    }
    self._write_log(log_data.split("\n"))
    dogstream = Dogstreams.init(
        self.logger,
        {'dogstreams': '%s:dogstream.cassandra:parse_cassandra' % self.log_file.name})
    actual_output = dogstream.check(self.config, move_end=False)
    self.assertEquals(expected_output, actual_output)
def test_service_perfdata(self):
    """A Nagios service_perfdata log line is parsed into one gauge point
    per perfdata item, named nagios.<service_desc>.<label>."""
    from checks.datadog import NagiosServicePerfData
    # Nagios config whose perfdata template declares the tab-separated
    # field layout used by log_data below.
    self._write_nagios_config([
        "service_perfdata_file=%s" % self.log_file.name,
        "service_perfdata_file_template=DATATYPE::SERVICEPERFDATA\tTIMET::$TIMET$\tHOSTNAME::$HOSTNAME$\tSERVICEDESC::$SERVICEDESC$\tSERVICEPERFDATA::$SERVICEPERFDATA$\tSERVICECHECKCOMMAND::$SERVICECHECKCOMMAND$\tHOSTSTATE::$HOSTSTATE$\tHOSTSTATETYPE::$HOSTSTATETYPE$\tSERVICESTATE::$SERVICESTATE$\tSERVICESTATETYPE::$SERVICESTATETYPE$",
    ])
    dogstream = Dogstreams.init(self.logger, self.agent_config)
    # The config above should yield exactly one NagiosServicePerfData stream.
    self.assertEquals([NagiosServicePerfData],
                      [d.__class__ for d in dogstream.dogstreams])
    log_data = [
        (
            "DATATYPE::SERVICEPERFDATA",
            "TIMET::1000000000",
            "HOSTNAME::myhost0",
            "SERVICEDESC::Pgsql Backends",
            # Each item is label=value[;warn;crit;min;max] (per the expected
            # attribute mapping asserted below).
            "SERVICEPERFDATA::" + " ".join([
                "time=0.06",
                "db0=33;180;190;0;200",
                "db1=1;150;190;0;200",
                "db2=0;120;290;1;200",
                "db3=0;110;195;5;100"
            ]),
            "SERVICECHECKCOMMAND::check_nrpe_1arg!check_postgres_backends",
            "HOSTSTATE::UP",
            "HOSTSTATETYPE::HARD",
            "SERVICESTATE::OK",
            "SERVICESTATETYPE::HARD",
        ),
    ]
    # Expected points: (metric_name, timestamp, value, attributes).
    # 'time' has no thresholds; the dbN items carry warn/crit/min/max.
    expected_output = [
        ('nagios.pgsql_backends.time', 1000000000, 0.06, {
            'metric_type': 'gauge',
            'host_name': 'myhost0',
        }),
        ('nagios.pgsql_backends.db0', 1000000000, 33., {
            'metric_type': 'gauge',
            'host_name': 'myhost0',
            'warn': '180',
            'crit': '190',
            'min': '0',
            'max': '200',
        }),
        ('nagios.pgsql_backends.db1', 1000000000, 1., {
            'metric_type': 'gauge',
            'host_name': 'myhost0',
            'warn': '150',
            'crit': '190',
            'min': '0',
            'max': '200',
        }),
        ('nagios.pgsql_backends.db2', 1000000000, 0., {
            'metric_type': 'gauge',
            'host_name': 'myhost0',
            'warn': '120',
            'crit': '290',
            'min': '1',
            'max': '200',
        }),
        ('nagios.pgsql_backends.db3', 1000000000, 0., {
            'metric_type': 'gauge',
            'host_name': 'myhost0',
            'warn': '110',
            'crit': '195',
            'min': '5',
            'max': '100',
        }),
    ]
    # Sort both sides so the comparison is order-independent.
    expected_output.sort(key=point_sorter)
    self._write_log(('\t'.join(data) for data in log_data))
    actual_output = dogstream.check(self.agent_config, move_end=False)['dogstream']
    actual_output.sort(key=point_sorter)
    self.assertEquals(expected_output, actual_output)
def test_service_perfdata_special_cases(self):
    """Perfdata labels that are device paths (with a MB unit suffix) are
    parsed into a single metric with a device_name/unit attribute pair."""
    from checks.datadog import NagiosServicePerfData
    # Same tab-separated perfdata template as the basic service test.
    self._write_nagios_config([
        "service_perfdata_file=%s" % self.log_file.name,
        "service_perfdata_file_template=DATATYPE::SERVICEPERFDATA\tTIMET::$TIMET$\tHOSTNAME::$HOSTNAME$\tSERVICEDESC::$SERVICEDESC$\tSERVICEPERFDATA::$SERVICEPERFDATA$\tSERVICECHECKCOMMAND::$SERVICECHECKCOMMAND$\tHOSTSTATE::$HOSTSTATE$\tHOSTSTATETYPE::$HOSTSTATETYPE$\tSERVICESTATE::$SERVICESTATE$\tSERVICESTATETYPE::$SERVICESTATETYPE$",
    ])
    dogstream = Dogstreams.init(self.logger, self.agent_config)
    self.assertEquals([NagiosServicePerfData],
                      [d.__class__ for d in dogstream.dogstreams])
    log_data = [(
        "DATATYPE::SERVICEPERFDATA",
        "TIMET::1000000000",
        "HOSTNAME::myhost2",
        "SERVICEDESC::Disk Space",
        # Items are <mountpoint>=<value>MB;warn;crit;min;max.
        "SERVICEPERFDATA::" + " ".join([
            "/=5477MB;6450;7256;0;8063",
            "/dev=0MB;2970;3341;0;3713",
            "/dev/shm=0MB;3080;3465;0;3851",
            "/var/run=0MB;3080;3465;0;3851",
            "/var/lock=0MB;3080;3465;0;3851",
            "/lib/init/rw=0MB;3080;3465;0;3851",
            "/mnt=290MB;338636;380966;0;423296",
            "/data=39812MB;40940;46057;0;51175",
        ]),
        "SERVICECHECKCOMMAND::check_all_disks!20%!10%",
        "HOSTSTATE::UP",
        "HOSTSTATETYPE::HARD",
        "SERVICESTATE::OK",
        "SERVICESTATETYPE::HARD",
    )]
    # All points share one metric name; the mountpoint becomes device_name
    # and the MB suffix becomes the unit attribute.
    expected_output = [
        ('nagios.disk_space', 1000000000, 5477., {
            'metric_type': 'gauge',
            'host_name': 'myhost2',
            'device_name': '/',
            'unit': 'MB',
            'warn': '6450',
            'crit': '7256',
            'min': '0',
            'max': '8063',
        }),
        ('nagios.disk_space', 1000000000, 0., {
            'metric_type': 'gauge',
            'host_name': 'myhost2',
            'device_name': '/dev',
            'unit': 'MB',
            'warn': '2970',
            'crit': '3341',
            'min': '0',
            'max': '3713',
        }),
        ('nagios.disk_space', 1000000000, 0., {
            'metric_type': 'gauge',
            'host_name': 'myhost2',
            'device_name': '/dev/shm',
            'unit': 'MB',
            'warn': '3080',
            'crit': '3465',
            'min': '0',
            'max': '3851',
        }),
        ('nagios.disk_space', 1000000000, 0., {
            'metric_type': 'gauge',
            'host_name': 'myhost2',
            'device_name': '/var/run',
            'unit': 'MB',
            'warn': '3080',
            'crit': '3465',
            'min': '0',
            'max': '3851',
        }),
        ('nagios.disk_space', 1000000000, 0., {
            'metric_type': 'gauge',
            'host_name': 'myhost2',
            'device_name': '/var/lock',
            'unit': 'MB',
            'warn': '3080',
            'crit': '3465',
            'min': '0',
            'max': '3851',
        }),
        ('nagios.disk_space', 1000000000, 0., {
            'metric_type': 'gauge',
            'host_name': 'myhost2',
            'device_name': '/lib/init/rw',
            'unit': 'MB',
            'warn': '3080',
            'crit': '3465',
            'min': '0',
            'max': '3851',
        }),
        ('nagios.disk_space', 1000000000, 290., {
            'metric_type': 'gauge',
            'host_name': 'myhost2',
            'device_name': '/mnt',
            'unit': 'MB',
            'warn': '338636',
            'crit': '380966',
            'min': '0',
            'max': '423296',
        }),
        ('nagios.disk_space', 1000000000, 39812., {
            'metric_type': 'gauge',
            'host_name': 'myhost2',
            'device_name': '/data',
            'unit': 'MB',
            'warn': '40940',
            'crit': '46057',
            'min': '0',
            'max': '51175',
        }),
    ]
    # Sort both sides so the comparison is order-independent.
    expected_output.sort(key=point_sorter)
    self._write_log(('\t'.join(data) for data in log_data))
    actual_output = dogstream.check(self.agent_config, move_end=False)['dogstream']
    actual_output.sort(key=point_sorter)
    self.assertEquals(expected_output, actual_output)
def __init__(self, agentConfig, emitters, systemStats):
    """Set up the collector: stash config, build the per-OS system checks,
    the old-style metric checks, and any user-configured custom checks.

    agentConfig: dict of agent settings (mutated: 'system_stats' is added).
    emitters: callables used later to ship collected payloads.
    systemStats: output of config.get_system_stats, cached on the config.
    """
    self.agentConfig = agentConfig
    # system stats is generated by config.get_system_stats
    self.agentConfig['system_stats'] = systemStats
    # agent config is used during checks, system_stats can be accessed through the config
    self.os = getOS()
    self.plugins = None
    self.emitters = emitters
    self.metadata_interval = int(agentConfig.get('metadata_interval', 10 * 60))
    self.metadata_start = time.time()
    # Global socket timeout so a hung check endpoint cannot block a run forever.
    socket.setdefaulttimeout(15)
    self.run_count = 0
    self.continue_running = True

    # Unix System Checks
    self._unix_system_checks = {
        'disk': u.Disk(checks_logger),
        'io': u.IO(),
        'load': u.Load(checks_logger),
        'memory': u.Memory(checks_logger),
        'network': u.Network(checks_logger),
        'processes': u.Processes(),
        'cpu': u.Cpu(checks_logger)
    }

    # Win32 System Checks
    self._win32_system_checks = {
        'disk': w32.Disk(checks_logger),
        'io': w32.IO(checks_logger),
        'proc': w32.Processes(checks_logger),
        'memory': w32.Memory(checks_logger),
        'network': w32.Network(checks_logger),
        'cpu': w32.Cpu(checks_logger)
    }

    # Old-style metric checks
    self._couchdb = CouchDb(checks_logger)
    self._mongodb = MongoDb(checks_logger)
    self._mysql = MySql(checks_logger)
    self._rabbitmq = RabbitMq()
    self._ganglia = Ganglia(checks_logger)
    self._cassandra = Cassandra()
    self._dogstream = Dogstreams.init(checks_logger, self.agentConfig)
    self._ddforwarder = DdForwarder(checks_logger, self.agentConfig)
    self._ec2 = EC2(checks_logger)

    # Metric Checks
    self._metrics_checks = [
        ElasticSearch(checks_logger),
        Jvm(checks_logger),
        Tomcat(checks_logger),
        ActiveMQ(checks_logger),
        Solr(checks_logger),
        WMICheck(checks_logger),
        Memcache(checks_logger),
    ]

    # Custom metric checks: 'custom_checks' is a comma-separated list of
    # module specs; an unloadable module is logged and skipped.
    for module_spec in [s.strip() for s in self.agentConfig.get('custom_checks', '').split(',')]:
        if len(module_spec) == 0:
            continue
        try:
            self._metrics_checks.append(
                modules.load(module_spec, 'Check')(checks_logger))
            logger.info("Registered custom check %s" % module_spec)
        except Exception:
            # Dropped the unused Py2-only `except Exception, e` binding;
            # logger.exception() already records the active traceback.
            logger.exception('Unable to load custom check module %s' % module_spec)
def __init__(self, agentConfig, emitters, systemStats, hostname):
    """Initialize the collector.

    agentConfig: dict of agent settings (mutated: "system_stats" is added).
    emitters: callables that ship the collected payload.
    systemStats: output of config.get_system_stats, cached on the config.
    hostname: hostname this collector reports as.
    """
    self.emit_duration = None
    self.agentConfig = agentConfig
    self.hostname = hostname
    # system stats is generated by config.get_system_stats
    self.agentConfig["system_stats"] = systemStats
    # agent config is used during checks, system_stats can be accessed through the config
    self.os = get_os()
    self.plugins = None
    self.emitters = emitters
    self.check_timings = agentConfig.get("check_timings")
    # Scheduling state for periodic payloads: each entry tracks its last
    # start time and run interval (seconds).
    self.push_times = {
        "host_metadata": {"start": time.time(), "interval": int(agentConfig.get("metadata_interval", 4 * 60 * 60))},
        "external_host_tags": {
            "start": time.time() - 3 * 60,  # Wait for the checks to init
            "interval": int(agentConfig.get("external_host_tags", 5 * 60)),
        },
        "agent_checks": {"start": time.time(), "interval": int(agentConfig.get("agent_checks_interval", 10 * 60))},
        "processes": {"start": time.time(), "interval": int(agentConfig.get("processes_interval", 60))},
    }
    # Global socket timeout so a hung endpoint cannot block a run forever.
    socket.setdefaulttimeout(15)
    self.run_count = 0
    self.continue_running = True
    self.hostname_metadata_cache = None
    self.initialized_checks_d = []
    self.init_failed_checks_d = {}

    # Unix System Checks
    self._unix_system_checks = {
        "io": u.IO(log),
        "load": u.Load(log),
        "memory": u.Memory(log),
        "processes": u.Processes(log),
        "cpu": u.Cpu(log),
        "system": u.System(log),
    }

    # Win32 System Checks
    self._win32_system_checks = {
        "io": w32.IO(log),
        "proc": w32.Processes(log),
        "memory": w32.Memory(log),
        "network": w32.Network(log),
        "cpu": w32.Cpu(log),
        "system": w32.System(log),
    }

    # Old-style metric checks; only instantiated when configured.
    self._ganglia = Ganglia(log) if self.agentConfig.get("ganglia_host", "") != "" else None
    self._dogstream = None if self.agentConfig.get("dogstreams") is None else Dogstreams.init(log, self.agentConfig)

    # Agent performance metrics check (picked up later from checks.d)
    self._agent_metrics = None

    self._metrics_checks = []

    # Custom metric checks: 'custom_checks' is a comma-separated list of
    # module specs; an unloadable module is logged and skipped.
    for module_spec in [s.strip() for s in self.agentConfig.get("custom_checks", "").split(",")]:
        if len(module_spec) == 0:
            continue
        try:
            self._metrics_checks.append(modules.load(module_spec, "Check")(log))
            log.info("Registered custom check %s" % module_spec)
            # NOTE(review): this deprecation warning fires on every *successful*
            # registration — presumably intentional, to nudge users to checks.d.
            log.warning(
                "Old format custom checks are deprecated. They should be moved to the checks.d interface as old custom checks will be removed in a next version"
            )
        except Exception:
            log.exception("Unable to load custom check module %s" % module_spec)
def __init__(self, agentConfig, emitters, systemStats): self.emit_duration = None self.agentConfig = agentConfig # system stats is generated by config.get_system_stats self.agentConfig["system_stats"] = systemStats # agent config is used during checks, system_stats can be accessed through the config self.os = get_os() self.plugins = None self.emitters = emitters self.metadata_interval = int(agentConfig.get("metadata_interval", 10 * 60)) self.metadata_start = time.time() socket.setdefaulttimeout(15) self.run_count = 0 self.continue_running = True self.metadata_cache = None self.checks_d = [] # Unix System Checks self._unix_system_checks = { "disk": u.Disk(log), "io": u.IO(log), "load": u.Load(log), "memory": u.Memory(log), "network": u.Network(log), "processes": u.Processes(log), "cpu": u.Cpu(log), } # Win32 System `Checks self._win32_system_checks = { "disk": w32.Disk(log), "io": w32.IO(log), "proc": w32.Processes(log), "memory": w32.Memory(log), "network": w32.Network(log), "cpu": w32.Cpu(log), } # Old-style metric checks self._mongodb = MongoDb(log) self._mysql = MySql(log) self._rabbitmq = RabbitMq() self._ganglia = Ganglia(log) self._cassandra = Cassandra() self._dogstream = Dogstreams.init(log, self.agentConfig) self._ddforwarder = DdForwarder(log, self.agentConfig) self._ec2 = EC2(log) # Agent Metrics self._agent_metrics = CollectorMetrics(log) # Metric Checks self._metrics_checks = [Memcache(log)] # Custom metric checks for module_spec in [s.strip() for s in self.agentConfig.get("custom_checks", "").split(",")]: if len(module_spec) == 0: continue try: self._metrics_checks.append(modules.load(module_spec, "Check")(log)) log.info("Registered custom check %s" % module_spec) except Exception, e: log.exception("Unable to load custom check module %s" % module_spec)
def run(self, checksd=None, start_event=True, configs_reloaded=False):
    """ Collect data from each check and submit their data.

    checksd: dict with 'initialized_checks' (list of AgentCheck instances)
        and 'init_failed_checks' ({check_name: {error, traceback}}).
    start_event: forwarded to payload-metadata population.
    configs_reloaded: forces re-detection of the AgentMetrics check.
    Returns the emitted payload, or None if the run was interrupted.
    """
    log.debug("Found {num_checks} checks".format(
        num_checks=len(checksd['initialized_checks'])))
    timer = Timer()
    if not Platform.is_windows():
        # Wall-clock vs CPU-clock: cpu_clock is used at the end to compute
        # CPU time consumed by this collection run.
        cpu_clock = time.clock()
    self.run_count += 1
    log.debug("Starting collection run #%s" % self.run_count)

    if checksd:
        self.initialized_checks_d = checksd[
            'initialized_checks']  # is a list of AgentCheck instances
        self.init_failed_checks_d = checksd[
            'init_failed_checks']  # is of type {check_name: {error, traceback}}

    payload = AgentPayload()

    # Find the AgentMetrics check and pop it out
    # This check must run at the end of the loop to collect info on agent performance
    if not self._agent_metrics or configs_reloaded:
        for check in self.initialized_checks_d:
            if check.name == AGENT_METRICS_CHECK_NAME:
                self._agent_metrics = check
                self.initialized_checks_d.remove(check)
                break

    # Initialize payload
    self._build_payload(payload)

    metrics = payload['metrics']
    events = payload['events']
    service_checks = payload['service_checks']

    # Run the system checks. Checks will depend on the OS
    if Platform.is_windows():
        # Win32 system checks
        for check_name in ['memory', 'cpu', 'io', 'proc', 'system']:
            try:
                metrics.extend(self._win32_system_checks[check_name].check(
                    self.agentConfig))
            except Exception:
                log.exception('Unable to get %s metrics', check_name)
    else:
        # Unix system checks
        sys_checks = self._unix_system_checks

        # These checks return a dict merged straight into the payload.
        for check_name in ['load', 'system', 'cpu', 'file_handles']:
            try:
                result_check = sys_checks[check_name].check(self.agentConfig)
                if result_check:
                    payload.update(result_check)
            except Exception:
                log.exception('Unable to get %s metrics', check_name)

        try:
            memory = sys_checks['memory'].check(self.agentConfig)
        except Exception:
            log.exception('Unable to get memory metrics')
        else:
            if memory:
                # Flatten the memory check's result into the legacy
                # top-level payload keys.
                memstats = {
                    'memPhysUsed': memory.get('physUsed'),
                    'memPhysPctUsable': memory.get('physPctUsable'),
                    'memPhysFree': memory.get('physFree'),
                    'memPhysTotal': memory.get('physTotal'),
                    'memPhysUsable': memory.get('physUsable'),
                    'memSwapUsed': memory.get('swapUsed'),
                    'memSwapFree': memory.get('swapFree'),
                    'memSwapPctFree': memory.get('swapPctFree'),
                    'memSwapTotal': memory.get('swapTotal'),
                    'memCached': memory.get('physCached'),
                    'memBuffers': memory.get('physBuffers'),
                    'memShared': memory.get('physShared'),
                    'memSlab': memory.get('physSlab'),
                    'memPageTables': memory.get('physPageTables'),
                    'memSwapCached': memory.get('swapCached')
                }
                payload.update(memstats)

        try:
            ioStats = sys_checks['io'].check(self.agentConfig)
        except Exception:
            log.exception('Unable to get io metrics')
        else:
            if ioStats:
                payload['ioStats'] = ioStats

        try:
            processes = sys_checks['processes'].check(self.agentConfig)
        except Exception:
            log.exception('Unable to get processes metrics')
        else:
            payload.update({'processes': processes})

    # Run old-style checks
    if self._ganglia is not None:
        payload['ganglia'] = self._ganglia.check(self.agentConfig)
    if self._dogstream is not None:
        # every 10 run (~2min) we reload the list of files watched by
        # dogstream
        if (self.run_count % 10) == 0:
            log.info("reloading list of files watched by Dogstreams")
            self._dogstream = Dogstreams.init(log, self.agentConfig)
        dogstreamData = self._dogstream.check(self.agentConfig)
        dogstreamEvents = dogstreamData.get('dogstreamEvents', None)
        if dogstreamEvents:
            # Merge dogstream events into the payload's event map, then
            # strip them so the remaining data can be merged wholesale.
            if 'dogstream' in payload['events']:
                events['dogstream'].extend(dogstreamEvents)
            else:
                events['dogstream'] = dogstreamEvents
            del dogstreamData['dogstreamEvents']
        payload.update(dogstreamData)

    # process collector of gohai (compliant with payload of legacy "resources checks")
    if not Platform.is_windows() and self._should_send_additional_data(
            'processes'):
        gohai_processes = self._run_gohai_processes()
        if gohai_processes:
            try:
                gohai_processes_json = json.loads(gohai_processes)
                processes_snaps = gohai_processes_json.get('processes')
                if processes_snaps:
                    processes_payload = {'snaps': [processes_snaps]}
                    payload['resources'] = {
                        'processes': processes_payload,
                        'meta': {
                            'host': self.hostname,
                        }
                    }
            except Exception:
                log.exception("Error running gohai processes collection")

    # newer-style checks (not checks.d style)
    for metrics_check in self._metrics_checks:
        res = metrics_check.check(self.agentConfig)
        if res:
            metrics.extend(res)

    # Use `info` log level for some messages on the first run only, then `debug`
    log_at_first_run = log.info if self._is_first_run() else log.debug

    # checks.d checks
    check_statuses = []
    for check in self.initialized_checks_d:
        if not self.continue_running:
            return
        log_at_first_run("Running check %s", check.name)
        instance_statuses = []
        metric_count = 0
        event_count = 0
        service_check_count = 0
        check_start_time = time.time()
        check_stats = None

        try:
            # Run the check.
            instance_statuses = check.run()

            # Collect the metrics and events.
            current_check_metrics = check.get_metrics()
            current_check_events = check.get_events()
            check_stats = check._get_internal_profiling_stats()
            # Collect metadata
            current_check_metadata = check.get_service_metadata()

            # Save metrics & events for the payload.
            metrics.extend(current_check_metrics)
            if current_check_events:
                if check.name not in events:
                    events[check.name] = current_check_events
                else:
                    events[check.name] += current_check_events

            # Save the status of the check.
            metric_count = len(current_check_metrics)
            event_count = len(current_check_events)
        except Exception:
            # NOTE(review): if check.run() itself raises, current_check_metadata
            # is never bound and the CheckStatus call below would NameError —
            # presumably check.run() catches internally; confirm.
            log.exception("Error running check %s" % check.name)

        check_status = CheckStatus(
            check.name, instance_statuses, metric_count, event_count,
            service_check_count,
            service_metadata=current_check_metadata,
            library_versions=check.get_library_info(),
            source_type_name=check.SOURCE_TYPE_NAME or check.name,
            check_stats=check_stats,
            check_version=check.check_version)

        # Service check for Agent checks failures
        service_check_tags = ["check:%s" % check.name]
        if check_status.status == STATUS_OK:
            status = AgentCheck.OK
        elif check_status.status == STATUS_ERROR:
            status = AgentCheck.CRITICAL
        # NOTE(review): if check_status.status is neither STATUS_OK nor
        # STATUS_ERROR, `status` is unbound here — confirm those are the
        # only possible values.
        check.service_check('datadog.agent.check_status', status,
                            tags=service_check_tags)

        # Collect the service checks and save them in the payload
        current_check_service_checks = check.get_service_checks()
        if current_check_service_checks:
            service_checks.extend(current_check_service_checks)
            # -1 because the user doesn't care about the service check for check failure
            service_check_count = len(current_check_service_checks) - 1

        # Update the check status with the correct service_check_count
        check_status.service_check_count = service_check_count
        check_statuses.append(check_status)

        check_run_time = time.time() - check_start_time
        log.debug("Check %s ran in %.2f s" % (check.name, check_run_time))

        # Instrument check run timings if enabled.
        if self.check_timings:
            metric = 'datadog.agent.check_run_time'
            meta = {'tags': ["check:%s" % check.name]}
            metrics.append((metric, time.time(), check_run_time, meta))

    # Report checks that failed at init as failed statuses.
    for check_name, info in self.init_failed_checks_d.iteritems():
        if not self.continue_running:
            return
        check_status = CheckStatus(check_name, None, None, None, None,
                                   check_version=info.get(
                                       'version', 'unknown'),
                                   init_failed_error=info['error'],
                                   init_failed_traceback=info['traceback'])
        check_statuses.append(check_status)

    # Add a service check for the agent
    service_checks.append(
        create_service_check('datadog.agent.up', AgentCheck.OK,
                             hostname=self.hostname))

    # Store the metrics and events in the payload.
    payload['metrics'] = metrics
    payload['events'] = events
    payload['service_checks'] = service_checks

    # Populate metadata
    self._populate_payload_metadata(payload, check_statuses, start_event)

    collect_duration = timer.step()

    if self._agent_metrics:
        metric_context = {
            'collection_time': collect_duration,
            'emit_time': self.emit_duration,
        }
        if not Platform.is_windows():
            metric_context['cpu_time'] = time.clock() - cpu_clock

        self._agent_metrics.set_metric_context(payload, metric_context)
        self._agent_metrics.run()
        agent_stats = self._agent_metrics.get_metrics()
        payload['metrics'].extend(agent_stats)
        if self.agentConfig.get('developer_mode'):
            log.debug("\n Agent developer mode stats: \n {0}".format(
                Collector._stats_for_display(agent_stats)))
        # Flush metadata for the Agent Metrics check. Otherwise they'll just accumulate and leak.
        self._agent_metrics.get_service_metadata()

    # Let's send our payload
    emitter_statuses = payload.emit(log, self.agentConfig, self.emitters,
                                    self.continue_running)
    self.emit_duration = timer.step()

    if self._is_first_run():
        # This is not the exact payload sent to the backend as minor post
        # processing is done, but this will give us a good idea of what is sent
        # to the backend.
        data = payload.payload  # deep copy and merge of meta and metric data
        data['apiKey'] = '*************************' + data.get(
            'apiKey', '')[-5:]
        # removing unused keys for the metadata payload
        del data['metrics']
        del data['events']
        del data['service_checks']
        if data.get('processes'):
            data['processes'][
                'apiKey'] = '*************************' + data[
                    'processes'].get('apiKey', '')[-5:]
        log.debug("Metadata payload: %s", json.dumps(data))

    # Persist the status of the collection run.
    try:
        CollectorStatus(check_statuses, emitter_statuses,
                        self.hostname_metadata_cache).persist()
    except Exception:
        log.exception("Error persisting collector status")

    # Log a summary: at info level for early/periodic runs, debug otherwise.
    if self.run_count <= FLUSH_LOGGING_INITIAL or self.run_count % FLUSH_LOGGING_PERIOD == 0:
        log.info("Finished run #%s. Collection time: %ss. Emit time: %ss" %
                 (self.run_count, round(collect_duration, 2),
                  round(self.emit_duration, 2)))
        if self.run_count == FLUSH_LOGGING_INITIAL:
            log.info(
                "First flushes done, next flushes will be logged every %s flushes."
                % FLUSH_LOGGING_PERIOD)
    else:
        log.debug(
            "Finished run #%s. Collection time: %ss. Emit time: %ss" %
            (self.run_count, round(collect_duration, 2),
             round(self.emit_duration, 2)))

    return payload
def __init__(self, agentConfig, emitters, systemStats): self.agentConfig = agentConfig # system stats is generated by config.get_system_stats self.agentConfig['system_stats'] = systemStats # agent config is used during checks, system_stats can be accessed through the config self.os = getOS() self.plugins = None self.emitters = emitters self.metadata_interval = int(agentConfig.get('metadata_interval', 10 * 60)) self.metadata_start = time.time() socket.setdefaulttimeout(15) self.run_count = 0 self.continue_running = True # Unix System Checks self._unix_system_checks = { 'disk': u.Disk(checks_logger), 'io': u.IO(), 'load': u.Load(checks_logger), 'memory': u.Memory(checks_logger), 'network': u.Network(checks_logger), 'processes': u.Processes(), 'cpu': u.Cpu(checks_logger) } # Win32 System `Checks self._win32_system_checks = { 'disk': w32.Disk(checks_logger), 'io': w32.IO(checks_logger), 'proc': w32.Processes(checks_logger), 'memory': w32.Memory(checks_logger), 'network': w32.Network(checks_logger), 'cpu': w32.Cpu(checks_logger) } # Old-style metric checks self._couchdb = CouchDb(checks_logger) self._mongodb = MongoDb(checks_logger) self._mysql = MySql(checks_logger) self._rabbitmq = RabbitMq() self._ganglia = Ganglia(checks_logger) self._cassandra = Cassandra() self._dogstream = Dogstreams.init(checks_logger, self.agentConfig) self._ddforwarder = DdForwarder(checks_logger, self.agentConfig) self._ec2 = EC2(checks_logger) # Metric Checks self._metrics_checks = [ ElasticSearch(checks_logger), WMICheck(checks_logger), Memcache(checks_logger), ] # Custom metric checks for module_spec in [s.strip() for s in self.agentConfig.get('custom_checks', '').split(',')]: if len(module_spec) == 0: continue try: self._metrics_checks.append(modules.load(module_spec, 'Check')(checks_logger)) logger.info("Registered custom check %s" % module_spec) except Exception, e: logger.exception('Unable to load custom check module %s' % module_spec)
def test_service_perfdata(self):
    """A Nagios SERVICEPERFDATA line must yield one gauge point per
    perfdata item, named nagios.<service_desc>.<label>."""
    from checks.datadog import NagiosServicePerfData

    self._write_nagios_config([
        "service_perfdata_file=%s" % self.log_file.name,
        "service_perfdata_file_template=DATATYPE::SERVICEPERFDATA\tTIMET::$TIMET$\tHOSTNAME::$HOSTNAME$\tSERVICEDESC::$SERVICEDESC$\tSERVICEPERFDATA::$SERVICEPERFDATA$\tSERVICECHECKCOMMAND::$SERVICECHECKCOMMAND$\tHOSTSTATE::$HOSTSTATE$\tHOSTSTATETYPE::$HOSTSTATETYPE$\tSERVICESTATE::$SERVICESTATE$\tSERVICESTATETYPE::$SERVICESTATETYPE$",
    ])

    dogstream = Dogstreams.init(self.logger, self.agent_config)
    # Exactly one stream class should be configured: the service perfdata one.
    self.assertEquals([NagiosServicePerfData],
                      [d.__class__ for d in dogstream.dogstreams])

    perf_items = " ".join([
        "time=0.06",
        "db0=33;180;190;0;200",
        "db1=1;150;190;0;200",
        "db2=0;120;290;1;200",
        "db3=0;110;195;5;100"
    ])
    log_data = [
        (
            "DATATYPE::SERVICEPERFDATA",
            "TIMET::1000000000",
            "HOSTNAME::myhost0",
            "SERVICEDESC::Pgsql Backends",
            "SERVICEPERFDATA::" + perf_items,
            "SERVICECHECKCOMMAND::check_nrpe_1arg!check_postgres_backends",
            "HOSTSTATE::UP",
            "HOSTSTATETYPE::HARD",
            "SERVICESTATE::OK",
            "SERVICESTATETYPE::HARD",
        ),
    ]

    # Expected points: 'time' has no thresholds; each dbN carries
    # warn/crit/min/max attributes.
    expected_output = [
        ('nagios.pgsql_backends.time', 1000000000, 0.06, {
            'metric_type': 'gauge',
            'host_name': 'myhost0',
        }),
    ]
    threshold_cases = [
        ('db0', 33., '180', '190', '0', '200'),
        ('db1', 1., '150', '190', '0', '200'),
        ('db2', 0., '120', '290', '1', '200'),
        ('db3', 0., '110', '195', '5', '100'),
    ]
    for label, value, warn, crit, lo, hi in threshold_cases:
        expected_output.append(
            ('nagios.pgsql_backends.' + label, 1000000000, value, {
                'metric_type': 'gauge',
                'host_name': 'myhost0',
                'warn': warn,
                'crit': crit,
                'min': lo,
                'max': hi,
            }))
    expected_output.sort(key=point_sorter)

    self._write_log('\t'.join(fields) for fields in log_data)
    actual_output = dogstream.check(self.agent_config,
                                    move_end=False)['dogstream']
    actual_output.sort(key=point_sorter)
    self.assertEquals(expected_output, actual_output)
def __init__(self, agentConfig, emitters, systemStats, hostname):
    """Initialize the collector.

    agentConfig: dict of agent settings (mutated: 'system_stats' is added).
    emitters: callables that ship the collected payload.
    systemStats: output of config.get_system_stats, cached on the config.
    hostname: hostname this collector reports as.
    """
    self.emit_duration = None
    self.agentConfig = agentConfig
    self.hostname = hostname
    # system stats is generated by config.get_system_stats
    self.agentConfig['system_stats'] = systemStats
    # agent config is used during checks, system_stats can be accessed through the config
    self.os = get_os()
    self.plugins = None
    self.emitters = emitters
    self.check_timings = agentConfig.get('check_timings')
    # Scheduling state for periodic payloads: each entry tracks its last
    # start time and run interval (seconds).
    self.push_times = {
        'host_metadata': {
            'start': time.time(),
            'interval': int(agentConfig.get('metadata_interval', 4 * 60 * 60))
        },
        'external_host_tags': {
            'start': time.time() - 3 * 60,  # Wait for the checks to init
            'interval': int(agentConfig.get('external_host_tags', 5 * 60))
        },
        'agent_checks': {
            'start': time.time(),
            'interval': int(agentConfig.get('agent_checks_interval', 10 * 60))
        },
        'processes': {
            'start': time.time(),
            'interval': int(agentConfig.get('processes_interval', 60))
        }
    }
    # Global socket timeout so a hung endpoint cannot block a run forever.
    socket.setdefaulttimeout(15)
    self.run_count = 0
    self.continue_running = True
    self.hostname_metadata_cache = None
    self.initialized_checks_d = []
    self.init_failed_checks_d = {}

    # Point psutil at a non-default procfs mount (e.g. inside containers)
    # when configured.
    if Platform.is_linux() and psutil is not None:
        procfs_path = agentConfig.get('procfs_path', '/proc').rstrip('/')
        psutil.PROCFS_PATH = procfs_path

    # Unix System Checks
    self._unix_system_checks = {
        'io': u.IO(log),
        'load': u.Load(log),
        'memory': u.Memory(log),
        'processes': u.Processes(log),
        'cpu': u.Cpu(log),
        'system': u.System(log)
    }

    # Win32 System Checks
    self._win32_system_checks = {
        'io': w32.IO(log),
        'proc': w32.Processes(log),
        'memory': w32.Memory(log),
        'network': w32.Network(log),
        'cpu': w32.Cpu(log),
        'system': w32.System(log)
    }

    # Old-style metric checks; only instantiated when configured.
    self._ganglia = Ganglia(log) if self.agentConfig.get('ganglia_host', '') != '' else None
    self._dogstream = None if self.agentConfig.get('dogstreams') is None else Dogstreams.init(log, self.agentConfig)

    # Agent performance metrics check (picked up later from checks.d)
    self._agent_metrics = None

    self._metrics_checks = []

    # Custom metric checks: 'custom_checks' is a comma-separated list of
    # module specs; an unloadable module is logged and skipped.
    for module_spec in [s.strip() for s in self.agentConfig.get('custom_checks', '').split(',')]:
        if len(module_spec) == 0:
            continue
        try:
            self._metrics_checks.append(modules.load(module_spec, 'Check')(log))
            log.info("Registered custom check %s" % module_spec)
            # NOTE(review): this deprecation warning fires on every *successful*
            # registration — presumably intentional, to nudge users to checks.d.
            log.warning("Old format custom checks are deprecated. They should be moved to the checks.d interface as old custom checks will be removed in a next version")
        except Exception:
            log.exception('Unable to load custom check module %s' % module_spec)
def test_service_perfdata_special_cases(self):
    """Perfdata labels that are device paths (with an MB unit suffix) are
    parsed into a single metric carrying device_name/unit attributes."""
    from checks.datadog import NagiosServicePerfData

    self._write_nagios_config([
        "service_perfdata_file=%s" % self.log_file.name,
        "service_perfdata_file_template=DATATYPE::SERVICEPERFDATA\tTIMET::$TIMET$\tHOSTNAME::$HOSTNAME$\tSERVICEDESC::$SERVICEDESC$\tSERVICEPERFDATA::$SERVICEPERFDATA$\tSERVICECHECKCOMMAND::$SERVICECHECKCOMMAND$\tHOSTSTATE::$HOSTSTATE$\tHOSTSTATETYPE::$HOSTSTATETYPE$\tSERVICESTATE::$SERVICESTATE$\tSERVICESTATETYPE::$SERVICESTATETYPE$",
    ])

    dogstream = Dogstreams.init(self.logger, self.agent_config)
    # Exactly one stream class should be configured: the service perfdata one.
    self.assertEquals([NagiosServicePerfData],
                      [d.__class__ for d in dogstream.dogstreams])

    perf_items = " ".join([
        "/=5477MB;6450;7256;0;8063",
        "/dev=0MB;2970;3341;0;3713",
        "/dev/shm=0MB;3080;3465;0;3851",
        "/var/run=0MB;3080;3465;0;3851",
        "/var/lock=0MB;3080;3465;0;3851",
        "/lib/init/rw=0MB;3080;3465;0;3851",
        "/mnt=290MB;338636;380966;0;423296",
        "/data=39812MB;40940;46057;0;51175",
    ])
    log_data = [(
        "DATATYPE::SERVICEPERFDATA",
        "TIMET::1000000000",
        "HOSTNAME::myhost2",
        "SERVICEDESC::Disk Space",
        "SERVICEPERFDATA::" + perf_items,
        "SERVICECHECKCOMMAND::check_all_disks!20%!10%",
        "HOSTSTATE::UP",
        "HOSTSTATETYPE::HARD",
        "SERVICESTATE::OK",
        "SERVICESTATETYPE::HARD",
    )]

    # All points share one metric name; the mountpoint becomes device_name
    # and the MB suffix becomes the unit attribute.
    device_cases = [
        ('/', 5477., '6450', '7256', '0', '8063'),
        ('/dev', 0., '2970', '3341', '0', '3713'),
        ('/dev/shm', 0., '3080', '3465', '0', '3851'),
        ('/var/run', 0., '3080', '3465', '0', '3851'),
        ('/var/lock', 0., '3080', '3465', '0', '3851'),
        ('/lib/init/rw', 0., '3080', '3465', '0', '3851'),
        ('/mnt', 290., '338636', '380966', '0', '423296'),
        ('/data', 39812., '40940', '46057', '0', '51175'),
    ]
    expected_output = []
    for device, value, warn, crit, lo, hi in device_cases:
        expected_output.append(('nagios.disk_space', 1000000000, value, {
            'metric_type': 'gauge',
            'host_name': 'myhost2',
            'device_name': device,
            'unit': 'MB',
            'warn': warn,
            'crit': crit,
            'min': lo,
            'max': hi,
        }))
    expected_output.sort(key=point_sorter)

    self._write_log('\t'.join(fields) for fields in log_data)
    actual_output = dogstream.check(self.agent_config,
                                    move_end=False)['dogstream']
    actual_output.sort(key=point_sorter)
    self.assertEquals(expected_output, actual_output)
def __init__(self, agentConfig, emitters, systemStats, hostname):
    """Set up the collector: system checks, dogstreams, and custom checks.

    Args:
        agentConfig: dict of agent settings; mutated here to carry
            'system_stats' so checks can read it through the config.
        emitters: emitter callables used to ship collected payloads.
        systemStats: output of config.get_system_stats().
        hostname: resolved hostname the agent reports under.
    """
    self.emit_duration = None
    self.agentConfig = agentConfig
    self.hostname = hostname
    # system stats is generated by config.get_system_stats
    self.agentConfig['system_stats'] = systemStats
    # agent config is used during checks, system_stats can be accessed through the config
    self.os = get_os()
    self.plugins = None
    self.emitters = emitters
    self.check_timings = agentConfig.get('check_timings')

    # Schedule of periodic payloads: 'start' is the reference time of the
    # last (or pending first) run, 'interval' the period in seconds.
    self.push_times = {
        'host_metadata': {
            'start': time.time(),
            'interval': int(agentConfig.get('metadata_interval', 4 * 60 * 60))
        },
        'external_host_tags': {
            'start': time.time() - 3 * 60,  # Wait for the checks to init
            # NOTE(review): interval is read from the 'external_host_tags'
            # key rather than an '*_interval' key -- confirm intentional.
            'interval': int(agentConfig.get('external_host_tags', 5 * 60))
        },
        'agent_checks': {
            'start': time.time(),
            'interval': int(agentConfig.get('agent_checks_interval', 10 * 60))
        },
        'processes': {
            'start': time.time(),
            'interval': int(agentConfig.get('processes_interval', 60))
        }
    }
    socket.setdefaulttimeout(15)
    self.run_count = 0
    self.continue_running = True
    self.hostname_metadata_cache = None
    self.initialized_checks_d = []
    self.init_failed_checks_d = {}

    # Unix System Checks
    self._unix_system_checks = {
        'io': u.IO(log),
        'load': u.Load(log),
        'memory': u.Memory(log),
        'processes': u.Processes(log),
        'cpu': u.Cpu(log),
        'system': u.System(log)
    }

    # Win32 System Checks
    self._win32_system_checks = {
        'io': w32.IO(log),
        'proc': w32.Processes(log),
        'memory': w32.Memory(log),
        'network': w32.Network(log),
        'cpu': w32.Cpu(log),
        'system': w32.System(log)
    }

    # Old-style metric checks
    self._ganglia = Ganglia(log)
    self._dogstream = Dogstreams.init(log, self.agentConfig)
    self._ddforwarder = DdForwarder(log, self.agentConfig)

    # Agent performance metrics check
    self._agent_metrics = None

    self._metrics_checks = []

    # Custom metric checks (old format, deprecated in favor of checks.d)
    for module_spec in [s.strip() for s in self.agentConfig.get('custom_checks', '').split(',')]:
        if not module_spec:
            continue
        try:
            self._metrics_checks.append(modules.load(module_spec, 'Check')(log))
            # Lazy %-args: the logger formats only if the record is emitted
            log.info("Registered custom check %s", module_spec)
            log.warning("Old format custom checks are deprecated. They should be moved to the checks.d interface as old custom checks will be removed in a next version")
        except Exception:
            log.exception('Unable to load custom check module %s', module_spec)
def __init__(self, agentConfig, emitters):
    """Legacy collector init (Python 2 era): wires up loggers, built-in
    system checks for Unix and Win32, old-style service checks, and any
    custom checks named in the 'custom_checks' config entry.

    Args:
        agentConfig: dict of agent settings.
        emitters: emitter callables used to ship collected payloads.
    """
    self.agentConfig = agentConfig
    self.os = getOS()
    self.plugins = None
    self.emitters = emitters
    self.checksLogger = logging.getLogger('checks')
    # Host metadata is re-sent every metadata_interval seconds (default 10 min)
    self.metadata_interval = int(agentConfig.get('metadata_interval', 10 * 60))
    self.metadata_start = time.time()
    # Global socket timeout so no check can hang the whole collector
    socket.setdefaulttimeout(15)

    # Unix System Checks
    # NOTE(review): some checks take the logger, others are built bare
    # (IO, Processes, Cpu) -- presumably their constructors differ; verify.
    self._unix_system_checks = {
        'disk': u.Disk(self.checksLogger),
        'io': u.IO(),
        'load': u.Load(self.checksLogger),
        'memory': u.Memory(self.checksLogger),
        'network': u.Network(self.checksLogger),
        'processes': u.Processes(),
        'cpu': u.Cpu()
    }

    # Win32 System Checks
    self._win32_system_checks = {
        'disk': w32.Disk(self.checksLogger),
        'io': w32.IO(self.checksLogger),
        'proc': w32.Processes(self.checksLogger),
        'memory': w32.Memory(self.checksLogger),
        'network': w32.Network(self.checksLogger),
        'cpu': w32.Cpu(self.checksLogger)
    }

    # Old-style metric checks
    self._apache = Apache(self.checksLogger)
    self._couchdb = CouchDb(self.checksLogger)
    self._mongodb = MongoDb(self.checksLogger)
    self._mysql = MySql(self.checksLogger)
    self._rabbitmq = RabbitMq()
    self._ganglia = Ganglia(self.checksLogger)
    self._cassandra = Cassandra()
    self._dogstream = Dogstreams.init(self.checksLogger, self.agentConfig)
    self._ddforwarder = DdForwarder(self.checksLogger, self.agentConfig)
    self._ec2 = EC2(self.checksLogger)

    # Metric Checks
    self._metrics_checks = [
        Varnish(self.checksLogger),
        ElasticSearch(self.checksLogger),
        Jvm(self.checksLogger),
        Tomcat(self.checksLogger),
        ActiveMQ(self.checksLogger),
        Solr(self.checksLogger),
        WMICheck(self.checksLogger),
        Nginx(self.checksLogger),
        Memcache(self.checksLogger),
    ]

    # Custom metric checks: 'custom_checks' is a comma-separated list of
    # module specs; blanks are skipped, load failures logged and ignored.
    for module_spec in [s.strip() for s in self.agentConfig.get('custom_checks', '').split(',')]:
        if len(module_spec) == 0:
            continue
        try:
            self._metrics_checks.append(modules.load(module_spec, 'Check')(self.checksLogger))
            self.checksLogger.info("Registered custom check %s" % module_spec)
        # Python 2-only except syntax; bound 'e' is unused
        except Exception, e:
            self.checksLogger.exception('Unable to load custom check module %s' % module_spec)