def _postMetrics(self):
    """Flush accumulated metrics as a single transaction, then reset the buffer."""
    if not self._metrics:
        return
    # Stamp the payload with identifying metadata before shipping it.
    self._metrics['uuid'] = getUuid()
    self._metrics['internalHostname'] = gethostname(self._agentConfig)
    self._metrics['apiKey'] = self._agentConfig['api_key']
    MetricTransaction(self._metrics)
    self._metrics = {}
def init(config_path=None, use_watchdog=False, use_forwarder=False):
    """Build and return the dogstatsd (reporter, server) pair from configuration."""
    c = get_config(parse_args=False, cfg_path=config_path, init_logging=True)
    logger.debug("Configuration dogstatsd")

    port = c['dogstatsd_port']
    interval = int(c['dogstatsd_interval'])
    normalize = c['dogstatsd_normalize']
    api_key = c['api_key']

    # When running behind the forwarder, report to its local endpoint
    # instead of straight to the intake.
    target = c['dogstatsd_target'] if use_forwarder else c['dd_url']

    hostname = gethostname(c)

    # The aggregator is the shared state between the server thread (which
    # receives packets) and the reporting thread (which flushes upstream).
    normalization_factor = (1.0 / interval) if normalize else 1.0
    aggregator = MetricsAggregator(hostname, normalization_factor)

    # Reporting thread.
    reporter = Reporter(interval, aggregator, target, api_key, use_watchdog)

    # Server bound on every interface.
    server_host = ''
    server = Server(aggregator, server_host, port)

    return reporter, server
def check(self, logger, agentConfig):
    """Collect the process table via `ps auxww`.

    Returns a payload dict with the parsed process list, the API key and
    the host name, or False if `ps` could not be run.
    """
    logger.debug('getProcesses: start')

    # Get output from ps
    try:
        ps = subprocess.Popen(['ps', 'auxww'], stdout=subprocess.PIPE,
                              close_fds=True).communicate()[0]
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # are no longer swallowed; `ps` failures still degrade gracefully
        # to a False return.
        logger.exception('getProcesses')
        return False

    # Split out each process
    processLines = ps.split('\n')
    del processLines[0]  # Removes the headers
    processLines.pop()   # Removes a trailing empty line

    processes = []
    logger.debug('getProcesses: Popen success, parsing, looping')
    for line in processLines:
        # Split into at most 11 columns so the command line (which may
        # contain spaces) stays in one field.
        line = line.split(None, 10)
        processes.append(map(lambda s: s.strip(), line))

    logger.debug('getProcesses: completed, returning')
    return {'processes': processes,
            'apiKey': agentConfig['api_key'],
            'host': gethostname(agentConfig)}
def _process_data(self, data, tags=None):
    """Walk the nodes in an ES stats payload and map metrics for the node
    that corresponds to this host.

    data -- decoded JSON from the ES nodes endpoint (has a 'nodes' dict)
    tags -- optional tag list forwarded to each emitted metric
    """
    for node in data['nodes']:
        node_data = data['nodes'][node]

        def process_metric(metric, xtype, path, xform=None):
            # closure over node_data
            self._process_metric(node_data, metric, path, xform, tags=tags)

        if 'hostname' in node_data:
            # For ES >= 0.19: match the node's reported hostname against
            # any name this host answers to.
            hostnames = (
                gethostname(self.agentConfig).decode('utf-8'),
                socket.gethostname().decode('utf-8'),
                socket.getfqdn().decode('utf-8')
            )
            if node_data['hostname'].decode('utf-8') in hostnames:
                self._map_metric(process_metric)
        else:
            # ES < 0.19
            # Fetch interface address from ifconfig or ip addr and check
            # against the primary IP from ES
            try:
                base_url = self._base_es_url(self.agentConfig['elasticsearch'])
                url = "%s%s" % (base_url, NODES_URL)
                primary_addr = self._get_primary_addr(self.agentConfig, url, node)
            except NodeNotFound:
                # Skip any nodes that aren't found
                continue
            if self._host_matches_node(primary_addr):
                self._map_metric(process_metric)
def main(config_path=None):
    """Start dogstatsd: spin up the reporting thread and the UDP server.

    server.start() blocks; the trailing log line runs at shutdown.
    """
    c = get_config(parse_args=False, cfg_path=config_path, init_logging=True)

    port = c["dogstatsd_port"]
    target = c["dogstatsd_target"]
    interval = c["dogstatsd_interval"]
    api_key = c["api_key"]
    hostname = gethostname(c)
    rollup_interval = 10

    # NOTE: a dead local (host = "localhost") was removed; the server
    # below binds on all interfaces via server_host = "".

    # Create the aggregator (which is the point of communication between the
    # server and reporting threads.
    aggregator = MetricsAggregator(hostname, rollup_interval)

    # Start the reporting thread.
    reporter = Reporter(interval, aggregator, target, api_key)
    reporter.start()

    # Start the server.
    server_host = ""
    server = Server(aggregator, server_host, port)
    server.start()

    # If we're here, we're done.
    logger.info("Shutting down ...")
def _create_event(self, status):
    """Build a Datadog event payload for an ElasticSearch cluster health
    `status` ("red" -> error, "yellow" -> warning, anything else treated
    as green/recovered -> info)."""
    hostname = gethostname(self.agentConfig).decode('utf-8')
    if status == "red":
        alert_type = "error"
        msg_title = "%s is %s" % (hostname, status)
    elif status == "yellow":
        alert_type = "warning"
        msg_title = "%s is %s" % (hostname, status)
    else:
        # then it should be green
        alert_type = "info"
        msg_title = "%s recovered as %s" % (hostname, status)

    msg = "ElasticSearch: %s just reported as %s" % (hostname, status)

    # NOTE(review): time.mktime interprets its argument as *local* time,
    # but the tuple comes from datetime.utcnow(); on non-UTC hosts the
    # timestamp is offset by the zone delta — confirm this is intended.
    return {
        'timestamp': int(time.mktime(datetime.utcnow().timetuple())),
        'event_type': 'elasticsearch',
        'host': hostname,
        'api_key': self.agentConfig['api_key'],
        'msg_text': msg,
        'msg_title': msg_title,
        "alert_type": alert_type,
        "source_type_name": "elasticsearch",
        "event_object": hostname
    }
def check(self, logger, agentConfig):
    """Collect new Hudson build results as a list of event dicts.

    Returns False when `hudson_home` is not configured. On the very first
    invocation the method recurses once to prime the high-watermark state
    (see below) so that pre-existing builds are not reported as events.
    """
    if self.high_watermarks is None:
        # On the first run of check(), prime the high_watermarks dict
        # so that we only send events that occured after the agent
        # started.
        # (Setting high_watermarks in the next statement prevents
        # any kind of infinite loop (assuming nothing ever sets
        # high_watermarks to None again!))
        self.high_watermarks = defaultdict(lambda: 0)
        # The recursive call's result is deliberately discarded: that
        # first pass only advances the watermarks past existing builds.
        # The scan below then reports only builds newer than those marks.
        self.check(logger, agentConfig)

    hudson_home = agentConfig.get('hudson_home', None)
    if not hudson_home:
        return False

    job_dirs = glob(os.path.join(hudson_home, 'jobs', '*'))

    build_events = []

    for job_dir in job_dirs:
        for output in self._get_build_results(logger, job_dir):
            output['api_key'] = agentConfig['api_key']
            output['host'] = gethostname(agentConfig)
            build_events.append(output)

    return build_events
def init(config_path=None, use_watchdog=False, use_forwarder=False):
    """Build the dogstatsd (reporter, server) pair from configuration.

    When `use_forwarder` is set, metrics go to the local forwarder
    endpoint instead of directly to the intake.
    """
    c = get_config(parse_args=False, cfg_path=config_path)
    log.debug("Configuration dogstatsd")

    port = c["dogstatsd_port"]
    interval = int(c["dogstatsd_interval"])
    api_key = c["api_key"]

    # NOTE: an unused read of c["dogstatsd_normalize"] was removed; the
    # aggregator below is constructed with the raw interval.
    target = c["dd_url"]
    if use_forwarder:
        target = c["dogstatsd_target"]

    hostname = gethostname(c)

    # Create the aggregator (the point of communication between the
    # server and reporting threads).
    assert 0 < interval
    aggregator = MetricsAggregator(hostname, interval)

    # Start the reporting thread.
    reporter = Reporter(interval, aggregator, target, api_key, use_watchdog)

    # Start the server.
    server_host = ""
    server = Server(aggregator, server_host, port)

    return reporter, server
def init(config_path=None, use_watchdog=False, use_forwarder=False):
    """Construct the dogstatsd reporter and server from configuration."""
    c = get_config(parse_args=False, cfg_path=config_path)
    log.debug("Configuration dogstatsd")

    port = c['dogstatsd_port']
    interval = int(c['dogstatsd_interval'])
    normalize = c['dogstatsd_normalize']  # NOTE(review): read but unused here
    api_key = c['api_key']
    non_local_traffic = c['non_local_traffic']

    # Behind the forwarder, report to its local endpoint.
    target = c['dogstatsd_target'] if use_forwarder else c['dd_url']

    hostname = gethostname(c)

    # The aggregator is the shared state between the server thread and
    # the reporting thread.
    assert 0 < interval
    aggregator = MetricsAggregator(hostname, interval)

    # Reporting thread.
    reporter = Reporter(interval, aggregator, target, api_key, use_watchdog)

    # Server on an IPv4 stack: loopback by default, every address when
    # non_local_traffic is enabled.
    server_host = '' if non_local_traffic else '127.0.0.1'
    server = Server(aggregator, server_host, port)

    return reporter, server
def _postMetrics(self):
    """Ship any buffered metrics in a transaction and clear the buffer."""
    if not self._metrics:
        return
    # Tag the payload with identity metadata before handing it off.
    self._metrics["uuid"] = getUuid()
    self._metrics["internalHostname"] = gethostname(self._agentConfig)
    self._metrics["apiKey"] = self._agentConfig["api_key"]
    MetricTransaction(self._metrics, {})
    self._metrics = {}
def _postMetrics(self):
    """Flush buffered metrics as a single transaction and reset."""
    if self._metrics:
        # Attach the identifying metadata expected by the intake.
        self._metrics['uuid'] = get_uuid()
        self._metrics['internalHostname'] = gethostname(self._agentConfig)
        self._metrics['apiKey'] = self._agentConfig['api_key']
        MetricTransaction(self._metrics, {})
        self._metrics = {}
def run(self):
    """Configure the tornado application, bind the HTTP (and optional
    Graphite) listeners, and run the I/O loop until stopped."""
    handlers = [
        (r"/intake/?", AgentInputHandler),
        (r"/api/v1/series/?", ApiInputHandler),
        (r"/status/?", StatusHandler),
    ]

    settings = dict(
        cookie_secret="12oETzKXQAGaYdkL5gEmGeJJFuYh7EQnp2XdTP1o/Vo=",
        xsrf_cookies=False,
        debug=False,
    )

    non_local_traffic = self._agentConfig.get("non_local_traffic", False)

    tornado.web.Application.__init__(self, handlers, **settings)
    http_server = tornado.httpserver.HTTPServer(self)

    # non_local_traffic must be == True to match, not just some non-false value
    if non_local_traffic is True:
        http_server.listen(self._port)
    else:
        # localhost in lieu of 127.0.0.1 to support IPv6
        http_server.listen(self._port, address="localhost")

    logging.info("Listening on port %d" % self._port)

    # Register callbacks
    self.mloop = tornado.ioloop.IOLoop.instance()

    def flush_trs():
        # Periodic housekeeping: feed the watchdog, post internal
        # metrics, flush queued transactions.
        if self._watchdog:
            self._watchdog.reset()
        self._postMetrics()
        self._tr_manager.flush()

    tr_sched = tornado.ioloop.PeriodicCallback(flush_trs, TRANSACTION_FLUSH_INTERVAL,
                                               io_loop=self.mloop)

    # Register optional Graphite listener
    gport = self._agentConfig.get("graphite_listen_port", None)
    if gport is not None:
        logging.info("Starting graphite listener on port %s" % gport)
        from graphite import GraphiteServer
        gs = GraphiteServer(self, gethostname(self._agentConfig), io_loop=self.mloop)
        if non_local_traffic is True:
            gs.listen(gport)
        else:
            # BUG FIX: this previously read `gs.listen(port, ...)`, but no
            # local `port` exists in this scope — it raised NameError
            # whenever a graphite port was configured without
            # non_local_traffic. The graphite port is `gport`.
            gs.listen(gport, address="localhost")

    # Start everything
    if self._watchdog:
        self._watchdog.reset()
    tr_sched.start()

    self.mloop.start()
    logging.info("Stopped")
def run(self):
    """Set up the tornado application, start the HTTP (and optional
    Graphite) listeners, and block on the I/O loop until it stops."""
    handlers = [
        (r"/intake/?", AgentInputHandler),
        (r"/api/v1/series/?", ApiInputHandler),
        (r"/status/?", StatusHandler),
    ]

    settings = dict(
        cookie_secret="12oETzKXQAGaYdkL5gEmGeJJFuYh7EQnp2XdTP1o/Vo=",
        xsrf_cookies=False,
        # NOTE(review): tornado debug mode (autoreload, tracebacks) is
        # enabled here — confirm this is intended outside development.
        debug=True,
    )

    tornado.web.Application.__init__(self, handlers, **settings)
    http_server = tornado.httpserver.HTTPServer(self)
    # Binds on all interfaces.
    http_server.listen(self._port)
    logging.info("Listening on port %d" % self._port)

    # Register callbacks
    self.mloop = tornado.ioloop.IOLoop.instance()

    def flush_trs():
        # Periodic housekeeping: feed the watchdog, post internal
        # metrics and flush pending transactions.
        if self._watchdog:
            self._watchdog.reset()
        self._postMetrics()
        self._tr_manager.flush()

    tr_sched = tornado.ioloop.PeriodicCallback(flush_trs, TRANSACTION_FLUSH_INTERVAL, io_loop=self.mloop)

    # Register optional Graphite listener
    gport = self._agentConfig.get("graphite_listen_port", None)
    if gport is not None:
        logging.info("Starting graphite listener on port %s" % gport)
        from graphite import GraphiteServer
        gs = GraphiteServer(self, gethostname(self._agentConfig), io_loop=self.mloop)
        gs.listen(gport)

    # Start everything
    if self._watchdog:
        self._watchdog.reset()
    tr_sched.start()

    self.mloop.start()
    logging.info("Stopped")
def _build_payload(self, start_event=True):
    """
    Return an dictionary that contains all of the generic payload data.
    """
    now = time.time()
    cfg = self.agentConfig
    payload = {
        "collection_timestamp": now,
        "os": self.os,
        "python": sys.version,
        "agentVersion": cfg["version"],
        "apiKey": cfg["api_key"],
        "events": {},
        "metrics": [],
        "resources": {},
        "internalHostname": gethostname(cfg),
        "uuid": get_uuid(),
    }

    # First postback: attach system stats and publish an agent-startup
    # event in the newsfeed.
    if start_event and self._is_first_run():
        payload["systemStats"] = cfg.get("system_stats", {})
        payload["events"]["System"] = [{
            "api_key": cfg["api_key"],
            "host": payload["internalHostname"],
            "timestamp": now,
            "event_type": "Agent Startup",
            "msg_text": "Version %s" % get_version(),
        }]

    # Periodically send (and cache) the host metadata.
    if self._is_first_run() or self._should_send_metadata():
        self.metadata_cache = self._get_metadata()
        payload["meta"] = self.metadata_cache

    # Static tags from the configuration file.
    if cfg["tags"] is not None:
        payload["tags"] = cfg["tags"]

    # Log the metadata on the first run
    if self._is_first_run():
        if cfg["tags"] is not None:
            log.info(u"Hostnames: %s, tags: %s"
                     % (repr(self.metadata_cache), cfg["tags"]))
        else:
            log.info(u"Hostnames: %s" % repr(self.metadata_cache))

    return payload
def run(self):
    """Configure the tornado application, start the HTTP (and optional
    Graphite) listeners, and block on the I/O loop."""
    handlers = [
        (r"/intake/?", AgentInputHandler),
        (r"/api/v1/series/?", ApiInputHandler),
        (r"/status/?", StatusHandler),
    ]

    settings = dict(
        cookie_secret="12oETzKXQAGaYdkL5gEmGeJJFuYh7EQnp2XdTP1o/Vo=",
        xsrf_cookies=False,
        # NOTE(review): tornado debug mode is enabled (autoreload and
        # in-browser tracebacks) — confirm this is intended in production.
        debug=True,
    )

    tornado.web.Application.__init__(self, handlers, **settings)
    http_server = tornado.httpserver.HTTPServer(self)
    # Binds on all interfaces.
    http_server.listen(self._port)
    logging.info("Listening on port %d" % self._port)

    # Register callbacks
    self.mloop = tornado.ioloop.IOLoop.instance()

    def flush_trs():
        # Periodic housekeeping: feed the watchdog, post internal
        # metrics and flush queued transactions.
        if self._watchdog:
            self._watchdog.reset()
        self._postMetrics()
        self._tr_manager.flush()

    tr_sched = tornado.ioloop.PeriodicCallback(flush_trs, TRANSACTION_FLUSH_INTERVAL, io_loop = self.mloop)

    # Register optional Graphite listener
    gport = self._agentConfig.get("graphite_listen_port", None)
    if gport is not None:
        logging.info("Starting graphite listener on port %s" % gport)
        from graphite import GraphiteServer
        gs = GraphiteServer(self, gethostname(self._agentConfig), io_loop=self.mloop)
        gs.listen(gport)

    # Start everything
    if self._watchdog:
        self._watchdog.reset()
    tr_sched.start()

    # Blocks here until the loop is stopped from elsewhere.
    self.mloop.start()
def _build_payload(self):
    """
    Return an dictionary that contains all of the generic payload data.
    """
    payload = {
        'collection_timestamp': time.time(),
        'os': self.os,
        'python': sys.version,
        'agentVersion': self.agentConfig['version'],
        'apiKey': self.agentConfig['api_key'],
        'events': {},
        'metrics': [],
        'resources': {},
        'internalHostname': gethostname(self.agentConfig),
        'uuid': get_uuid(),
    }

    # First postback: attach system stats and an "Agent Startup" event.
    if self._is_first_run():
        payload['systemStats'] = self.agentConfig.get('systemStats', {})
        payload['events']['System'] = [{
            'api_key': self.agentConfig['api_key'],
            'host': payload['internalHostname'],
            'timestamp': int(time.mktime(datetime.datetime.now().timetuple())),
            'event_type': 'Agent Startup',
            'msg_text': 'Version %s' % get_version()
        }]

    # Periodically send the host metadata.
    if self._is_first_run() or self._should_send_metadata():
        payload['meta'] = self._get_metadata()

    # Add static tags from the configuration file
    if self.agentConfig['tags'] is not None:
        payload['tags'] = self.agentConfig['tags']

    return payload
def check(self, agentConfig):
    """Collect the `ps auxww` process table.

    Returns a dict with the parsed rows, the API key and the host name,
    or False when `ps` cannot be run.
    """
    try:
        ps_output = subprocess.Popen(["ps", "auxww"], stdout=subprocess.PIPE,
                                     close_fds=True).communicate()[0]
    except StandardError:
        self.logger.exception("getProcesses")
        return False

    lines = ps_output.split("\n")
    del lines[0]  # header row
    lines.pop()   # trailing empty line

    processes = []
    for raw in lines:
        columns = raw.split(None, 10)  # keep the command line in one field
        processes.append(map(lambda s: s.strip(), columns))

    return {"processes": processes, "apiKey": agentConfig["api_key"], "host": gethostname(agentConfig)}
def _fetch_rrd_meta(self, agentConfig, whitelist):
    '''
    Return a list of list of dicts with host_name, host_desc, device_name, and rrd_path
    '''
    def _in_whitelist(rrd):
        # Match against whitelist patterns with Cacti's <path_rra>
        # prefix stripped off.
        path = rrd.replace('<path_rra>/', '')
        for p in whitelist:
            if fnmatch(path, p):
                return True
        return False

    # Pull every data source with a non-empty RRD path from the Cacti DB.
    c = self.db.cursor()
    c.execute("""
        SELECT h.hostname as host_name, dl.snmp_index as device_name,
               dt.data_source_path as rrd_path
        FROM data_local dl
            JOIN host h on dl.host_id = h.id
            JOIN data_template_data dt on dt.local_data_id = dl.id
        WHERE dt.data_source_path IS NOT NULL
            AND dt.data_source_path != ''
    """)

    res = []
    for host_name, device_name, rrd_path in c.fetchall():
        # An empty whitelist means "accept everything".
        if not whitelist or _in_whitelist(rrd_path):
            # Report local hosts under this agent's own hostname.
            if host_name in ('localhost', '127.0.0.1'):
                host_name = gethostname(agentConfig)
            res.append({
                'host_name': host_name,
                'device_name': device_name or None,
                'rrd_path': rrd_path.replace('<path_rra>', self.rrd_path)
            })

    # Collect stats
    self._add_stat('cacti.rrd.count', len(res), agentConfig)
    num_hosts = len(set([r['host_name'] for r in res]))
    self._add_stat('cacti.hosts.count', num_hosts, agentConfig)

    return res
def _build_payload(self, start_event=True):
    """
    Return an dictionary that contains all of the generic payload data.
    """
    config = self.agentConfig
    payload = {
        'collection_timestamp': time.time(),
        'os': self.os,
        'python': sys.version,
        'agentVersion': config['version'],
        'apiKey': config['api_key'],
        'events': {},
        'metrics': [],
        'resources': {},
        'internalHostname': gethostname(config),
        'uuid': get_uuid(),
    }

    # On the very first postback (when requested) attach the startup
    # system stats and announce the agent start in the event feed.
    if start_event and self._is_first_run():
        payload['systemStats'] = config.get('system_stats', {})
        payload['events']['System'] = [{
            'api_key': config['api_key'],
            'host': payload['internalHostname'],
            'timestamp': int(time.mktime(datetime.datetime.now().timetuple())),
            'event_type': 'Agent Startup',
            'msg_text': 'Version %s' % get_version()
        }]

    # Periodically send the host metadata, keeping a cached copy.
    if self._is_first_run() or self._should_send_metadata():
        self.metadata_cache = self._get_metadata()
        payload['meta'] = self.metadata_cache

    # Add static tags from the configuration file
    if config['tags'] is not None:
        payload['tags'] = config['tags']

    return payload
def check(self, agentConfig):
    """Return the parsed output of `ps auxww` plus identifying metadata,
    or False when the subprocess cannot be started."""
    # Get output from ps
    try:
        output = subprocess.Popen(['ps', 'auxww'],
                                  stdout=subprocess.PIPE,
                                  close_fds=True).communicate()[0]
    except StandardError:
        self.logger.exception('getProcesses')
        return False

    # Drop the header row and the trailing empty line.
    rows = output.split('\n')
    del rows[0]
    rows.pop()

    # Split each row into at most 11 columns so the command line (which
    # may contain spaces) stays in one field.
    processes = [map(lambda field: field.strip(), row.split(None, 10))
                 for row in rows]

    return {
        'processes': processes,
        'apiKey': agentConfig['api_key'],
        'host': gethostname(agentConfig)
    }
def testCheck(self):
    """End-to-end haproxy check: run it twice against a live stats
    endpoint, assert on the emitted metrics, then push modified CSV
    through the processor to assert a status-change event is produced."""
    config = {
        'instances': [{
            'url': 'http://localhost:3834/stats',
            'username': '******',
            'password': '******'
        }]
    }
    self.start_server(HAPROXY_CFG, config)

    # Run the check against our running server
    self.check.check(config['instances'][0])
    # Sleep for 1 second so the rate interval >=1
    time.sleep(1)
    # Run the check again so we get the rates
    self.check.check(config['instances'][0])

    # Metric assertions
    metrics = self.check.get_metrics()
    assert metrics
    self.assertTrue(type(metrics) == type([]))
    self.assertTrue(len(metrics) > 0)

    # Two backend byte rates and one frontend session gauge expected.
    self.assertEquals(len([t for t in metrics
                           if t[0] == "haproxy.backend.bytes.in_rate"]), 2, metrics)
    self.assertEquals(len([t for t in metrics
                           if t[0] == "haproxy.frontend.session.current"]), 1, metrics)

    # Flipping OPEN -> DOWN in the raw CSV should surface exactly one event.
    inst = config['instances'][0]
    data = self.check._fetch_data(inst['url'], inst['username'], inst['password'])
    new_data = [l.replace("OPEN", "DOWN") for l in data]
    self.check._process_data(new_data, gethostname(self.agentConfig),
                             event_cb=self.check._process_events)
    assert self.check.has_events()
    assert len(self.check.get_events()) == 1
def _add_stat(self, name, value, agentConfig):
    """Record an internal Cacti-check stat as (name, timestamp, value, device)."""
    device = {'host_name': gethostname(agentConfig)}
    self.stats.append((name, time.time(), value, device))
def doChecks(self, firstRun=False, systemStats=False):
    """Actual work

    Run every system/application check once, assemble the results into a
    single `checksData` payload dict, and hand it to each registered
    emitter.

    firstRun    -- when True, `systemStats` is attached to the payload.
    systemStats -- host stats gathered at startup (only used on first run).
    """
    self.checksLogger.info("Starting checks")

    # Run each collector up front; individual checks signal "nothing to
    # report" with a falsy/False return.
    apacheStatus = self.getApacheStatus()
    diskUsage = self.getDiskUsage()
    loadAvrgs = self.getLoadAvrgs()
    memory = self.getMemoryUsage()
    mysqlStatus = self.getMySQLStatus()
    pgsqlStatus = self.getPgSQLStatus()
    networkTraffic = self.getNetworkTraffic()
    nginxStatus = self.getNginxStatus()
    processes = self.getProcesses()
    rabbitmq = self.getRabbitMQStatus()
    mongodb = self.getMongoDBStatus()
    couchdb = self.getCouchDBStatus()
    ioStats = self.getIOStats()
    cpuStats = self.getCPUStats()
    gangliaData = self.getGangliaData()
    cassandraData = self.getCassandraData()
    jvmData = self.getJvmData()
    tomcatData = self.getTomcatData()
    activeMQData = self.getActiveMQData()
    solrData = self.getSolrData()
    memcacheData = self.getMemcacheData()
    dogstreamData = self.getDogstreamData()
    ddforwarderData = self.getDdforwarderData()

    # Core payload: system metrics plus agent identification.
    checksData = {
        'collection_timestamp': time.time(),
        'os' : self.os,
        'python': sys.version,
        'agentVersion' : self.agentConfig['version'],
        'loadAvrg1' : loadAvrgs['1'],
        'loadAvrg5' : loadAvrgs['5'],
        'loadAvrg15' : loadAvrgs['15'],
        'memPhysUsed' : memory.get('physUsed'),
        'memPhysFree' : memory.get('physFree'),
        'memPhysTotal' : memory.get('physTotal'),
        'memPhysUsable' : memory.get('physUsable'),
        'memSwapUsed' : memory.get('swapUsed'),
        'memSwapFree' : memory.get('swapFree'),
        'memSwapTotal' : memory.get('swapTotal'),
        'memCached' : memory.get('physCached'),
        'memBuffers': memory.get('physBuffers'),
        'memShared': memory.get('physShared'),
        'networkTraffic' : networkTraffic,
        'processes' : processes,
        'apiKey': self.agentConfig['apiKey'],
        'events': {},
        'resources': {},
    }

    # diskUsage is (usage, inodes) when the check succeeded.
    if diskUsage is not False and len(diskUsage) == 2:
        checksData["diskUsage"] = diskUsage[0]
        checksData["inodes"] = diskUsage[1]

    if cpuStats is not False and cpuStats is not None:
        checksData.update(cpuStats)

    if gangliaData is not False and gangliaData is not None:
        checksData['ganglia'] = gangliaData

    if cassandraData is not False and cassandraData is not None:
        checksData['cassandra'] = cassandraData

    # Apache Status
    if apacheStatus:
        checksData.update(apacheStatus)

    # MySQL Status
    if mysqlStatus:
        checksData.update(mysqlStatus)

    # PostgreSQL status
    if pgsqlStatus:
        checksData['postgresql'] = pgsqlStatus

    # Nginx Status
    if nginxStatus:
        checksData.update(nginxStatus)

    # RabbitMQ
    if rabbitmq:
        checksData['rabbitMQ'] = rabbitmq

    # MongoDB
    if mongodb:
        # Mongo events ride in the shared 'events' section, not under
        # the 'mongoDB' key.
        if mongodb.has_key('events'):
            checksData['events']['Mongo'] = mongodb['events']['Mongo']
            del mongodb['events']
        checksData['mongoDB'] = mongodb

    # CouchDB
    if couchdb:
        checksData['couchDB'] = couchdb

    if ioStats:
        checksData['ioStats'] = ioStats

    if jvmData:
        checksData['jvm'] = jvmData

    if tomcatData:
        checksData['tomcat'] = tomcatData

    if activeMQData:
        checksData['activemq'] = activeMQData

    if solrData:
        checksData['solr'] = solrData

    if memcacheData:
        checksData['memcache'] = memcacheData

    if dogstreamData:
        # Dogstream events are merged into the shared 'events' section;
        # the rest of the dogstream payload merges into the top level.
        dogstreamEvents = dogstreamData.get('dogstreamEvents', None)
        if dogstreamEvents:
            if 'dogstream' in checksData['events']:
                checksData['events']['dogstream'].extend(dogstreamEvents)
            else:
                checksData['events']['dogstream'] = dogstreamEvents
            del dogstreamData['dogstreamEvents']
        checksData.update(dogstreamData)

    if ddforwarderData:
        checksData['datadog'] = ddforwarderData

    # Include server indentifiers
    checksData['internalHostname'] = gethostname(self.agentConfig)
    checksData['uuid'] = getUuid()
    self.checksLogger.debug('doChecks: added uuid %s' % checksData['uuid'])

    # Process the event checks.
    for event_check in self._event_checks:
        event_data = event_check.check(self.checksLogger, self.agentConfig)
        if event_data:
            checksData['events'][event_check.key] = event_data

    # Include system stats on first postback
    if firstRun:
        checksData['systemStats'] = systemStats

    # Add static tags from the configuration file
    if self.agentConfig['tags'] is not None:
        checksData['tags'] = self.agentConfig['tags']

    # Also post an event in the newsfeed
    checksData['events']['System'] = [{'api_key': self.agentConfig['apiKey'],
                                       'host': checksData['internalHostname'],
                                       'timestamp': int(time.mktime(datetime.datetime.now().timetuple())),
                                       'event_type':'Agent Startup',
                                       'msg_text': 'Version %s' % get_version()
                                       }]

    # Collect metadata
    checksData['meta'] = self.get_metadata()

    # Resources checks
    has_resource = False
    for resources_check in self._resources_checks:
        resources_check.check()
        snaps = resources_check.pop_snapshots()
        if snaps:
            has_resource = True
            res_value = {
                'snaps': snaps,
                'format_version': resources_check.get_format_version()
            }
            res_format = resources_check.describe_format_if_needed()
            if res_format is not None:
                res_value['format_description'] = res_format
            checksData['resources'][resources_check.RESOURCE_KEY] = res_value

    if has_resource:
        # Resource payloads carry their own identification metadata.
        checksData['resources']['meta'] = {
            'api_key': self.agentConfig['apiKey'],
            'host': checksData['internalHostname'],
        }

    metrics = []
    for metrics_check in self._metrics_checks:
        res = metrics_check.check(self.agentConfig)
        if res:
            metrics.extend(res)
    checksData['metrics'] = metrics

    # Send back data
    self.checksLogger.debug("checksData: %s" % checksData)
    for emitter in self.emitters:
        emitter(checksData, self.checksLogger, self.agentConfig)

    self.checksLogger.info("Checks done")
def doChecks(self, firstRun=False, systemStats=False):
    """Actual work

    Run every registered system/application check, assemble the results
    into one `checksData` payload dict, and pass it to each emitter.

    firstRun    -- when True, `systemStats` is attached to the payload.
    systemStats -- host stats collected at startup (only used on first run).
    """
    self.checksLogger.info("Starting checks")

    # Run each check object; a falsy/False result means nothing to report.
    apacheStatus = self._apache.check(self.agentConfig)
    diskUsage = self._disk.check(self.agentConfig)
    loadAvrgs = self._load.check(self.agentConfig)
    memory = self._memory.check(self.agentConfig)
    mysqlStatus = self._mysql.check(self.agentConfig)
    pgsqlStatus = self._pgsql.check(self.agentConfig)
    networkTraffic = self._network.check(self.agentConfig)
    nginxStatus = self._nginx.check(self.agentConfig)
    processes = self._processes.check(self.checksLogger, self.agentConfig)
    rabbitmq = self._rabbitmq.check(self.checksLogger, self.agentConfig)
    mongodb = self._mongodb.check(self.agentConfig)
    couchdb = self._couchdb.check(self.agentConfig)
    ioStats = self._io.check(self.checksLogger, self.agentConfig)
    cpuStats = self._cpu.check(self.checksLogger, self.agentConfig)
    gangliaData = self._ganglia.check(self.agentConfig)
    cassandraData = self._cassandra.check(self.checksLogger, self.agentConfig)
    jvmData = self._jvm.check(self.agentConfig)
    tomcatData = self._tomcat.check(self.agentConfig)
    activeMQData = self._activemq.check(self.agentConfig)
    solrData = self._solr.check(self.agentConfig)
    memcacheData = self._memcache.check(self.agentConfig)
    dogstreamData = self._dogstream.check(self.agentConfig)
    ddforwarderData = self._ddforwarder.check(self.agentConfig)

    # Core payload: system metrics plus agent identity.
    checksData = {
        "collection_timestamp": time.time(),
        "os": self.os,
        "python": sys.version,
        "agentVersion": self.agentConfig["version"],
        "loadAvrg1": loadAvrgs["1"],
        "loadAvrg5": loadAvrgs["5"],
        "loadAvrg15": loadAvrgs["15"],
        "memPhysUsed": memory.get("physUsed"),
        "memPhysFree": memory.get("physFree"),
        "memPhysTotal": memory.get("physTotal"),
        "memPhysUsable": memory.get("physUsable"),
        "memSwapUsed": memory.get("swapUsed"),
        "memSwapFree": memory.get("swapFree"),
        "memSwapTotal": memory.get("swapTotal"),
        "memCached": memory.get("physCached"),
        "memBuffers": memory.get("physBuffers"),
        "memShared": memory.get("physShared"),
        "networkTraffic": networkTraffic,
        "processes": processes,
        "apiKey": self.agentConfig["api_key"],
        "events": {},
        "resources": {},
    }

    # diskUsage is (usage, inodes) on success.
    if diskUsage is not False and len(diskUsage) == 2:
        checksData["diskUsage"] = diskUsage[0]
        checksData["inodes"] = diskUsage[1]

    if cpuStats is not False and cpuStats is not None:
        checksData.update(cpuStats)

    if gangliaData is not False and gangliaData is not None:
        checksData["ganglia"] = gangliaData

    if cassandraData is not False and cassandraData is not None:
        checksData["cassandra"] = cassandraData

    # Apache Status
    if apacheStatus:
        checksData.update(apacheStatus)

    # MySQL Status
    if mysqlStatus:
        checksData.update(mysqlStatus)

    # PostgreSQL status
    if pgsqlStatus:
        checksData["postgresql"] = pgsqlStatus

    # Nginx Status
    if nginxStatus:
        checksData.update(nginxStatus)

    # RabbitMQ
    if rabbitmq:
        checksData["rabbitMQ"] = rabbitmq

    # MongoDB
    if mongodb:
        # Mongo events ride in the shared "events" section, not under
        # the "mongoDB" key.
        if mongodb.has_key("events"):
            checksData["events"]["Mongo"] = mongodb["events"]["Mongo"]
            del mongodb["events"]
        checksData["mongoDB"] = mongodb

    # CouchDB
    if couchdb:
        checksData["couchDB"] = couchdb

    if ioStats:
        checksData["ioStats"] = ioStats

    if jvmData:
        checksData["jvm"] = jvmData

    if tomcatData:
        checksData["tomcat"] = tomcatData

    if activeMQData:
        checksData["activemq"] = activeMQData

    if solrData:
        checksData["solr"] = solrData

    if memcacheData:
        checksData["memcache"] = memcacheData

    if dogstreamData:
        # Dogstream events merge into the shared "events" section; the
        # remainder of the payload merges into the top level.
        dogstreamEvents = dogstreamData.get("dogstreamEvents", None)
        if dogstreamEvents:
            if "dogstream" in checksData["events"]:
                checksData["events"]["dogstream"].extend(dogstreamEvents)
            else:
                checksData["events"]["dogstream"] = dogstreamEvents
            del dogstreamData["dogstreamEvents"]
        checksData.update(dogstreamData)

    if ddforwarderData:
        checksData["datadog"] = ddforwarderData

    # Include server indentifiers
    checksData["internalHostname"] = gethostname(self.agentConfig)
    checksData["uuid"] = getUuid()
    self.checksLogger.debug("doChecks: added uuid %s" % checksData["uuid"])

    # Process the event checks.
    for event_check in self._event_checks:
        event_data = event_check.check(self.checksLogger, self.agentConfig)
        if event_data:
            checksData["events"][event_check.key] = event_data

    # Include system stats on first postback
    if firstRun:
        checksData["systemStats"] = systemStats

    # Add static tags from the configuration file
    if self.agentConfig["tags"] is not None:
        checksData["tags"] = self.agentConfig["tags"]

    # Also post an event in the newsfeed
    checksData["events"]["System"] = [
        {
            "api_key": self.agentConfig["api_key"],
            "host": checksData["internalHostname"],
            "timestamp": int(time.mktime(datetime.datetime.now().timetuple())),
            "event_type": "Agent Startup",
            "msg_text": "Version %s" % get_version(),
        }
    ]

    # Collect metadata
    checksData["meta"] = self.get_metadata()

    # Resources checks
    has_resource = False
    for resources_check in self._resources_checks:
        resources_check.check()
        snaps = resources_check.pop_snapshots()
        if snaps:
            has_resource = True
            res_value = {"snaps": snaps, "format_version": resources_check.get_format_version()}
            res_format = resources_check.describe_format_if_needed()
            if res_format is not None:
                res_value["format_description"] = res_format
            checksData["resources"][resources_check.RESOURCE_KEY] = res_value

    if has_resource:
        # Resource payloads carry their own identity metadata.
        checksData["resources"]["meta"] = {
            "api_key": self.agentConfig["api_key"],
            "host": checksData["internalHostname"],
        }

    metrics = []
    for metrics_check in self._metrics_checks:
        res = metrics_check.check(self.agentConfig)
        if res:
            metrics.extend(res)
    checksData["metrics"] = metrics

    # Send back data
    self.checksLogger.debug("checksData: %s" % checksData)
    for emitter in self.emitters:
        emitter(checksData, self.checksLogger, self.agentConfig)

    self.checksLogger.info("Checks done")
def doChecks(self, firstRun=False, systemStats=False, checksd=None):
    """Run one full collection cycle and emit the assembled payload.

    Gathers OS-level system metrics, legacy integration checks (apache,
    mysql, rabbitmq, mongo, couch, ganglia, cassandra, dogstream,
    ddforwarder), event checks, resource checks and checks.d plugin
    checks into a single ``checksData`` dict, then hands that dict to
    every configured emitter.

    :param firstRun: when True, attach ``systemStats`` and force host
        metadata collection into the payload.
    :param systemStats: static system facts gathered at startup; only
        sent on the first run.
    :param checksd: optional list of checks.d entries, each a dict with
        'class', 'name' and 'instances' keys. May be None.
    """
    self.checksLogger.info("Starting checks")

    # Base payload; 'events' and 'resources' are filled in below and
    # re-assigned at the end of the cycle.
    checksData = {
        'collection_timestamp': time.time(),
        'os': self.os,
        'python': sys.version,
        'agentVersion': self.agentConfig['version'],
        'apiKey': self.agentConfig['api_key'],
        'events': {},
        'resources': {}
    }

    metrics = []
    events = {}

    # Run the system checks. Checks will depend on the OS
    if self.os == 'windows':
        # Win32 system checks
        metrics.extend(self._win32_system_checks['disk'].check(self.agentConfig))
        metrics.extend(self._win32_system_checks['memory'].check(self.agentConfig))
        metrics.extend(self._win32_system_checks['cpu'].check(self.agentConfig))
        metrics.extend(self._win32_system_checks['network'].check(self.agentConfig))
        metrics.extend(self._win32_system_checks['io'].check(self.agentConfig))
        metrics.extend(self._win32_system_checks['proc'].check(self.agentConfig))
    else:
        # Unix system checks
        sys_checks = self._unix_system_checks

        # disk check returns (usage, inodes) or False on failure
        diskUsage = sys_checks['disk'].check(self.agentConfig)
        if diskUsage is not False and len(diskUsage) == 2:
            checksData["diskUsage"] = diskUsage[0]
            checksData["inodes"] = diskUsage[1]

        loadAvrgs = sys_checks['load'].check(self.agentConfig)
        checksData.update({
            'loadAvrg1': loadAvrgs['1'],
            'loadAvrg5': loadAvrgs['5'],
            'loadAvrg15': loadAvrgs['15']
        })

        memory = sys_checks['memory'].check(self.agentConfig)
        checksData.update({
            'memPhysUsed': memory.get('physUsed'),
            'memPhysFree': memory.get('physFree'),
            'memPhysTotal': memory.get('physTotal'),
            'memPhysUsable': memory.get('physUsable'),
            'memSwapUsed': memory.get('swapUsed'),
            'memSwapFree': memory.get('swapFree'),
            'memSwapTotal': memory.get('swapTotal'),
            'memCached': memory.get('physCached'),
            'memBuffers': memory.get('physBuffers'),
            'memShared': memory.get('physShared')
        })

        ioStats = sys_checks['io'].check(self.checksLogger, self.agentConfig)
        if ioStats:
            checksData['ioStats'] = ioStats

        processes = sys_checks['processes'].check(self.checksLogger, self.agentConfig)
        checksData.update({'processes': processes})

        networkTraffic = sys_checks['network'].check(self.agentConfig)
        checksData.update({'networkTraffic': networkTraffic})

        cpuStats = sys_checks['cpu'].check(self.checksLogger, self.agentConfig)
        if cpuStats is not False and cpuStats is not None:
            checksData.update(cpuStats)

    # Run old-style checks
    apacheStatus = self._apache.check(self.agentConfig)
    mysqlStatus = self._mysql.check(self.agentConfig)
    rabbitmq = self._rabbitmq.check(self.checksLogger, self.agentConfig)
    mongodb = self._mongodb.check(self.agentConfig)
    couchdb = self._couchdb.check(self.agentConfig)
    gangliaData = self._ganglia.check(self.agentConfig)
    cassandraData = self._cassandra.check(self.checksLogger, self.agentConfig)
    dogstreamData = self._dogstream.check(self.agentConfig)
    ddforwarderData = self._ddforwarder.check(self.agentConfig)

    if gangliaData is not False and gangliaData is not None:
        checksData['ganglia'] = gangliaData

    if cassandraData is not False and cassandraData is not None:
        checksData['cassandra'] = cassandraData

    # Apache Status
    if apacheStatus:
        checksData.update(apacheStatus)

    # MySQL Status
    if mysqlStatus:
        checksData.update(mysqlStatus)

    # RabbitMQ
    if rabbitmq:
        checksData['rabbitMQ'] = rabbitmq

    # MongoDB
    if mongodb:
        # FIX: has_key() is deprecated; use the 'in' operator instead.
        if 'events' in mongodb:
            # Mongo events are shipped in the payload's events section,
            # not inside the mongoDB metrics blob.
            events['Mongo'] = mongodb['events']['Mongo']
            del mongodb['events']
        checksData['mongoDB'] = mongodb

    # CouchDB
    if couchdb:
        checksData['couchDB'] = couchdb

    if dogstreamData:
        dogstreamEvents = dogstreamData.get('dogstreamEvents', None)
        if dogstreamEvents:
            # FIX: the original tested membership in checksData['events']
            # (always empty at this point) but mutated the separate
            # `events` dict. Test and mutate the same dict.
            if 'dogstream' in events:
                events['dogstream'].extend(dogstreamEvents)
            else:
                events['dogstream'] = dogstreamEvents
            del dogstreamData['dogstreamEvents']
        checksData.update(dogstreamData)

    if ddforwarderData:
        checksData['datadog'] = ddforwarderData

    # Include server identifiers
    checksData['internalHostname'] = gethostname(self.agentConfig)
    checksData['uuid'] = getUuid()
    self.checksLogger.debug('doChecks: added uuid %s' % checksData['uuid'])

    # Process the event checks.
    for event_check in self._event_checks:
        event_data = event_check.check(self.checksLogger, self.agentConfig)
        if event_data:
            events[event_check.key] = event_data

    # Include system stats on first postback
    if firstRun:
        checksData['systemStats'] = systemStats

    # Also post an event in the newsfeed
    events['System'] = [{
        'api_key': self.agentConfig['api_key'],
        'host': checksData['internalHostname'],
        'timestamp': int(time.mktime(datetime.datetime.now().timetuple())),
        'event_type': 'Agent Startup',
        'msg_text': 'Version %s' % get_version()
    }]

    if firstRun or self.should_send_metadata():
        # Collect metadata
        checksData['meta'] = self.get_metadata()

    # Add static tags from the configuration file
    if self.agentConfig['tags'] is not None:
        checksData['tags'] = self.agentConfig['tags']

    # Resources checks (Unix only)
    if self.os != 'windows':
        has_resource = False
        for resources_check in self._resources_checks:
            resources_check.check()
            snaps = resources_check.pop_snapshots()
            if snaps:
                has_resource = True
                res_value = {
                    'snaps': snaps,
                    'format_version': resources_check.get_format_version()
                }
                res_format = resources_check.describe_format_if_needed()
                if res_format is not None:
                    res_value['format_description'] = res_format
                checksData['resources'][resources_check.RESOURCE_KEY] = res_value

        if has_resource:
            checksData['resources']['meta'] = {
                'api_key': self.agentConfig['api_key'],
                'host': checksData['internalHostname'],
            }

    for metrics_check in self._metrics_checks:
        res = metrics_check.check(self.agentConfig)
        if res:
            metrics.extend(res)

    # checks.d checks
    checksd = checksd or []
    for check in checksd:
        check_cls = check['class']
        for instance in check['instances']:
            try:
                # Run the check for each configuration
                check_cls.check(instance)
                metrics.extend(check_cls.get_metrics())
                if check_cls.has_events():
                    if check['name'] not in events:
                        events[check['name']] = []
                    for ev in check_cls.get_events():
                        events[check['name']].append(ev)
            except Exception:
                # One failing plugin must not abort the whole cycle.
                self.checksLogger.exception("Check %s failed" % check_cls.name)

    # Store the metrics in the payload
    checksData['metrics'] = metrics

    # Store the events in the payload
    checksData['events'] = events

    # Send back data
    self.checksLogger.debug("checksData: %s" % checksData)
    for emitter in self.emitters:
        emitter(checksData, self.checksLogger, self.agentConfig)
    self.checksLogger.info("Checks done")
def run(self):
    """Start the forwarder's HTTP server and block on the tornado ioloop.

    Sets up the intake/series/status routes, configures logging
    verbosity, binds the listening socket (locally only unless
    non_local_traffic is exactly True), schedules the periodic
    transaction flush, optionally starts a Graphite listener, and then
    runs the ioloop until stopped. Blocks until the loop exits.
    """
    # URL routes served by this tornado application.
    handlers = [
        (r"/intake/?", AgentInputHandler),
        (r"/api/v1/series/?", ApiInputHandler),
        (r"/status/?", StatusHandler),
    ]

    settings = dict(
        cookie_secret="12oETzKXQAGaYdkL5gEmGeJJFuYh7EQnp2XdTP1o/Vo=",
        xsrf_cookies=False,
        debug=False,
    )

    non_local_traffic = self._agentConfig.get("non_local_traffic", False)

    # This object *is* the tornado Application; finish its initialization
    # here rather than in __init__.
    tornado.web.Application.__init__(self, handlers, **settings)
    http_server = tornado.httpserver.HTTPServer(self)

    # set the root logger to warn so tornado is less chatty
    logging.getLogger().setLevel(logging.WARNING)

    # but keep the forwarder logger at the original level
    forwarder_logger = logging.getLogger('forwarder')
    log_config = get_logging_config()
    forwarder_logger.setLevel(log_config['log_level'] or logging.INFO)

    # non_local_traffic must be == True to match, not just some non-false value
    if non_local_traffic is True:
        # Bind on all interfaces.
        http_server.listen(self._port)
    else:
        # localhost in lieu of 127.0.0.1 to support IPv6
        try:
            http_server.listen(self._port, address = "localhost")
        except gaierror:
            # "localhost" failed to resolve; fall back to plain IPv4.
            log.warning("Warning localhost seems undefined in your host file, using 127.0.0.1 instead")
            http_server.listen(self._port, address = "127.0.0.1")

    log.info("Listening on port %d" % self._port)

    # Register callbacks
    self.mloop = tornado.ioloop.IOLoop.instance()

    def flush_trs():
        # Periodic callback: pet the watchdog, push buffered metrics,
        # and flush pending transactions to the intake.
        if self._watchdog:
            self._watchdog.reset()
        self._postMetrics()
        self._tr_manager.flush()

    tr_sched = tornado.ioloop.PeriodicCallback(flush_trs,TRANSACTION_FLUSH_INTERVAL, io_loop = self.mloop)

    # Register optional Graphite listener
    gport = self._agentConfig.get("graphite_listen_port", None)
    if gport is not None:
        log.info("Starting graphite listener on port %s" % gport)
        # Imported lazily so the dependency is only needed when enabled.
        from graphite import GraphiteServer
        gs = GraphiteServer(self, gethostname(self._agentConfig), io_loop=self.mloop)
        # Same exposure policy as the HTTP server above.
        if non_local_traffic is True:
            gs.listen(gport)
        else:
            gs.listen(gport, address = "localhost")

    # Start everything
    if self._watchdog:
        self._watchdog.reset()
    tr_sched.start()

    # Blocks here until the ioloop is stopped.
    self.mloop.start()
    log.info("Stopped")