def run(self, checksd=None, start_event=True):
    """
    Collect data from each check and submit their data.

    Runs OS-level system checks, legacy-style checks (ganglia, dogstream,
    ddforwarder), resource checks and checks.d checks, builds the payload,
    emits it, and persists the collector status.

    :param checksd: optional dict with 'initialized_checks' (list of check
        instances) and 'init_failed_checks' ({check_name: {error, traceback}}).
    :param start_event: forwarded to the payload builder to flag a start event.
    """
    timer = Timer()
    if self.os != 'windows':
        # CPU time is only reported on non-Windows platforms (see bottom).
        cpu_clock = time.clock()
    self.run_count += 1
    log.debug("Starting collection run #%s" % self.run_count)

    payload = self._build_payload(start_event=start_event)
    metrics = payload['metrics']
    events = payload['events']
    service_checks = payload['service_checks']
    if checksd:
        self.initialized_checks_d = checksd['initialized_checks']  # is of type {check_name: check}
        self.init_failed_checks_d = checksd['init_failed_checks']  # is of type {check_name: {error, traceback}}

    # Run the system checks. Checks will depend on the OS
    if self.os == 'windows':
        # Win32 system checks
        try:
            metrics.extend(self._win32_system_checks['disk'].check(self.agentConfig))
            metrics.extend(self._win32_system_checks['memory'].check(self.agentConfig))
            metrics.extend(self._win32_system_checks['cpu'].check(self.agentConfig))
            metrics.extend(self._win32_system_checks['network'].check(self.agentConfig))
            metrics.extend(self._win32_system_checks['io'].check(self.agentConfig))
            metrics.extend(self._win32_system_checks['proc'].check(self.agentConfig))
        except Exception:
            log.exception('Unable to fetch Windows system metrics.')
    else:
        # Unix system checks
        sys_checks = self._unix_system_checks

        diskUsage = sys_checks['disk'].check(self.agentConfig)
        if diskUsage and len(diskUsage) == 2:
            payload["diskUsage"] = diskUsage[0]
            payload["inodes"] = diskUsage[1]

        load = sys_checks['load'].check(self.agentConfig)
        payload.update(load)

        memory = sys_checks['memory'].check(self.agentConfig)
        if memory:
            payload.update({
                'memPhysUsed': memory.get('physUsed'),
                'memPhysPctUsable': memory.get('physPctUsable'),
                'memPhysFree': memory.get('physFree'),
                'memPhysTotal': memory.get('physTotal'),
                'memPhysUsable': memory.get('physUsable'),
                'memSwapUsed': memory.get('swapUsed'),
                'memSwapFree': memory.get('swapFree'),
                'memSwapPctFree': memory.get('swapPctFree'),
                'memSwapTotal': memory.get('swapTotal'),
                'memCached': memory.get('physCached'),
                'memBuffers': memory.get('physBuffers'),
                'memShared': memory.get('physShared')
            })

        ioStats = sys_checks['io'].check(self.agentConfig)
        if ioStats:
            payload['ioStats'] = ioStats

        processes = sys_checks['processes'].check(self.agentConfig)
        payload.update({'processes': processes})

        cpuStats = sys_checks['cpu'].check(self.agentConfig)
        if cpuStats:
            payload.update(cpuStats)

    # Run old-style checks
    gangliaData = self._ganglia.check(self.agentConfig)
    dogstreamData = self._dogstream.check(self.agentConfig)
    ddforwarderData = self._ddforwarder.check(self.agentConfig)

    if gangliaData is not False and gangliaData is not None:
        payload['ganglia'] = gangliaData

    # dogstream
    if dogstreamData:
        dogstreamEvents = dogstreamData.get('dogstreamEvents', None)
        if dogstreamEvents:
            if 'dogstream' in payload['events']:
                events['dogstream'].extend(dogstreamEvents)
            else:
                events['dogstream'] = dogstreamEvents
            del dogstreamData['dogstreamEvents']
        payload.update(dogstreamData)

    # metrics about the forwarder
    if ddforwarderData:
        payload['datadog'] = ddforwarderData

    # Process the event checks.
    for event_check in self._event_checks:
        event_data = event_check.check(log, self.agentConfig)
        if event_data:
            events[event_check.key] = event_data

    # Resources checks
    if self.os != 'windows':
        has_resource = False
        for resources_check in self._resources_checks:
            resources_check.check()
            snaps = resources_check.pop_snapshots()
            if snaps:
                has_resource = True
                res_value = {
                    'snaps': snaps,
                    'format_version': resources_check.get_format_version()
                }
                res_format = resources_check.describe_format_if_needed()
                if res_format is not None:
                    res_value['format_description'] = res_format
                payload['resources'][resources_check.RESOURCE_KEY] = res_value
        if has_resource:
            payload['resources']['meta'] = {
                'api_key': self.agentConfig['api_key'],
                'host': payload['internalHostname'],
            }

    # newer-style checks (not checks.d style)
    for metrics_check in self._metrics_checks:
        res = metrics_check.check(self.agentConfig)
        if res:
            metrics.extend(res)

    # checks.d checks
    check_statuses = []
    for check in self.initialized_checks_d:
        if not self.continue_running:
            return
        log.info("Running check %s" % check.name)
        instance_statuses = []
        metric_count = 0
        event_count = 0
        # FIX: service_check_count must be reset per check *before* the try
        # block. Previously it was only assigned inside the try, so a check
        # raising in run() left it unbound (UnboundLocalError on the first
        # iteration) or stale from the previous check.
        service_check_count = 0
        try:
            # Run the check.
            instance_statuses = check.run()

            # Collect the metrics and events.
            current_check_metrics = check.get_metrics()
            current_check_events = check.get_events()
            current_check_service_checks = check.get_service_checks()

            # Save them for the payload.
            metrics.extend(current_check_metrics)
            if current_check_events:
                if check.name not in events:
                    events[check.name] = current_check_events
                else:
                    events[check.name] += current_check_events
            if current_check_service_checks:
                service_checks.extend(current_check_service_checks)

            # Save the status of the check.
            metric_count = len(current_check_metrics)
            event_count = len(current_check_events)
            service_check_count = len(current_check_service_checks)
        except Exception:
            log.exception("Error running check %s" % check.name)

        check_status = CheckStatus(check.name, instance_statuses, metric_count,
                                   event_count, service_check_count,
                                   library_versions=check.get_library_info())
        check_statuses.append(check_status)

    # Surface checks that failed to even initialize.
    for check_name, info in self.init_failed_checks_d.iteritems():
        if not self.continue_running:
            return
        check_status = CheckStatus(check_name, None, None, None, None,
                                   init_failed_error=info['error'],
                                   init_failed_traceback=info['traceback'])
        check_statuses.append(check_status)

    # Store the metrics and events in the payload.
    payload['metrics'] = metrics
    payload['events'] = events
    payload['service_checks'] = service_checks
    collect_duration = timer.step()

    # Agent self-metrics run last so they can report on this collection run.
    if self.os != 'windows':
        payload['metrics'].extend(self._agent_metrics.check(payload, self.agentConfig,
                                                            collect_duration,
                                                            self.emit_duration,
                                                            time.clock() - cpu_clock))
    else:
        payload['metrics'].extend(self._agent_metrics.check(payload, self.agentConfig,
                                                            collect_duration,
                                                            self.emit_duration))

    emitter_statuses = self._emit(payload)
    self.emit_duration = timer.step()

    # Persist the status of the collection run.
    try:
        CollectorStatus(check_statuses, emitter_statuses, self.metadata_cache).persist()
    except Exception:
        log.exception("Error persisting collector status")

    # Log at INFO for the first few runs (and periodically after), DEBUG otherwise.
    if self.run_count <= FLUSH_LOGGING_INITIAL or self.run_count % FLUSH_LOGGING_PERIOD == 0:
        log.info("Finished run #%s. Collection time: %ss. Emit time: %ss" %
                 (self.run_count, round(collect_duration, 2), round(self.emit_duration, 2)))
        if self.run_count == FLUSH_LOGGING_INITIAL:
            log.info("First flushes done, next flushes will be logged every %s flushes." % FLUSH_LOGGING_PERIOD)
    else:
        log.debug("Finished run #%s. Collection time: %ss. Emit time: %ss" %
                  (self.run_count, round(collect_duration, 2), round(self.emit_duration, 2)))
def run(self, checksd=None, start_event=True, configs_reloaded=False):
    """
    Collect data from each check and submit their data.

    :param checksd: optional dict with 'initialized_checks' (list of
        AgentCheck instances) and 'init_failed_checks'
        ({check_name: {error, traceback}}).
    :param start_event: forwarded to the payload metadata population.
    :param configs_reloaded: when True, re-locate the AgentMetrics check
        in the (possibly fresh) check list.
    :return: the emitted AgentPayload.
    """
    # FIX: guard the debug line — checksd defaults to None and subscripting
    # it unconditionally raised TypeError for callers relying on the default.
    if checksd:
        log.debug("Found {num_checks} checks".format(num_checks=len(checksd['initialized_checks'])))
    timer = Timer()
    if not Platform.is_windows():
        cpu_clock = time.clock()
    self.run_count += 1
    log.debug("Starting collection run #%s" % self.run_count)

    if checksd:
        self.initialized_checks_d = checksd['initialized_checks']  # is a list of AgentCheck instances
        self.init_failed_checks_d = checksd['init_failed_checks']  # is of type {check_name: {error, traceback}}

    payload = AgentPayload()

    # Find the AgentMetrics check and pop it out
    # This check must run at the end of the loop to collect info on agent performance
    if not self._agent_metrics or configs_reloaded:
        for check in self.initialized_checks_d:
            if check.name == AGENT_METRICS_CHECK_NAME:
                self._agent_metrics = check
                self.initialized_checks_d.remove(check)
                break

    # Initialize payload
    self._build_payload(payload)

    metrics = payload['metrics']
    events = payload['events']
    service_checks = payload['service_checks']

    # Run the system checks. Checks will depend on the OS
    if Platform.is_windows():
        # Win32 system checks
        try:
            metrics.extend(self._win32_system_checks['memory'].check(self.agentConfig))
            metrics.extend(self._win32_system_checks['cpu'].check(self.agentConfig))
            metrics.extend(self._win32_system_checks['network'].check(self.agentConfig))
            metrics.extend(self._win32_system_checks['io'].check(self.agentConfig))
            metrics.extend(self._win32_system_checks['proc'].check(self.agentConfig))
            metrics.extend(self._win32_system_checks['system'].check(self.agentConfig))
        except Exception:
            log.exception('Unable to fetch Windows system metrics.')
    else:
        # Unix system checks
        sys_checks = self._unix_system_checks

        load = sys_checks['load'].check(self.agentConfig)
        payload.update(load)

        system = sys_checks['system'].check(self.agentConfig)
        payload.update(system)

        memory = sys_checks['memory'].check(self.agentConfig)
        if memory:
            memstats = {
                'memPhysUsed': memory.get('physUsed'),
                'memPhysPctUsable': memory.get('physPctUsable'),
                'memPhysFree': memory.get('physFree'),
                'memPhysTotal': memory.get('physTotal'),
                'memPhysUsable': memory.get('physUsable'),
                'memSwapUsed': memory.get('swapUsed'),
                'memSwapFree': memory.get('swapFree'),
                'memSwapPctFree': memory.get('swapPctFree'),
                'memSwapTotal': memory.get('swapTotal'),
                'memCached': memory.get('physCached'),
                'memBuffers': memory.get('physBuffers'),
                'memShared': memory.get('physShared'),
                'memSlab': memory.get('physSlab'),
                'memPageTables': memory.get('physPageTables'),
                'memSwapCached': memory.get('swapCached')
            }
            payload.update(memstats)

        ioStats = sys_checks['io'].check(self.agentConfig)
        if ioStats:
            payload['ioStats'] = ioStats

        processes = sys_checks['processes'].check(self.agentConfig)
        payload.update({'processes': processes})

        cpuStats = sys_checks['cpu'].check(self.agentConfig)
        if cpuStats:
            payload.update(cpuStats)

    # Run old-style checks
    gangliaData = self._ganglia.check(self.agentConfig)
    dogstreamData = self._dogstream.check(self.agentConfig)
    ddforwarderData = self._ddforwarder.check(self.agentConfig)

    if gangliaData is not False and gangliaData is not None:
        payload['ganglia'] = gangliaData

    # dogstream
    if dogstreamData:
        dogstreamEvents = dogstreamData.get('dogstreamEvents', None)
        if dogstreamEvents:
            if 'dogstream' in payload['events']:
                events['dogstream'].extend(dogstreamEvents)
            else:
                events['dogstream'] = dogstreamEvents
            del dogstreamData['dogstreamEvents']
        payload.update(dogstreamData)

    # metrics about the forwarder
    if ddforwarderData:
        payload['datadog'] = ddforwarderData

    # process collector of gohai (compliant with payload of legacy "resources checks")
    if not Platform.is_windows() and self._should_send_additional_data('processes'):
        gohai_processes = self._run_gohai_processes()
        if gohai_processes:
            try:
                gohai_processes_json = json.loads(gohai_processes)
                processes_payload = {
                    'snaps': [gohai_processes_json.get('processes')],
                    'format_version': 1
                }
                # The format description is only sent on the first run.
                if self._is_first_run():
                    processes_payload['format_description'] = PROCESSES_FORMAT_DESCRIPTION
                payload['resources'] = {
                    'processes': processes_payload,
                    'meta': {
                        'host': payload['internalHostname'],
                    }
                }
            except Exception:
                log.exception("Error running gohai processes collection")

    # newer-style checks (not checks.d style)
    for metrics_check in self._metrics_checks:
        res = metrics_check.check(self.agentConfig)
        if res:
            metrics.extend(res)

    # checks.d checks
    check_statuses = []
    for check in self.initialized_checks_d:
        if not self.continue_running:
            return
        log.info("Running check %s" % check.name)
        instance_statuses = []
        metric_count = 0
        event_count = 0
        service_check_count = 0
        check_start_time = time.time()
        check_stats = None
        # FIX: pre-initialize so a check raising early in run() doesn't leave
        # current_check_metadata unbound when CheckStatus is built below.
        current_check_metadata = None

        try:
            # Run the check.
            instance_statuses = check.run()

            # Collect the metrics and events.
            current_check_metrics = check.get_metrics()
            current_check_events = check.get_events()
            check_stats = check._get_internal_profiling_stats()

            # Collect metadata
            current_check_metadata = check.get_service_metadata()

            # Save metrics & events for the payload.
            metrics.extend(current_check_metrics)
            if current_check_events:
                if check.name not in events:
                    events[check.name] = current_check_events
                else:
                    events[check.name] += current_check_events

            # Save the status of the check.
            metric_count = len(current_check_metrics)
            event_count = len(current_check_events)
        except Exception:
            log.exception("Error running check %s" % check.name)

        check_status = CheckStatus(
            check.name, instance_statuses, metric_count, event_count,
            service_check_count, service_metadata=current_check_metadata,
            library_versions=check.get_library_info(),
            source_type_name=check.SOURCE_TYPE_NAME or check.name,
            check_stats=check_stats
        )

        # Service check for Agent checks failures
        # NOTE(review): `status` is only assigned for STATUS_OK / STATUS_ERROR;
        # presumably those are the only possible values — confirm upstream.
        service_check_tags = ["check:%s" % check.name]
        if check_status.status == STATUS_OK:
            status = AgentCheck.OK
        elif check_status.status == STATUS_ERROR:
            status = AgentCheck.CRITICAL
        check.service_check('datadog.agent.check_status', status, tags=service_check_tags)

        # Collect the service checks and save them in the payload
        current_check_service_checks = check.get_service_checks()
        if current_check_service_checks:
            service_checks.extend(current_check_service_checks)
        service_check_count = len(current_check_service_checks)

        # Update the check status with the correct service_check_count
        check_status.service_check_count = service_check_count
        check_statuses.append(check_status)

        check_run_time = time.time() - check_start_time
        log.debug("Check %s ran in %.2f s" % (check.name, check_run_time))

        # Intrument check run timings if enabled.
        if self.check_timings:
            metric = 'datadog.agent.check_run_time'
            meta = {'tags': ["check:%s" % check.name]}
            metrics.append((metric, time.time(), check_run_time, meta))

    for check_name, info in self.init_failed_checks_d.iteritems():
        if not self.continue_running:
            return
        check_status = CheckStatus(check_name, None, None, None, None,
                                   init_failed_error=info['error'],
                                   init_failed_traceback=info['traceback'])
        check_statuses.append(check_status)

    # Add a service check for the agent
    service_checks.append(create_service_check('datadog.agent.up', AgentCheck.OK,
                                               hostname=self.hostname))

    # Store the metrics and events in the payload.
    payload['metrics'] = metrics
    payload['events'] = events
    payload['service_checks'] = service_checks

    # Populate metadata
    self._populate_payload_metadata(payload, check_statuses, start_event)

    collect_duration = timer.step()

    if self._agent_metrics:
        metric_context = {
            'collection_time': collect_duration,
            'emit_time': self.emit_duration,
        }
        if not Platform.is_windows():
            metric_context['cpu_time'] = time.clock() - cpu_clock

        self._agent_metrics.set_metric_context(payload, metric_context)
        self._agent_metrics.run()
        agent_stats = self._agent_metrics.get_metrics()
        payload['metrics'].extend(agent_stats)
        if self.agentConfig.get('developer_mode'):
            log.debug("\n Agent developer mode stats: \n {0}".format(
                Collector._stats_for_display(agent_stats))
            )
        # Flush metadata for the Agent Metrics check. Otherwise they'll just accumulate and leak.
        self._agent_metrics.get_service_metadata()

    # Let's send our payload
    emitter_statuses = payload.emit(log, self.agentConfig, self.emitters,
                                    self.continue_running)
    self.emit_duration = timer.step()

    # Persist the status of the collection run.
    try:
        CollectorStatus(check_statuses, emitter_statuses,
                        self.hostname_metadata_cache).persist()
    except Exception:
        log.exception("Error persisting collector status")

    if self.run_count <= FLUSH_LOGGING_INITIAL or self.run_count % FLUSH_LOGGING_PERIOD == 0:
        log.info("Finished run #%s. Collection time: %ss. Emit time: %ss" %
                 (self.run_count, round(collect_duration, 2), round(self.emit_duration, 2)))
        if self.run_count == FLUSH_LOGGING_INITIAL:
            log.info("First flushes done, next flushes will be logged every %s flushes." % FLUSH_LOGGING_PERIOD)
    else:
        log.debug("Finished run #%s. Collection time: %ss. Emit time: %ss" %
                  (self.run_count, round(collect_duration, 2), round(self.emit_duration, 2)))

    return payload
def run(self, checksd=None, start_event=True, configs_reloaded=False):
    """
    Collect data from each check and submit their data.

    :param checksd: optional dict with 'initialized_checks' (list of
        AgentCheck instances) and 'init_failed_checks'
        ({check_name: {error, traceback}}).
    :param start_event: forwarded to the payload metadata population.
    :param configs_reloaded: when True, re-locate the AgentMetrics check.
    :return: the emitted AgentPayload.
    """
    # FIX: guard the debug line — checksd defaults to None and subscripting
    # it unconditionally raised TypeError for callers relying on the default.
    if checksd:
        log.debug("Found {num_checks} checks".format(num_checks=len(checksd['initialized_checks'])))
    timer = Timer()
    if not Platform.is_windows():
        cpu_clock = time.clock()
    self.run_count += 1
    log.debug("Starting collection run #%s" % self.run_count)

    if checksd:
        self.initialized_checks_d = checksd['initialized_checks']
        self.init_failed_checks_d = checksd['init_failed_checks']

    payload = AgentPayload()

    # The AgentMetrics check is popped out and run last so it can report on
    # this collection run's own performance.
    if not self._agent_metrics or configs_reloaded:
        for check in self.initialized_checks_d:
            if check.name == AGENT_METRICS_CHECK_NAME:
                self._agent_metrics = check
                self.initialized_checks_d.remove(check)
                break

    self._build_payload(payload)

    metrics = payload['metrics']
    events = payload['events']
    service_checks = payload['service_checks']

    # System checks depend on the OS.
    if Platform.is_windows():
        try:
            metrics.extend(self._win32_system_checks['memory'].check(self.agentConfig))
            metrics.extend(self._win32_system_checks['cpu'].check(self.agentConfig))
            metrics.extend(self._win32_system_checks['network'].check(self.agentConfig))
            metrics.extend(self._win32_system_checks['io'].check(self.agentConfig))
            metrics.extend(self._win32_system_checks['proc'].check(self.agentConfig))
            metrics.extend(self._win32_system_checks['system'].check(self.agentConfig))
        except Exception:
            log.exception('Unable to fetch Windows system metrics.')
    else:
        # Unix system checks.
        sys_checks = self._unix_system_checks

        load = sys_checks['load'].check(self.agentConfig)
        payload.update(load)

        system = sys_checks['system'].check(self.agentConfig)
        payload.update(system)

        memory = sys_checks['memory'].check(self.agentConfig)
        if memory:
            memstats = {
                'memPhysUsed': memory.get('physUsed'),
                'memPhysPctUsage': memory.get('physPctUsage'),
                'memPhysFree': memory.get('physFree'),
                'memPhysTotal': memory.get('physTotal'),
                'memPhysUsable': memory.get('physUsable'),
                'memSwapUsed': memory.get('swapUsed'),
                'memSwapFree': memory.get('swapFree'),
                'memSwapPctFree': memory.get('swapPctFree'),
                'memSwapTotal': memory.get('swapTotal'),
                'memCached': memory.get('physCached'),
                'memBuffers': memory.get('physBuffers'),
                'memShared': memory.get('physShared'),
                'memSlab': memory.get('physSlab'),
                'memPageTables': memory.get('physPageTables'),
                'memSwapCached': memory.get('swapCached')
            }
            payload.update(memstats)

        ioStats = sys_checks['io'].check(self.agentConfig)
        if ioStats:
            payload['ioStats'] = ioStats

        processes = sys_checks['processes'].check(self.agentConfig)
        payload.update({'processes': processes})

        cpuStats = sys_checks['cpu'].check(self.agentConfig)
        if cpuStats:
            payload.update(cpuStats)

    # Run old-style checks.
    gangliaData = self._ganglia.check(self.agentConfig)
    monitorstreamData = self._monitorstream.check(self.agentConfig)
    ddforwarderData = self._ddforwarder.check(self.agentConfig)

    if gangliaData is not False and gangliaData is not None:
        payload['ganglia'] = gangliaData

    # monitorstream events merge into the shared events payload.
    if monitorstreamData:
        monitorstreamEvents = monitorstreamData.get('monitorstreamEvents', None)
        if monitorstreamEvents:
            if 'monitorstream' in payload['events']:
                events['monitorstream'].extend(monitorstreamEvents)
            else:
                events['monitorstream'] = monitorstreamEvents
            del monitorstreamData['monitorstreamEvents']
        payload.update(monitorstreamData)

    # Metrics about the forwarder.
    if ddforwarderData:
        payload['datamonitor'] = ddforwarderData

    # Newer-style checks (not checks.d style).
    for metrics_check in self._metrics_checks:
        res = metrics_check.check(self.agentConfig)
        if res:
            metrics.extend(res)

    # checks.d checks.
    check_statuses = []
    for check in self.initialized_checks_d:
        if not self.continue_running:
            return
        log.info("Running check %s" % check.name)
        instance_statuses = []
        metric_count = 0
        event_count = 0
        service_check_count = 0
        check_start_time = time.time()
        check_stats = None
        # FIX: pre-initialize so a check raising early in run() doesn't leave
        # current_check_metadata unbound when CheckStatus is built below.
        current_check_metadata = None

        try:
            instance_statuses = check.run()

            current_check_metrics = check.get_metrics()
            current_check_events = check.get_events()
            check_stats = check._get_internal_profiling_stats()
            current_check_metadata = check.get_service_metadata()

            metrics.extend(current_check_metrics)
            if current_check_events:
                if check.name not in events:
                    events[check.name] = current_check_events
                else:
                    events[check.name] += current_check_events

            metric_count = len(current_check_metrics)
            event_count = len(current_check_events)
        except Exception:
            log.exception("Error running check %s" % check.name)

        check_status = CheckStatus(
            check.name, instance_statuses, metric_count, event_count,
            service_check_count, service_metadata=current_check_metadata,
            library_versions=check.get_library_info(),
            source_type_name=check.SOURCE_TYPE_NAME or check.name,
            check_stats=check_stats
        )

        # Service check reflecting the Agent check's own success/failure.
        service_check_tags = ["check:%s" % check.name]
        if check_status.status == STATUS_OK:
            status = AgentCheck.OK
        elif check_status.status == STATUS_ERROR:
            status = AgentCheck.CRITICAL
        check.service_check('datamonitor.agent.check_status', status, tags=service_check_tags)

        # Collect the service checks and save them in the payload.
        current_check_service_checks = check.get_service_checks()
        if current_check_service_checks:
            service_checks.extend(current_check_service_checks)
        service_check_count = len(current_check_service_checks)

        # Update the check status with the correct service_check_count.
        check_status.service_check_count = service_check_count
        check_statuses.append(check_status)

        check_run_time = time.time() - check_start_time
        log.debug("Check %s ran in %.2f s" % (check.name, check_run_time))

        # Instrument check run timings if enabled.
        if self.check_timings:
            metric = 'datamonitor.agent.check_run_time'
            meta = {'tags': ["check:%s" % check.name]}
            metrics.append((metric, time.time(), check_run_time, meta))

    for check_name, info in self.init_failed_checks_d.iteritems():
        if not self.continue_running:
            return
        # FIX: pass the traceback (was info['error'], which duplicated the
        # error message into the traceback field — see the parallel versions
        # of this method).
        check_status = CheckStatus(check_name, None, None, None, None,
                                   init_failed_error=info['error'],
                                   init_failed_traceback=info['traceback'])
        check_statuses.append(check_status)

    # Add a service check for the agent itself.
    service_checks.append(create_service_check('datamonitor.agent.up', AgentCheck.OK,
                                               hostname=self.hostname))

    # Store the metrics and events in the payload.
    payload['metrics'] = metrics
    payload['events'] = events
    payload['service_checks'] = service_checks

    self._populate_payload_metadata(payload, check_statuses, start_event)

    collect_duration = timer.step()

    if self._agent_metrics:
        metric_context = {
            'collection_time': collect_duration,
            'emit_time': self.emit_duration,
        }
        if not Platform.is_windows():
            metric_context['cpu_time'] = time.clock() - cpu_clock

        self._agent_metrics.set_metric_context(payload, metric_context)
        self._agent_metrics.run()
        agent_stats = self._agent_metrics.get_metrics()
        payload['metrics'].extend(agent_stats)
        if self.agentConfig.get('developer_mode'):
            log.debug("\n Agent developer mode stats: \n {0}".format(
                Collector._stats_for_display(agent_stats))
            )
        # Flush metadata for the Agent Metrics check so it doesn't accumulate.
        self._agent_metrics.get_service_metadata()

    # Send the payload.
    emitter_statuses = payload.emit(log, self.agentConfig, self.emitters,
                                    self.continue_running)
    self.emit_duration = timer.step()

    # Persist the status of the collection run.
    try:
        CollectorStatus(check_statuses, emitter_statuses,
                        self.hostname_metadata_cache).persist()
    except Exception:
        log.exception("Error persisting collector status")

    if self.run_count <= FLUSH_LOGGING_INITIAL or self.run_count % FLUSH_LOGGING_PERIOD == 0:
        log.info("Finished run #%s. Collection time: %ss. Emit time: %ss" %
                 (self.run_count, round(collect_duration, 2), round(self.emit_duration, 2)))
        if self.run_count == FLUSH_LOGGING_INITIAL:
            log.info("First flushes done, next flushes will be logged every %s flushes." % FLUSH_LOGGING_PERIOD)
    else:
        log.debug("Finished run #%s. Collection time: %ss. Emit time: %ss" %
                  (self.run_count, round(collect_duration, 2), round(self.emit_duration, 2)))

    return payload
def run(self, agentConfig):
    """
    Run all checks.d checks in an endless loop, one pass per COLLECT_INTERVAL,
    reloading the check directory every RELOAD_CHESK_INTERVAL passes.

    :param agentConfig: agent configuration dict, passed to
        load_check_directory.
    :return: never returns under normal operation (infinite loop).
    """
    hostname = ''
    timer = Timer()
    log.debug("Starting collection run")
    self.run_counter = 0
    while True:
        checksd = load_check_directory(agentConfig, hostname)
        log.debug("Found {num_checks} checks".format(num_checks=len(checksd['initialized_checks'])))
        if checksd:
            self.initialized_checks_d = checksd['initialized_checks']  # is a list of AgentCheck instances
            self.init_failed_checks_d = checksd['init_failed_checks']  # is of type {check_name: {error, traceback}}
        start = time.time()
        if not self.initialized_checks_d:
            log.info("Not found checks valid, please check for it")

        # checks.d checks
        for check in self.initialized_checks_d:
            log.info("Running check %s", check.name)
            instance_statuses = []
            check_start_time = time.time()
            try:
                # Run the check.
                instance_statuses = check.run()
                # Collect the metrics.
                current_check_metrics = check.get_metrics()
                parse_result(current_check_metrics, self.attr_map, self.id_with_ratio, self.id_with_ip)
                log.info("check result for %s: \n\t%s" % (check.name, current_check_metrics))
            except Exception:
                log.exception("Error running check %s" % check.name)
            check_run_time = time.time() - check_start_time
            log.debug("Check %s ran in %.2f s" % (check.name, check_run_time))

        collect_duration = timer.step()
        # Report only the remainder once a full interval has been consumed.
        collect_duration = collect_duration - (0 if collect_duration <= COLLECT_INTERVAL else COLLECT_INTERVAL)
        self.run_counter = self.run_counter + 1
        log.info("Finished run #%d. Collection time: %ss" % (self.run_counter, round(collect_duration, 2)))

        if self.run_counter > RELOAD_CHESK_INTERVAL:
            log.debug("Reload checks....")
            tmp_checksd = load_check_directory(agentConfig, hostname)
            checksd.clear()
            checksd['init_failed_checks'] = tmp_checksd['init_failed_checks']
            checksd['initialized_checks'] = []
            # Carry each existing check's aggregator over to its freshly
            # loaded counterpart so buffered state survives the reload.
            for new_check in tmp_checksd['initialized_checks']:
                for check in self.initialized_checks_d:
                    if new_check.name == check.name:
                        new_check.aggregator = check.aggregator
                        break
                checksd['initialized_checks'].append(new_check)
            if checksd:
                self.initialized_checks_d = checksd['initialized_checks']  # is a list of AgentCheck instances
                self.init_failed_checks_d = checksd['init_failed_checks']  # is of type {check_name: {error, traceback}}
            log.debug("Reload done, found {num_checks} checks".format(num_checks=len(self.initialized_checks_d)))
            self.run_counter = 0

        # FIX: clamp to zero — when a pass overruns COLLECT_INTERVAL the raw
        # value goes negative and time.sleep() raises ValueError, killing the
        # collection loop.
        sleep_time = max(0, COLLECT_INTERVAL - time.time() + start)
        time.sleep(sleep_time)
    return {}
def run(self, checksd=None, start_event=True):
    """ Collect data from each check and submit their data. """
    # checksd: optional dict with 'initialized_checks' / 'init_failed_checks';
    # start_event: forwarded to the payload builder.
    # Returns the assembled payload dict after emission.
    timer = Timer()
    if self.os != 'windows':
        # CPU time is only reported on non-Windows platforms (used at the end).
        cpu_clock = time.clock()
    self.run_count += 1
    log.debug("Starting collection run #%s" % self.run_count)
    payload = self._build_payload(start_event=start_event)
    metrics = payload['metrics']
    events = payload['events']
    service_checks = payload['service_checks']
    if checksd:
        self.initialized_checks_d = checksd[
            'initialized_checks']  # is of type {check_name: check}
        self.init_failed_checks_d = checksd[
            'init_failed_checks']  # is of type {check_name: {error, traceback}}
    # Run the system checks. Checks will depend on the OS
    if self.os == 'windows':
        # Win32 system checks
        try:
            metrics.extend(self._win32_system_checks['disk'].check(
                self.agentConfig))
            metrics.extend(self._win32_system_checks['memory'].check(
                self.agentConfig))
            metrics.extend(self._win32_system_checks['cpu'].check(
                self.agentConfig))
            metrics.extend(self._win32_system_checks['network'].check(
                self.agentConfig))
            metrics.extend(self._win32_system_checks['io'].check(
                self.agentConfig))
            metrics.extend(self._win32_system_checks['proc'].check(
                self.agentConfig))
        except Exception:
            log.exception('Unable to fetch Windows system metrics.')
    else:
        # Unix system checks
        sys_checks = self._unix_system_checks
        diskUsage = sys_checks['disk'].check(self.agentConfig)
        if diskUsage and len(diskUsage) == 2:
            payload["diskUsage"] = diskUsage[0]
            payload["inodes"] = diskUsage[1]
        load = sys_checks['load'].check(self.agentConfig)
        payload.update(load)
        system = sys_checks['system'].check(self.agentConfig)
        payload.update(system)
        memory = sys_checks['memory'].check(self.agentConfig)
        if memory:
            # Flatten the memory-check dict into top-level payload fields.
            payload.update({
                'memPhysUsed': memory.get('physUsed'),
                'memPhysPctUsable': memory.get('physPctUsable'),
                'memPhysFree': memory.get('physFree'),
                'memPhysTotal': memory.get('physTotal'),
                'memPhysUsable': memory.get('physUsable'),
                'memSwapUsed': memory.get('swapUsed'),
                'memSwapFree': memory.get('swapFree'),
                'memSwapPctFree': memory.get('swapPctFree'),
                'memSwapTotal': memory.get('swapTotal'),
                'memCached': memory.get('physCached'),
                'memBuffers': memory.get('physBuffers'),
                'memShared': memory.get('physShared')
            })
        ioStats = sys_checks['io'].check(self.agentConfig)
        if ioStats:
            payload['ioStats'] = ioStats
        processes = sys_checks['processes'].check(self.agentConfig)
        payload.update({'processes': processes})
        cpuStats = sys_checks['cpu'].check(self.agentConfig)
        if cpuStats:
            payload.update(cpuStats)
    # Run old-style checks
    gangliaData = self._ganglia.check(self.agentConfig)
    dogstreamData = self._dogstream.check(self.agentConfig)
    ddforwarderData = self._ddforwarder.check(self.agentConfig)
    if gangliaData is not False and gangliaData is not None:
        payload['ganglia'] = gangliaData
    # dogstream
    if dogstreamData:
        dogstreamEvents = dogstreamData.get('dogstreamEvents', None)
        if dogstreamEvents:
            # Merge into any events already collected under the same key.
            if 'dogstream' in payload['events']:
                events['dogstream'].extend(dogstreamEvents)
            else:
                events['dogstream'] = dogstreamEvents
            del dogstreamData['dogstreamEvents']
        payload.update(dogstreamData)
    # metrics about the forwarder
    if ddforwarderData:
        payload['datadog'] = ddforwarderData
    # Resources checks
    if self.os != 'windows':
        has_resource = False
        for resources_check in self._resources_checks:
            resources_check.check()
            snaps = resources_check.pop_snapshots()
            if snaps:
                has_resource = True
                res_value = {
                    'snaps': snaps,
                    'format_version': resources_check.get_format_version()
                }
                res_format = resources_check.describe_format_if_needed()
                if res_format is not None:
                    res_value['format_description'] = res_format
                payload['resources'][
                    resources_check.RESOURCE_KEY] = res_value
        if has_resource:
            payload['resources']['meta'] = {
                'api_key': self.agentConfig['api_key'],
                'host': payload['internalHostname'],
            }
    # newer-style checks (not checks.d style)
    for metrics_check in self._metrics_checks:
        res = metrics_check.check(self.agentConfig)
        if res:
            metrics.extend(res)
    # checks.d checks
    check_statuses = []
    for check in self.initialized_checks_d:
        if not self.continue_running:
            return
        log.info("Running check %s" % check.name)
        instance_statuses = []
        metric_count = 0
        event_count = 0
        service_check_count = 0
        check_start_time = time.time()
        try:
            # Run the check.
            instance_statuses = check.run()
            # Collect the metrics and events.
            current_check_metrics = check.get_metrics()
            current_check_events = check.get_events()
            # Save them for the payload.
            metrics.extend(current_check_metrics)
            if current_check_events:
                if check.name not in events:
                    events[check.name] = current_check_events
                else:
                    events[check.name] += current_check_events
            # Save the status of the check.
            metric_count = len(current_check_metrics)
            event_count = len(current_check_events)
        except Exception:
            log.exception("Error running check %s" % check.name)
        check_status = CheckStatus(
            check.name,
            instance_statuses,
            metric_count,
            event_count,
            service_check_count,
            library_versions=check.get_library_info(),
            source_type_name=check.SOURCE_TYPE_NAME or check.name)
        # Service check for Agent checks failures
        # NOTE(review): `status` is only assigned for STATUS_OK / STATUS_ERROR;
        # presumably those are the only possible values here — confirm.
        service_check_tags = ["check:%s" % check.name]
        if check_status.status == STATUS_OK:
            status = AgentCheck.OK
        elif check_status.status == STATUS_ERROR:
            status = AgentCheck.CRITICAL
        check.service_check('datadog.agent.check_status', status,
                            tags=service_check_tags)
        # Collect the service checks and save them in the payload
        current_check_service_checks = check.get_service_checks()
        if current_check_service_checks:
            service_checks.extend(current_check_service_checks)
        service_check_count = len(current_check_service_checks)
        # Update the check status with the correct service_check_count
        check_status.service_check_count = service_check_count
        check_statuses.append(check_status)
        check_run_time = time.time() - check_start_time
        log.debug("Check %s ran in %.2f s" % (check.name, check_run_time))
        # Intrument check run timings if enabled.
        if self.check_timings:
            metric = 'datadog.agent.check_run_time'
            meta = {'tags': ["check:%s" % check.name]}
            metrics.append((metric, time.time(), check_run_time, meta))
    # Surface checks that failed to even initialize.
    for check_name, info in self.init_failed_checks_d.iteritems():
        if not self.continue_running:
            return
        check_status = CheckStatus(check_name, None, None, None, None,
                                   init_failed_error=info['error'],
                                   init_failed_traceback=info['traceback'])
        check_statuses.append(check_status)
    # Add a service check for the agent
    service_checks.append(
        create_service_check('datadog.agent.up', AgentCheck.OK,
                             hostname=self.hostname))
    # Store the metrics and events in the payload.
    payload['metrics'] = metrics
    payload['events'] = events
    payload['service_checks'] = service_checks
    if self._should_send_additional_data('agent_checks'):
        # Add agent checks statuses and error/warning messages
        agent_checks = []
        for check in check_statuses:
            if check.instance_statuses is not None:
                for instance_status in check.instance_statuses:
                    agent_checks.append((
                        check.name,
                        check.source_type_name,
                        instance_status.instance_id,
                        instance_status.status,
                        # put error message or list of warning messages in the same field
                        # it will be handled by the UI
                        instance_status.error or instance_status.warnings or ""))
            else:
                agent_checks.append(
                    (check.name, check.source_type_name, "initialization",
                     check.status, repr(check.init_failed_error)))
        payload['agent_checks'] = agent_checks
        payload['meta'] = self.metadata_cache  # add hostname metadata
    collect_duration = timer.step()
    # Agent self-metrics run last so they can report on this collection run.
    if self.os != 'windows':
        payload['metrics'].extend(
            self._agent_metrics.check(payload, self.agentConfig,
                                      collect_duration, self.emit_duration,
                                      time.clock() - cpu_clock))
    else:
        payload['metrics'].extend(
            self._agent_metrics.check(payload, self.agentConfig,
                                      collect_duration, self.emit_duration))
    emitter_statuses = self._emit(payload)
    self.emit_duration = timer.step()
    # Persist the status of the collection run.
    try:
        CollectorStatus(check_statuses, emitter_statuses,
                        self.metadata_cache).persist()
    except Exception:
        log.exception("Error persisting collector status")
    # Log at INFO for the first few runs (and periodically after), DEBUG otherwise.
    if self.run_count <= FLUSH_LOGGING_INITIAL or self.run_count % FLUSH_LOGGING_PERIOD == 0:
        log.info("Finished run #%s. Collection time: %ss. Emit time: %ss" %
                 (self.run_count, round(collect_duration, 2),
                  round(self.emit_duration, 2)))
        if self.run_count == FLUSH_LOGGING_INITIAL:
            log.info(
                "First flushes done, next flushes will be logged every %s flushes."
                % FLUSH_LOGGING_PERIOD)
    else:
        log.debug(
            "Finished run #%s. Collection time: %ss. Emit time: %ss" %
            (self.run_count, round(collect_duration, 2),
             round(self.emit_duration, 2)))
    return payload
def run(self, checksd=None, start_event=True):
    """
    Collect data from each check and submit their data.

    Runs the OS-level system checks, the legacy (ganglia/dogstream/
    ddforwarder) checks, the resources checks, and every checks.d check,
    assembles everything into a single payload and emits it.

    :param checksd: optional dict with keys 'initialized_checks' (list of
        AgentCheck instances) and 'init_failed_checks' (dict of
        {check_name: {error, traceback}}); when given, replaces the
        collector's current check sets.
    :param start_event: forwarded to _build_payload; controls whether a
        "start" event is included in the payload.
    :return: the emitted payload dict, or None when self.continue_running
        was cleared mid-run (early abort).
    """
    timer = Timer()
    if self.os != 'windows':
        # CPU clock is only sampled on Unix; used further down for the
        # agent's own performance metrics.
        cpu_clock = time.clock()
    self.run_count += 1
    log.debug("Starting collection run #%s" % self.run_count)

    if checksd:
        self.initialized_checks_d = checksd['initialized_checks']  # is a list of AgentCheck instances
        self.init_failed_checks_d = checksd['init_failed_checks']  # is of type {check_name: {error, traceback}}

    # Find the AgentMetrics check and pop it out
    # This check must run at the end of the loop to collect info on agent performance
    if not self._agent_metrics:
        for check in self.initialized_checks_d:
            if check.name == AGENT_METRICS_CHECK_NAME:
                self._agent_metrics = check
                self.initialized_checks_d.remove(check)
                break

    payload = self._build_payload(start_event=start_event)
    metrics = payload['metrics']
    events = payload['events']
    service_checks = payload['service_checks']

    # Run the system checks. Checks will depend on the OS
    if self.os == 'windows':
        # Win32 system checks
        try:
            metrics.extend(self._win32_system_checks['disk'].check(self.agentConfig))
            metrics.extend(self._win32_system_checks['memory'].check(self.agentConfig))
            metrics.extend(self._win32_system_checks['cpu'].check(self.agentConfig))
            metrics.extend(self._win32_system_checks['network'].check(self.agentConfig))
            metrics.extend(self._win32_system_checks['io'].check(self.agentConfig))
            metrics.extend(self._win32_system_checks['proc'].check(self.agentConfig))
        except Exception:
            log.exception('Unable to fetch Windows system metrics.')
    else:
        # Unix system checks
        sys_checks = self._unix_system_checks

        diskUsage = sys_checks['disk'].check(self.agentConfig)
        if diskUsage and len(diskUsage) == 2:
            payload["diskUsage"] = diskUsage[0]
            payload["inodes"] = diskUsage[1]

        load = sys_checks['load'].check(self.agentConfig)
        payload.update(load)

        memory = sys_checks['memory'].check(self.agentConfig)
        if memory:
            # Flatten the memory dict into the top-level payload keys the
            # backend expects.
            payload.update({
                'memPhysUsed' : memory.get('physUsed'),
                'memPhysPctUsable' : memory.get('physPctUsable'),
                'memPhysFree' : memory.get('physFree'),
                'memPhysTotal' : memory.get('physTotal'),
                'memPhysUsable' : memory.get('physUsable'),
                'memSwapUsed' : memory.get('swapUsed'),
                'memSwapFree' : memory.get('swapFree'),
                'memSwapPctFree' : memory.get('swapPctFree'),
                'memSwapTotal' : memory.get('swapTotal'),
                'memCached' : memory.get('physCached'),
                'memBuffers': memory.get('physBuffers'),
                'memShared': memory.get('physShared')
            })

        ioStats = sys_checks['io'].check(self.agentConfig)
        if ioStats:
            payload['ioStats'] = ioStats

        processes = sys_checks['processes'].check(self.agentConfig)
        payload.update({'processes': processes})

        cpuStats = sys_checks['cpu'].check(self.agentConfig)
        if cpuStats:
            payload.update(cpuStats)

    # Run old-style checks
    gangliaData = self._ganglia.check(self.agentConfig)
    dogstreamData = self._dogstream.check(self.agentConfig)
    ddforwarderData = self._ddforwarder.check(self.agentConfig)

    if gangliaData is not False and gangliaData is not None:
        payload['ganglia'] = gangliaData

    # dogstream
    if dogstreamData:
        dogstreamEvents = dogstreamData.get('dogstreamEvents', None)
        if dogstreamEvents:
            if 'dogstream' in payload['events']:
                events['dogstream'].extend(dogstreamEvents)
            else:
                events['dogstream'] = dogstreamEvents
            del dogstreamData['dogstreamEvents']
        payload.update(dogstreamData)

    # metrics about the forwarder
    if ddforwarderData:
        payload['datadog'] = ddforwarderData

    # Resources checks
    if self.os != 'windows':
        has_resource = False
        for resources_check in self._resources_checks:
            resources_check.check()
            snaps = resources_check.pop_snapshots()
            if snaps:
                has_resource = True
                res_value = {'snaps': snaps,
                             'format_version': resources_check.get_format_version()}
                res_format = resources_check.describe_format_if_needed()
                if res_format is not None:
                    res_value['format_description'] = res_format
                payload['resources'][resources_check.RESOURCE_KEY] = res_value

        if has_resource:
            payload['resources']['meta'] = {
                'api_key': self.agentConfig['api_key'],
                'host': payload['internalHostname'],
            }

    # newer-style checks (not checks.d style)
    for metrics_check in self._metrics_checks:
        res = metrics_check.check(self.agentConfig)
        if res:
            metrics.extend(res)

    # checks.d checks
    check_statuses = []
    for check in self.initialized_checks_d:
        if not self.continue_running:
            # Abort mid-run: returns None, caller must tolerate that.
            return
        log.info("Running check %s" % check.name)
        instance_statuses = []
        metric_count = 0
        event_count = 0
        service_check_count = 0
        check_start_time = time.time()
        check_stats = None

        try:
            # Run the check.
            instance_statuses = check.run()

            # Collect the metrics and events.
            current_check_metrics = check.get_metrics()
            current_check_events = check.get_events()
            check_stats = check._get_internal_profiling_stats()

            # Save them for the payload.
            metrics.extend(current_check_metrics)
            if current_check_events:
                if check.name not in events:
                    events[check.name] = current_check_events
                else:
                    events[check.name] += current_check_events

            # Save the status of the check.
            metric_count = len(current_check_metrics)
            event_count = len(current_check_events)
        except Exception:
            # A failing check must not abort the whole run.
            log.exception("Error running check %s" % check.name)

        check_status = CheckStatus(
            check.name, instance_statuses, metric_count,
            event_count, service_check_count,
            library_versions=check.get_library_info(),
            source_type_name=check.SOURCE_TYPE_NAME or check.name,
            check_stats=check_stats
        )

        # Service check for Agent checks failures
        service_check_tags = ["check:%s" % check.name]
        if check_status.status == STATUS_OK:
            status = AgentCheck.OK
        elif check_status.status == STATUS_ERROR:
            status = AgentCheck.CRITICAL
        # NOTE(review): if check_status.status is anything other than
        # STATUS_OK/STATUS_ERROR, `status` here is unbound (NameError) --
        # confirm those are the only possible values.
        check.service_check('datadog.agent.check_status', status, tags=service_check_tags)

        # Collect the service checks and save them in the payload
        current_check_service_checks = check.get_service_checks()
        if current_check_service_checks:
            service_checks.extend(current_check_service_checks)
        service_check_count = len(current_check_service_checks)

        # Update the check status with the correct service_check_count
        check_status.service_check_count = service_check_count
        check_statuses.append(check_status)

        check_run_time = time.time() - check_start_time
        log.debug("Check %s ran in %.2f s" % (check.name, check_run_time))

        # Intrument check run timings if enabled.
        if self.check_timings:
            metric = 'datadog.agent.check_run_time'
            meta = {'tags': ["check:%s" % check.name]}
            metrics.append((metric, time.time(), check_run_time, meta))

    # Record checks that failed to initialize so their errors surface in
    # the collector status.
    for check_name, info in self.init_failed_checks_d.iteritems():
        if not self.continue_running:
            return
        check_status = CheckStatus(check_name, None, None, None, None,
                                   init_failed_error=info['error'],
                                   init_failed_traceback=info['traceback'])
        check_statuses.append(check_status)

    # Add a service check for the agent
    service_checks.append(create_service_check('datadog.agent.up', AgentCheck.OK,
                                               hostname=self.hostname))

    # Store the metrics and events in the payload.
    payload['metrics'] = metrics
    payload['events'] = events
    payload['service_checks'] = service_checks

    if self._should_send_additional_data('agent_checks'):
        # Add agent checks statuses and error/warning messages
        agent_checks = []
        for check in check_statuses:
            if check.instance_statuses is not None:
                for instance_status in check.instance_statuses:
                    agent_checks.append(
                        (
                            check.name, check.source_type_name,
                            instance_status.instance_id,
                            instance_status.status,
                            # put error message or list of warning messages in the same field
                            # it will be handled by the UI
                            instance_status.error or instance_status.warnings or ""
                        )
                    )
            else:
                agent_checks.append(
                    (
                        check.name, check.source_type_name,
                        "initialization",
                        check.status, repr(check.init_failed_error)
                    )
                )
        payload['agent_checks'] = agent_checks
        payload['meta'] = self.metadata_cache  # add hostname metadata

    collect_duration = timer.step()

    # Run the agent-metrics check last so it can report on this very run.
    if self.os != 'windows':
        if self._agent_metrics is not None:
            self._agent_metrics.set_metric_context(payload, {
                'collection_time': collect_duration,
                'emit_time': self.emit_duration,
                'cpu_time': time.clock() - cpu_clock
            })
            self._agent_metrics.run()
            agent_stats = self._agent_metrics.get_metrics()
            payload['metrics'].extend(agent_stats)
            # Dump the metrics to log when in developer mode
            if self.agentConfig.get('developer_mode', False):
                log.info("\n AGENT STATS: \n {0}".format(Collector._stats_for_display(agent_stats)))
    else:
        if self._agent_metrics is not None:
            # No cpu_time on Windows (cpu_clock is not sampled there).
            self._agent_metrics.set_metric_context(payload, {
                'collection_time': collect_duration,
                'emit_time': self.emit_duration,
            })
            self._agent_metrics.run()
            agent_stats = self._agent_metrics.get_metrics()
            payload['metrics'].extend(agent_stats)
            # Dump the metrics to log when in developer mode
            if self.agentConfig.get('developer_mode', False):
                log.info("\n AGENT STATS: \n {0}".format(Collector._stats_for_display(agent_stats)))

    # Let's send our payload
    emitter_statuses = self._emit(payload)
    self.emit_duration = timer.step()

    # Persist the status of the collection run.
    try:
        CollectorStatus(check_statuses, emitter_statuses,
                        self.metadata_cache).persist()
    except Exception:
        log.exception("Error persisting collector status")

    # Log verbosely for the first few runs, then only periodically.
    if self.run_count <= FLUSH_LOGGING_INITIAL or self.run_count % FLUSH_LOGGING_PERIOD == 0:
        log.info("Finished run #%s. Collection time: %ss. Emit time: %ss" %
                 (self.run_count, round(collect_duration, 2), round(self.emit_duration, 2)))
        if self.run_count == FLUSH_LOGGING_INITIAL:
            log.info("First flushes done, next flushes will be logged every %s flushes." % FLUSH_LOGGING_PERIOD)
    else:
        log.debug("Finished run #%s. Collection time: %ss. Emit time: %ss" %
                  (self.run_count, round(collect_duration, 2), round(self.emit_duration, 2)))

    return payload
def run(self, payload):
    """Repeatedly execute this script check and emit its results.

    Loops until ``self.continue_running`` goes false. Each iteration works
    on a deep copy of *payload* (so the template payload is never mutated),
    runs the script at ``self.path``, parses its stdout into metrics and
    events, emits the filled-in copy through the configured emitters, and
    then sleeps ``self.interval`` seconds.

    :param payload: template payload; its deep copies must support
        dict-style access to ``"metrics"`` / ``"events"`` and an
        ``emit(log, config, emitters, continue_running)`` method.
    :return: None (runs until stopped).
    """
    log.info("Calling %s" % self.path)
    while self.continue_running:
        # Per-iteration timer: the first step() measures collection,
        # the second measures emission.
        timer = Timer()
        payload_temp = deepcopy(payload)
        try:
            self._standard_parser(self.path)
            # stderr is always an empty string in test runs, whether the
            # script succeeds or fails; the variable is kept for future use.
            stdout_data, stderr_data = self._get_value()
            for i in stdout_data:
                if 'metric' in i:
                    self._parse_metric(i)
                elif 'event' in i:
                    self._parse_events(i)
        except IOError as e:
            # Script file could not be read.
            io_err_event = self._format_event(
                str(e), "Reading script file failed", "error",
                "path:%s" % self.path)
            self.events.append(io_err_event)
        except OSError as e:
            # Script file could not be executed.
            os_err_event = self._format_event(
                str(e), "Executing script file failed", "error",
                "path:%s" % self.path)
            self.events.append(os_err_event)
        except KeyError as e:
            # Script output lacked an expected field.
            key_err_event = self._format_event(
                str(e),
                "Output of script file missing or misspelled, or unsupported script file",
                "error", "path:%s" % self.path)
            self.events.append(key_err_event)
        except Exception as e:
            # Catch-all so one bad run never kills the loop.
            common_err_event = self._format_event(
                str(e), "Uncatergorized error when calling script file",
                "error", "path:%s" % self.path)
            self.events.append(common_err_event)
        finally:
            # Always flush and emit, even after an error, so failures are
            # reported server-side as events.
            payload_temp["metrics"].extend(self.aggregator.flush())
            payload_temp['events'][self.path] = self.events
            if not payload_temp["metrics"] and not payload_temp['events'][self.path]:
                # Nothing at all came out of the script: report that as an
                # event so the run is still visible.
                no_out_event = self._format_event(
                    "There is no output when executing this script",
                    "Uncatergorized error when calling script file",
                    "error", "path:%s" % self.path)
                payload_temp['events'][self.path] = no_out_event
            self.events = []
            collect_duration = timer.step()
            payload_temp.emit(log, self.agent_config, self.emitters,
                              self.continue_running)
            emit_duration = timer.step()
            # Log run status: verbose for the first few runs, then periodic.
            # NOTE(review): self.run_count is never incremented inside this
            # loop; presumably something else updates it -- confirm.
            if self.run_count <= FLUSH_LOGGING_INITIAL or self.run_count % FLUSH_LOGGING_PERIOD == 0:
                log.info(
                    "Script: %s. Finished run #%s. Collection time: %ss. Emit time: %ss"
                    % (self.path, self.run_count,
                       round(collect_duration, 2), round(emit_duration, 2)))
                if self.run_count == FLUSH_LOGGING_INITIAL:
                    # BUG FIX: "%" binds tighter than ","; the original
                    # `... % self.path, FLUSH_LOGGING_PERIOD` gave the
                    # two-placeholder format string a single argument
                    # (TypeError) and passed FLUSH_LOGGING_PERIOD as a
                    # spurious extra argument to log.info.
                    log.info(
                        "Script: %s. First flushes done, next flushes will be logged every %s flushes."
                        % (self.path, FLUSH_LOGGING_PERIOD))
            else:
                log.debug(
                    "Script: %s. Finished run #%s. Collection time: %ss. Emit time: %ss"
                    % (self.path, self.run_count,
                       round(collect_duration, 2), round(emit_duration, 2)))
            time.sleep(self.interval)
def run(self, checksd=None, start_event=True, configs_reloaded=False):
    """
    Collect data from each check and submit their data.

    Builds an AgentPayload, runs the OS system checks, the legacy
    (ganglia/dogstream/ddforwarder) checks, the resources checks and every
    checks.d check, then emits the payload and persists the run status.

    :param checksd: optional dict with 'initialized_checks' (list of
        AgentCheck instances) and 'init_failed_checks'
        ({check_name: {error, traceback}}).
    :param start_event: forwarded to _populate_payload_metadata.
    :param configs_reloaded: when True, re-locate the AgentMetrics check in
        the (possibly rebuilt) check list.
    :return: the emitted AgentPayload, or None on early abort when
        self.continue_running is cleared.
    """
    log.debug("Found {num_checks} checks".format(num_checks=len(checksd["initialized_checks"])))
    timer = Timer()
    if not Platform.is_windows():
        # CPU clock only sampled on Unix; consumed by the agent-metrics
        # check at the end of the run.
        cpu_clock = time.clock()
    self.run_count += 1
    log.debug("Starting collection run #%s" % self.run_count)

    if checksd:
        self.initialized_checks_d = checksd["initialized_checks"]  # is a list of AgentCheck instances
        self.init_failed_checks_d = checksd["init_failed_checks"]  # is of type {check_name: {error, traceback}}

    payload = AgentPayload()

    # Find the AgentMetrics check and pop it out
    # This check must run at the end of the loop to collect info on agent performance
    if not self._agent_metrics or configs_reloaded:
        for check in self.initialized_checks_d:
            if check.name == AGENT_METRICS_CHECK_NAME:
                self._agent_metrics = check
                self.initialized_checks_d.remove(check)
                break

    # Initialize payload
    self._build_payload(payload)

    metrics = payload["metrics"]
    events = payload["events"]
    service_checks = payload["service_checks"]

    # Run the system checks. Checks will depend on the OS
    if Platform.is_windows():
        # Win32 system checks
        try:
            metrics.extend(self._win32_system_checks["memory"].check(self.agentConfig))
            metrics.extend(self._win32_system_checks["cpu"].check(self.agentConfig))
            metrics.extend(self._win32_system_checks["network"].check(self.agentConfig))
            metrics.extend(self._win32_system_checks["io"].check(self.agentConfig))
            metrics.extend(self._win32_system_checks["proc"].check(self.agentConfig))
        except Exception:
            log.exception("Unable to fetch Windows system metrics.")
    else:
        # Unix system checks
        sys_checks = self._unix_system_checks

        load = sys_checks["load"].check(self.agentConfig)
        payload.update(load)

        system = sys_checks["system"].check(self.agentConfig)
        payload.update(system)

        memory = sys_checks["memory"].check(self.agentConfig)
        if memory:
            # Flatten the memory dict into the top-level payload keys the
            # backend expects.
            memstats = {
                "memPhysUsed": memory.get("physUsed"),
                "memPhysPctUsable": memory.get("physPctUsable"),
                "memPhysFree": memory.get("physFree"),
                "memPhysTotal": memory.get("physTotal"),
                "memPhysUsable": memory.get("physUsable"),
                "memSwapUsed": memory.get("swapUsed"),
                "memSwapFree": memory.get("swapFree"),
                "memSwapPctFree": memory.get("swapPctFree"),
                "memSwapTotal": memory.get("swapTotal"),
                "memCached": memory.get("physCached"),
                "memBuffers": memory.get("physBuffers"),
                "memShared": memory.get("physShared"),
            }
            payload.update(memstats)

        ioStats = sys_checks["io"].check(self.agentConfig)
        if ioStats:
            payload["ioStats"] = ioStats

        processes = sys_checks["processes"].check(self.agentConfig)
        payload.update({"processes": processes})

        cpuStats = sys_checks["cpu"].check(self.agentConfig)
        if cpuStats:
            payload.update(cpuStats)

    # Run old-style checks
    gangliaData = self._ganglia.check(self.agentConfig)
    dogstreamData = self._dogstream.check(self.agentConfig)
    ddforwarderData = self._ddforwarder.check(self.agentConfig)

    if gangliaData is not False and gangliaData is not None:
        payload["ganglia"] = gangliaData

    # dogstream
    if dogstreamData:
        dogstreamEvents = dogstreamData.get("dogstreamEvents", None)
        if dogstreamEvents:
            if "dogstream" in payload["events"]:
                events["dogstream"].extend(dogstreamEvents)
            else:
                events["dogstream"] = dogstreamEvents
            del dogstreamData["dogstreamEvents"]
        payload.update(dogstreamData)

    # metrics about the forwarder
    if ddforwarderData:
        payload["datadog"] = ddforwarderData

    # Resources checks
    if not Platform.is_windows():
        has_resource = False
        for resources_check in self._resources_checks:
            try:
                resources_check.check()
                snaps = resources_check.pop_snapshots()
                if snaps:
                    has_resource = True
                    res_value = {"snaps": snaps, "format_version": resources_check.get_format_version()}
                    res_format = resources_check.describe_format_if_needed()
                    if res_format is not None:
                        res_value["format_description"] = res_format
                    payload["resources"][resources_check.RESOURCE_KEY] = res_value
            except Exception:
                log.exception("Error running resource check %s" % resources_check.RESOURCE_KEY)

        if has_resource:
            payload["resources"]["meta"] = {
                "api_key": self.agentConfig["api_key"],
                "host": payload["internalHostname"],
            }

    # newer-style checks (not checks.d style)
    for metrics_check in self._metrics_checks:
        res = metrics_check.check(self.agentConfig)
        if res:
            metrics.extend(res)

    # checks.d checks
    check_statuses = []
    for check in self.initialized_checks_d:
        if not self.continue_running:
            # Abort mid-run: returns None, caller must tolerate that.
            return
        log.info("Running check %s" % check.name)
        instance_statuses = []
        metric_count = 0
        event_count = 0
        service_check_count = 0
        check_start_time = time.time()
        check_stats = None

        try:
            # Run the check.
            instance_statuses = check.run()

            # Collect the metrics and events.
            current_check_metrics = check.get_metrics()
            current_check_events = check.get_events()
            check_stats = check._get_internal_profiling_stats()

            # Collect metadata
            current_check_metadata = check.get_service_metadata()

            # Save metrics & events for the payload.
            metrics.extend(current_check_metrics)
            if current_check_events:
                if check.name not in events:
                    events[check.name] = current_check_events
                else:
                    events[check.name] += current_check_events

            # Save the status of the check.
            metric_count = len(current_check_metrics)
            event_count = len(current_check_events)
        except Exception:
            # A failing check must not abort the whole run.
            log.exception("Error running check %s" % check.name)

        # NOTE(review): if check.run() raised above, current_check_metadata
        # was never assigned and the next statement raises NameError on the
        # first failing check of the process -- confirm and initialize it
        # before the try block if so.
        check_status = CheckStatus(
            check.name,
            instance_statuses,
            metric_count,
            event_count,
            service_check_count,
            service_metadata=current_check_metadata,
            library_versions=check.get_library_info(),
            source_type_name=check.SOURCE_TYPE_NAME or check.name,
            check_stats=check_stats,
        )

        # Service check for Agent checks failures
        service_check_tags = ["check:%s" % check.name]
        if check_status.status == STATUS_OK:
            status = AgentCheck.OK
        elif check_status.status == STATUS_ERROR:
            status = AgentCheck.CRITICAL
        # NOTE(review): `status` is unbound if check_status.status is neither
        # STATUS_OK nor STATUS_ERROR -- confirm those are the only values.
        check.service_check("datadog.agent.check_status", status, tags=service_check_tags)

        # Collect the service checks and save them in the payload
        current_check_service_checks = check.get_service_checks()
        if current_check_service_checks:
            service_checks.extend(current_check_service_checks)
        service_check_count = len(current_check_service_checks)

        # Update the check status with the correct service_check_count
        check_status.service_check_count = service_check_count
        check_statuses.append(check_status)

        check_run_time = time.time() - check_start_time
        log.debug("Check %s ran in %.2f s" % (check.name, check_run_time))

        # Intrument check run timings if enabled.
        if self.check_timings:
            metric = "datadog.agent.check_run_time"
            meta = {"tags": ["check:%s" % check.name]}
            metrics.append((metric, time.time(), check_run_time, meta))

    # Record checks that failed to initialize so their errors surface in
    # the collector status.
    for check_name, info in self.init_failed_checks_d.iteritems():
        if not self.continue_running:
            return
        check_status = CheckStatus(
            check_name,
            None,
            None,
            None,
            None,
            init_failed_error=info["error"],
            init_failed_traceback=info["traceback"],
        )
        check_statuses.append(check_status)

    # Add a service check for the agent
    service_checks.append(create_service_check("datadog.agent.up", AgentCheck.OK, hostname=self.hostname))

    # Store the metrics and events in the payload.
    payload["metrics"] = metrics
    payload["events"] = events
    payload["service_checks"] = service_checks

    # Populate metadata
    self._populate_payload_metadata(payload, check_statuses, start_event)

    collect_duration = timer.step()

    # Run the agent-metrics check last so it can report on this very run.
    if self._agent_metrics:
        metric_context = {"collection_time": collect_duration, "emit_time": self.emit_duration}
        if not Platform.is_windows():
            metric_context["cpu_time"] = time.clock() - cpu_clock
        self._agent_metrics.set_metric_context(payload, metric_context)
        self._agent_metrics.run()
        agent_stats = self._agent_metrics.get_metrics()
        payload["metrics"].extend(agent_stats)
        # Dump the metrics to log when in developer mode
        if self.agentConfig.get("developer_mode"):
            log.debug("\n Agent developer mode stats: \n {0}".format(Collector._stats_for_display(agent_stats)))

    # Let's send our payload
    emitter_statuses = payload.emit(log, self.agentConfig, self.emitters, self.continue_running)
    self.emit_duration = timer.step()

    # Persist the status of the collection run.
    try:
        CollectorStatus(check_statuses, emitter_statuses, self.hostname_metadata_cache).persist()
    except Exception:
        log.exception("Error persisting collector status")

    # Log verbosely for the first few runs, then only periodically.
    if self.run_count <= FLUSH_LOGGING_INITIAL or self.run_count % FLUSH_LOGGING_PERIOD == 0:
        log.info(
            "Finished run #%s. Collection time: %ss. Emit time: %ss"
            % (self.run_count, round(collect_duration, 2), round(self.emit_duration, 2))
        )
        if self.run_count == FLUSH_LOGGING_INITIAL:
            log.info("First flushes done, next flushes will be logged every %s flushes." % FLUSH_LOGGING_PERIOD)
    else:
        log.debug(
            "Finished run #%s. Collection time: %ss. Emit time: %ss"
            % (self.run_count, round(collect_duration, 2), round(self.emit_duration, 2))
        )

    return payload
def run(self, checksd=None):
    """
    Collect data from each check and submit their data.

    Oldest collector variant: runs the OS system checks, the old-style
    service checks (mysql, rabbitmq, mongodb, couchdb, ganglia, cassandra,
    dogstream, ddforwarder), event checks, resources checks and the
    checks.d checks, then hands the payload to every emitter directly.

    :param checksd: optional list of dicts, each with 'class' (check
        instance), 'name', and 'instances' (list of instance configs).
    :return: None (the payload is emitted, not returned).
    """
    timer = Timer()
    self.run_count += 1
    logger.info("Starting collection run #%s" % self.run_count)

    payload = self._build_payload()
    metrics = payload['metrics']
    events = payload['events']

    # Run the system checks. Checks will depend on the OS
    if self.os == 'windows':
        # Win32 system checks
        # NOTE(review): unlike the Unix branch, these calls are not wrapped
        # in try/except -- a failing Windows system check aborts the run.
        metrics.extend(self._win32_system_checks['disk'].check(
            self.agentConfig))
        metrics.extend(self._win32_system_checks['memory'].check(
            self.agentConfig))
        metrics.extend(self._win32_system_checks['cpu'].check(
            self.agentConfig))
        metrics.extend(self._win32_system_checks['network'].check(
            self.agentConfig))
        metrics.extend(self._win32_system_checks['io'].check(
            self.agentConfig))
        metrics.extend(self._win32_system_checks['proc'].check(
            self.agentConfig))
    else:
        # Unix system checks
        sys_checks = self._unix_system_checks

        diskUsage = sys_checks['disk'].check(self.agentConfig)
        if diskUsage and len(diskUsage) == 2:
            payload["diskUsage"] = diskUsage[0]
            payload["inodes"] = diskUsage[1]

        load = sys_checks['load'].check(self.agentConfig)
        payload.update(load)

        # Flatten the memory dict into the top-level payload keys.
        memory = sys_checks['memory'].check(self.agentConfig)
        payload.update({
            'memPhysUsed': memory.get('physUsed'),
            'memPhysFree': memory.get('physFree'),
            'memPhysTotal': memory.get('physTotal'),
            'memPhysUsable': memory.get('physUsable'),
            'memSwapUsed': memory.get('swapUsed'),
            'memSwapFree': memory.get('swapFree'),
            'memSwapTotal': memory.get('swapTotal'),
            'memCached': memory.get('physCached'),
            'memBuffers': memory.get('physBuffers'),
            'memShared': memory.get('physShared')
        })

        # io/processes checks take an explicit logger in this version.
        ioStats = sys_checks['io'].check(checks_logger, self.agentConfig)
        if ioStats:
            payload['ioStats'] = ioStats

        processes = sys_checks['processes'].check(checks_logger,
                                                  self.agentConfig)
        payload.update({'processes': processes})

        networkTraffic = sys_checks['network'].check(self.agentConfig)
        payload.update({'networkTraffic': networkTraffic})

        cpuStats = sys_checks['cpu'].check(self.agentConfig)
        if cpuStats:
            payload.update(cpuStats)

    # Run old-style checks
    mysqlStatus = self._mysql.check(self.agentConfig)
    rabbitmq = self._rabbitmq.check(checks_logger, self.agentConfig)
    mongodb = self._mongodb.check(self.agentConfig)
    couchdb = self._couchdb.check(self.agentConfig)
    gangliaData = self._ganglia.check(self.agentConfig)
    cassandraData = self._cassandra.check(checks_logger, self.agentConfig)
    dogstreamData = self._dogstream.check(self.agentConfig)
    ddforwarderData = self._ddforwarder.check(self.agentConfig)

    if gangliaData is not False and gangliaData is not None:
        payload['ganglia'] = gangliaData

    if cassandraData is not False and cassandraData is not None:
        payload['cassandra'] = cassandraData

    # MySQL Status
    if mysqlStatus:
        payload.update(mysqlStatus)

    # RabbitMQ
    if rabbitmq:
        payload['rabbitMQ'] = rabbitmq

    # MongoDB
    if mongodb:
        if mongodb.has_key('events'):
            events['Mongo'] = mongodb['events']['Mongo']
            del mongodb['events']
        payload['mongoDB'] = mongodb

    # CouchDB
    if couchdb:
        payload['couchDB'] = couchdb

    # dogstream
    if dogstreamData:
        dogstreamEvents = dogstreamData.get('dogstreamEvents', None)
        if dogstreamEvents:
            if 'dogstream' in payload['events']:
                events['dogstream'].extend(dogstreamEvents)
            else:
                events['dogstream'] = dogstreamEvents
            del dogstreamData['dogstreamEvents']
        payload.update(dogstreamData)

    # metrics about the forwarder
    if ddforwarderData:
        payload['datadog'] = ddforwarderData

    # Process the event checks.
    for event_check in self._event_checks:
        event_data = event_check.check(checks_logger, self.agentConfig)
        if event_data:
            events[event_check.key] = event_data

    # Resources checks
    if self.os != 'windows':
        has_resource = False
        for resources_check in self._resources_checks:
            resources_check.check()
            snaps = resources_check.pop_snapshots()
            if snaps:
                has_resource = True
                res_value = {
                    'snaps': snaps,
                    'format_version': resources_check.get_format_version()
                }
                res_format = resources_check.describe_format_if_needed()
                if res_format is not None:
                    res_value['format_description'] = res_format
                payload['resources'][
                    resources_check.RESOURCE_KEY] = res_value

        if has_resource:
            payload['resources']['meta'] = {
                'api_key': self.agentConfig['api_key'],
                'host': payload['internalHostname'],
            }

    # newer-style checks (not checks.d style)
    for metrics_check in self._metrics_checks:
        res = metrics_check.check(self.agentConfig)
        if res:
            metrics.extend(res)

    # checks.d checks
    checksd = checksd or []
    for check in checksd:
        check_cls = check['class']
        for instance in check['instances']:
            try:
                # Run the check for each configuration
                check_cls.check(instance)
                metrics.extend(check_cls.get_metrics())
                if check_cls.has_events():
                    if check['name'] not in events:
                        events[check['name']] = []
                    for ev in check_cls.get_events():
                        events[check['name']].append(ev)
            except Exception:
                # One failing instance must not abort the others.
                logger.exception("Check %s failed" % check_cls.name)

    # Store the metrics and events in the payload.
    payload['metrics'] = metrics
    payload['events'] = events

    collect_duration = timer.step()

    # Pass the payload along to the emitters.
    for emitter in self.emitters:
        emitter(payload, checks_logger, self.agentConfig)

    emit_duration = timer.step()

    logger.info("Finished run #%s. Collection time: %ss. Emit time: %ss" %
                (self.run_count, round(collect_duration, 2),
                 round(emit_duration, 2)))
def run(self, checksd=None, start_event=True):
    """
    Collect data from each check and submit their data.

    Server Density variant: builds an AgentPayload, runs the OS system
    checks plus the SD identifier/SDv1 plugin checks, the ganglia check,
    resources checks and every checks.d check, then emits the payload and
    persists the run status.

    :param checksd: optional dict with 'initialized_checks' (list of
        AgentCheck instances) and 'init_failed_checks'
        ({check_name: {error, traceback}}).
    :param start_event: forwarded to _populate_payload_metadata.
    :return: the emitted AgentPayload, or None on early abort when
        self.continue_running is cleared.
    """
    timer = Timer()
    if self.os != 'windows':
        # CPU clock only sampled on Unix; consumed by the agent-metrics
        # check at the end of the run.
        cpu_clock = time.clock()
    self.run_count += 1
    log.debug("Starting collection run #%s" % self.run_count)

    if checksd:
        self.initialized_checks_d = checksd['initialized_checks']  # is a list of AgentCheck instances
        self.init_failed_checks_d = checksd['init_failed_checks']  # is of type {check_name: {error, traceback}}

    payload = AgentPayload()

    # Find the AgentMetrics check and pop it out
    # This check must run at the end of the loop to collect info on agent performance
    if not self._agent_metrics:
        for check in self.initialized_checks_d:
            if check.name == AGENT_METRICS_CHECK_NAME:
                self._agent_metrics = check
                self.initialized_checks_d.remove(check)
                break

    # Initialize payload
    self._build_payload(payload)

    metrics = payload['metrics']
    events = payload['events']
    service_checks = payload['service_checks']

    # Run the system checks. Checks will depend on the OS
    if self.os == 'windows':
        # Win32 system checks
        try:
            metrics.extend(self._win32_system_checks['memory'].check(self.agentConfig))
            metrics.extend(self._win32_system_checks['cpu'].check(self.agentConfig))
            metrics.extend(self._win32_system_checks['network'].check(self.agentConfig))
            metrics.extend(self._win32_system_checks['io'].check(self.agentConfig))
            metrics.extend(self._win32_system_checks['proc'].check(self.agentConfig))
        except Exception:
            log.exception('Unable to fetch Windows system metrics.')
    else:
        # Server Density-specific checks: machine identifier + SDv1 plugins.
        sd_checks = self._server_density_checks
        identifier = sd_checks['identifier'].check(self.agentConfig)
        payload.update(identifier)

        # SDv1 plugins
        pluginsData = sd_checks['plugins'].check(self.agentConfig)
        if pluginsData:
            payload['plugins'] = pluginsData

        # Unix system checks
        sys_checks = self._unix_system_checks

        load = sys_checks['load'].check(self.agentConfig)
        payload.update(load)

        system = sys_checks['system'].check(self.agentConfig)
        payload.update(system)

        memory = sys_checks['memory'].check(self.agentConfig)
        if memory:
            # Flatten the memory dict into the top-level payload keys the
            # backend expects.
            payload.update({
                'memPhysUsed': memory.get('physUsed'),
                'memPhysPctUsable': memory.get('physPctUsable'),
                'memPhysFree': memory.get('physFree'),
                'memPhysTotal': memory.get('physTotal'),
                'memPhysUsable': memory.get('physUsable'),
                'memSwapUsed': memory.get('swapUsed'),
                'memSwapFree': memory.get('swapFree'),
                'memSwapPctFree': memory.get('swapPctFree'),
                'memSwapTotal': memory.get('swapTotal'),
                'memCached': memory.get('physCached'),
                'memBuffers': memory.get('physBuffers'),
                'memShared': memory.get('physShared')
            })

        ioStats = sys_checks['io'].check(self.agentConfig)
        if ioStats:
            payload['ioStats'] = ioStats

        processes = sys_checks['processes'].check(self.agentConfig)
        payload.update({'processes': processes})

        cpuStats = sys_checks['cpu'].check(self.agentConfig)
        if cpuStats:
            payload.update(cpuStats)

    # Run old-style checks
    gangliaData = self._ganglia.check(self.agentConfig)

    if gangliaData is not False and gangliaData is not None:
        payload['ganglia'] = gangliaData

    # Resources checks
    if self.os != 'windows':
        has_resource = False
        for resources_check in self._resources_checks:
            resources_check.check()
            snaps = resources_check.pop_snapshots()
            if snaps:
                has_resource = True
                res_value = {
                    'snaps': snaps,
                    'format_version': resources_check.get_format_version()
                }
                res_format = resources_check.describe_format_if_needed()
                if res_format is not None:
                    res_value['format_description'] = res_format
                payload['resources'][resources_check.RESOURCE_KEY] = res_value

        if has_resource:
            payload['resources']['meta'] = {
                'agent_key': self.agentConfig['agent_key'],
                'host': payload['internalHostname'],
            }

    # newer-style checks (not checks.d style)
    for metrics_check in self._metrics_checks:
        res = metrics_check.check(self.agentConfig)
        if res:
            metrics.extend(res)

    # checks.d checks
    check_statuses = []
    for check in self.initialized_checks_d:
        if not self.continue_running:
            # Abort mid-run: returns None, caller must tolerate that.
            return
        log.info("Running check %s" % check.name)
        instance_statuses = []
        metric_count = 0
        event_count = 0
        service_check_count = 0
        check_start_time = time.time()
        check_stats = None

        try:
            # Run the check.
            instance_statuses = check.run()

            # Collect the metrics and events.
            current_check_metrics = check.get_metrics()
            current_check_events = check.get_events()
            check_stats = check._get_internal_profiling_stats()

            # Collect metadata
            current_check_metadata = check.get_service_metadata()

            # Save metrics & events for the payload.
            metrics.extend(current_check_metrics)
            if current_check_events:
                if check.name not in events:
                    events[check.name] = current_check_events
                else:
                    events[check.name] += current_check_events

            # Save the status of the check.
            metric_count = len(current_check_metrics)
            event_count = len(current_check_events)
        except Exception:
            # A failing check must not abort the whole run.
            log.exception("Error running check %s" % check.name)

        # NOTE(review): if check.run() raised above, current_check_metadata
        # was never assigned and the next statement raises NameError --
        # confirm and initialize it before the try block if so.
        check_status = CheckStatus(
            check.name, instance_statuses, metric_count,
            event_count, service_check_count,
            service_metadata=current_check_metadata,
            library_versions=check.get_library_info(),
            source_type_name=check.SOURCE_TYPE_NAME or check.name,
            check_stats=check_stats
        )

        # Service check for Agent checks failures
        service_check_tags = ["check:%s" % check.name]
        if check_status.status == STATUS_OK:
            status = AgentCheck.OK
        elif check_status.status == STATUS_ERROR:
            status = AgentCheck.CRITICAL
        # NOTE(review): `status` is unbound if check_status.status is neither
        # STATUS_OK nor STATUS_ERROR -- confirm those are the only values.
        check.service_check('sd.agent.check_status', status, tags=service_check_tags)

        # Collect the service checks and save them in the payload
        current_check_service_checks = check.get_service_checks()
        if current_check_service_checks:
            service_checks.extend(current_check_service_checks)
        service_check_count = len(current_check_service_checks)

        # Update the check status with the correct service_check_count
        check_status.service_check_count = service_check_count
        check_statuses.append(check_status)

        check_run_time = time.time() - check_start_time
        log.debug("Check %s ran in %.2f s" % (check.name, check_run_time))

        # Intrument check run timings if enabled.
        if self.check_timings:
            metric = 'sd.agent.check_run_time'
            meta = {'tags': ["check:%s" % check.name]}
            metrics.append((metric, time.time(), check_run_time, meta))

    # Record checks that failed to initialize so their errors surface in
    # the collector status.
    for check_name, info in self.init_failed_checks_d.iteritems():
        if not self.continue_running:
            return
        check_status = CheckStatus(check_name, None, None, None, None,
                                   init_failed_error=info['error'],
                                   init_failed_traceback=info['traceback'])
        check_statuses.append(check_status)

    # Add a service check for the agent
    service_checks.append(create_service_check('sd.agent.up', AgentCheck.OK,
                                               hostname=self.hostname))

    # Store the metrics and events in the payload.
    payload['metrics'] = metrics
    payload['events'] = events
    payload['service_checks'] = service_checks

    # Populate metadata
    self._populate_payload_metadata(payload, check_statuses, start_event)

    collect_duration = timer.step()

    # Run the agent-metrics check last so it can report on this very run.
    if self.os != 'windows':
        if self._agent_metrics is not None:
            self._agent_metrics.set_metric_context(payload, {
                'collection_time': collect_duration,
                'emit_time': self.emit_duration,
                'cpu_time': time.clock() - cpu_clock
            })
            self._agent_metrics.run()
            agent_stats = self._agent_metrics.get_metrics()
            payload['metrics'].extend(agent_stats)
            # Dump the metrics to log when in developer mode
            if self.agentConfig.get('developer_mode', False):
                log.info("\n AGENT STATS: \n {0}".format(Collector._stats_for_display(agent_stats)))
    else:
        if self._agent_metrics is not None:
            # No cpu_time on Windows (cpu_clock is not sampled there).
            self._agent_metrics.set_metric_context(payload, {
                'collection_time': collect_duration,
                'emit_time': self.emit_duration,
            })
            self._agent_metrics.run()
            agent_stats = self._agent_metrics.get_metrics()
            payload['metrics'].extend(agent_stats)
            # Dump the metrics to log when in developer mode
            if self.agentConfig.get('developer_mode', False):
                log.info("\n AGENT STATS: \n {0}".format(Collector._stats_for_display(agent_stats)))

    # Let's send our payload
    emitter_statuses = payload.emit(log, self.agentConfig, self.emitters,
                                    self.continue_running)
    self.emit_duration = timer.step()

    # Persist the status of the collection run.
    try:
        CollectorStatus(check_statuses, emitter_statuses,
                        self.hostname_metadata_cache).persist()
    except Exception:
        log.exception("Error persisting collector status")

    # Log verbosely for the first few runs, then only periodically.
    if self.run_count <= FLUSH_LOGGING_INITIAL or self.run_count % FLUSH_LOGGING_PERIOD == 0:
        log.info("Finished run #%s. Collection time: %ss. Emit time: %ss" %
                 (self.run_count, round(collect_duration, 2), round(self.emit_duration, 2)))
        if self.run_count == FLUSH_LOGGING_INITIAL:
            log.info("First flushes done, next flushes will be logged every %s flushes." % FLUSH_LOGGING_PERIOD)
    else:
        log.debug("Finished run #%s. Collection time: %ss. Emit time: %ss" %
                  (self.run_count, round(collect_duration, 2), round(self.emit_duration, 2)))

    return payload
def run(self, checksd=None):
    """
    Collect data from each check and submit their data.

    Runs, in order: the per-OS system checks, the old-style service checks
    (MySQL, RabbitMQ, MongoDB, CouchDB, Ganglia, Cassandra, dogstream,
    ddforwarder), the event checks, the resources checks (non-Windows),
    the newer-style metrics checks, and finally the checks.d checks, then
    hands the assembled payload to every registered emitter.

    :param checksd: optional list of checks.d check descriptors, each a dict
        with 'name', 'class' (an instantiated AgentCheck-like object) and
        'instances' (list of instance configs).
    :return: None. The payload is delivered via ``self.emitters``.
    """
    timer = Timer()
    self.run_count += 1
    logger.info("Starting collection run #%s" % self.run_count)

    payload = self._build_payload()
    metrics = payload['metrics']
    events = payload['events']

    # Run the system checks. Checks will depend on the OS
    if self.os == 'windows':
        # Win32 system checks.
        # NOTE(review): wrapped in try/except for consistency with the other
        # run() in this file, so one failing WMI/system probe does not abort
        # the whole collection run.
        try:
            metrics.extend(self._win32_system_checks['disk'].check(self.agentConfig))
            metrics.extend(self._win32_system_checks['memory'].check(self.agentConfig))
            metrics.extend(self._win32_system_checks['cpu'].check(self.agentConfig))
            metrics.extend(self._win32_system_checks['network'].check(self.agentConfig))
            metrics.extend(self._win32_system_checks['io'].check(self.agentConfig))
            metrics.extend(self._win32_system_checks['proc'].check(self.agentConfig))
        except Exception:
            logger.exception('Unable to fetch Windows system metrics.')
    else:
        # Unix system checks
        sys_checks = self._unix_system_checks

        diskUsage = sys_checks['disk'].check(self.agentConfig)
        # The disk check returns a (usage, inodes) pair when it succeeds.
        if diskUsage and len(diskUsage) == 2:
            payload["diskUsage"] = diskUsage[0]
            payload["inodes"] = diskUsage[1]

        load = sys_checks['load'].check(self.agentConfig)
        payload.update(load)

        memory = sys_checks['memory'].check(self.agentConfig)
        # Guard against a failed memory probe returning None/{} (matches the
        # other run() in this file); memory.get(...) would otherwise raise.
        if memory:
            payload.update({
                'memPhysUsed': memory.get('physUsed'),
                'memPhysFree': memory.get('physFree'),
                'memPhysTotal': memory.get('physTotal'),
                'memPhysUsable': memory.get('physUsable'),
                'memSwapUsed': memory.get('swapUsed'),
                'memSwapFree': memory.get('swapFree'),
                'memSwapTotal': memory.get('swapTotal'),
                'memCached': memory.get('physCached'),
                'memBuffers': memory.get('physBuffers'),
                'memShared': memory.get('physShared')
            })

        ioStats = sys_checks['io'].check(checks_logger, self.agentConfig)
        if ioStats:
            payload['ioStats'] = ioStats

        processes = sys_checks['processes'].check(checks_logger, self.agentConfig)
        payload.update({'processes': processes})

        networkTraffic = sys_checks['network'].check(self.agentConfig)
        payload.update({'networkTraffic': networkTraffic})

        cpuStats = sys_checks['cpu'].check(self.agentConfig)
        if cpuStats:
            payload.update(cpuStats)

    # Run old-style checks
    mysqlStatus = self._mysql.check(self.agentConfig)
    rabbitmq = self._rabbitmq.check(checks_logger, self.agentConfig)
    mongodb = self._mongodb.check(self.agentConfig)
    couchdb = self._couchdb.check(self.agentConfig)
    gangliaData = self._ganglia.check(self.agentConfig)
    cassandraData = self._cassandra.check(checks_logger, self.agentConfig)
    dogstreamData = self._dogstream.check(self.agentConfig)
    ddforwarderData = self._ddforwarder.check(self.agentConfig)

    # These checks signal "nothing to report" with False/None rather than a
    # falsy-but-meaningful value, hence the explicit identity tests.
    if gangliaData is not False and gangliaData is not None:
        payload['ganglia'] = gangliaData

    if cassandraData is not False and cassandraData is not None:
        payload['cassandra'] = cassandraData

    # MySQL Status
    if mysqlStatus:
        payload.update(mysqlStatus)

    # RabbitMQ
    if rabbitmq:
        payload['rabbitMQ'] = rabbitmq

    # MongoDB
    if mongodb:
        # dict.has_key() is deprecated (and gone in Python 3); use `in`.
        if 'events' in mongodb:
            # Split the Mongo events out of the metrics dict before sending.
            events['Mongo'] = mongodb['events']['Mongo']
            del mongodb['events']
        payload['mongoDB'] = mongodb

    # CouchDB
    if couchdb:
        payload['couchDB'] = couchdb

    # dogstream
    if dogstreamData:
        dogstreamEvents = dogstreamData.get('dogstreamEvents', None)
        if dogstreamEvents:
            if 'dogstream' in payload['events']:
                events['dogstream'].extend(dogstreamEvents)
            else:
                events['dogstream'] = dogstreamEvents
            del dogstreamData['dogstreamEvents']
        payload.update(dogstreamData)

    # metrics about the forwarder
    if ddforwarderData:
        payload['datadog'] = ddforwarderData

    # Process the event checks.
    for event_check in self._event_checks:
        event_data = event_check.check(checks_logger, self.agentConfig)
        if event_data:
            events[event_check.key] = event_data

    # Resources checks
    if self.os != 'windows':
        has_resource = False
        for resources_check in self._resources_checks:
            resources_check.check()
            snaps = resources_check.pop_snapshots()
            if snaps:
                has_resource = True
                res_value = {
                    'snaps': snaps,
                    'format_version': resources_check.get_format_version()
                }
                res_format = resources_check.describe_format_if_needed()
                if res_format is not None:
                    res_value['format_description'] = res_format
                payload['resources'][resources_check.RESOURCE_KEY] = res_value
        if has_resource:
            payload['resources']['meta'] = {
                'api_key': self.agentConfig['api_key'],
                'host': payload['internalHostname'],
            }

    # newer-style checks (not checks.d style)
    for metrics_check in self._metrics_checks:
        res = metrics_check.check(self.agentConfig)
        if res:
            metrics.extend(res)

    # checks.d checks
    checksd = checksd or []
    for check in checksd:
        check_cls = check['class']
        for instance in check['instances']:
            try:
                # Run the check for each configuration
                check_cls.check(instance)
                metrics.extend(check_cls.get_metrics())
                if check_cls.has_events():
                    if check['name'] not in events:
                        events[check['name']] = []
                    for ev in check_cls.get_events():
                        events[check['name']].append(ev)
            except Exception:
                # One broken checks.d instance must not stop the others.
                logger.exception("Check %s failed" % check_cls.name)

    # Store the metrics and events in the payload.
    payload['metrics'] = metrics
    payload['events'] = events
    collect_duration = timer.step()

    # Pass the payload along to the emitters.
    for emitter in self.emitters:
        emitter(payload, checks_logger, self.agentConfig)
    emit_duration = timer.step()

    logger.info("Finished run #%s. Collection time: %ss. Emit time: %ss" %
                (self.run_count, round(collect_duration, 2), round(emit_duration, 2)))