Example #1
    def testSpeed(self):
        # Pretend to be gmetad and serve a large piece of content
        original_file = Fixtures.file('ganglia.txt')
        server = subprocess.Popen("nc -l 8651 < %s" % original_file,
                                  shell=True)
        # Give the fake gmetad a second to start listening
        time.sleep(1)

        pfile = tempfile.NamedTemporaryFile()
        g = Ganglia(logging.getLogger(__file__))
        # Uncomment to profile the check:
        # profile.runctx("g.check({'ganglia_host': 'localhost', 'ganglia_port': 8651})", {}, {"g": g}, pfile.name)
        # p = pstats.Stats(pfile.name)
        # p.sort_stats('time').print_stats()
        parsed = StringIO(
            g.check({
                'ganglia_host': 'localhost',
                'ganglia_port': 8651
            }))
        original = Fixtures.file('ganglia.txt')
        x1 = tree.parse(parsed)
        x2 = tree.parse(original)
        # Cursory test
        self.assertEquals([c.tag for c in x1.getroot()],
                          [c.tag for c in x2.getroot()])
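
A note on the fixture server used throughout these tests: they shell out to `nc -l 8651`, which requires netcat on the host, and the fixed `time.sleep(1)` is only a guess at how long the listener takes to come up. A more hermetic alternative is to serve the fixture from a background thread with the standard library. This is a sketch, not part of the original tests; the helper name and the reuse of port 8651 are assumptions:

    import socket
    import threading

    def serve_fixture_once(path, port=8651):
        # Bind and listen synchronously so the caller can connect
        # immediately; no fixed sleep is needed.
        listener = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        listener.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        listener.bind(('localhost', port))
        listener.listen(1)

        def _serve():
            # Send the fixture to the first client, then shut down.
            conn, _ = listener.accept()
            with open(path, 'rb') as f:
                conn.sendall(f.read())
            conn.close()
            listener.close()

        t = threading.Thread(target=_serve)
        t.daemon = True
        t.start()
        return t

The test could then call serve_fixture_once(Fixtures.file('ganglia.txt')) in place of the subprocess.Popen/time.sleep pair.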
Example #2
    def testSpeed(self):
        # Pretend to be gmetad and serve a large piece of content
        server = subprocess.Popen("nc -l 8651 < %s" % TEST_FN, shell=True)
        # Give the fake gmetad a second to start listening
        time.sleep(1)

        pfile = tempfile.NamedTemporaryFile()
        g = Ganglia(logging.getLogger(__file__))
        # Uncomment to profile the check:
        # profile.runctx("g.check({'ganglia_host': 'localhost', 'ganglia_port': 8651})", {}, {"g": g}, pfile.name)
        # p = pstats.Stats(pfile.name)
        # p.sort_stats('time').print_stats()
        self.assertEquals(md5(g.check({'ganglia_host': 'localhost', 'ganglia_port': 8651})).hexdigest(), md5(open(TEST_FN).read()).hexdigest())
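
Example #2 swaps the structural XML comparison for a byte-exact one: matching digests mean the check returned the fixture verbatim. One caveat, assuming `md5` here is `hashlib.md5`: the fixture is read in text mode, so on platforms with newline translation `open(TEST_FN).read()` may not yield the same bytes that `nc` served from the file. Reading in binary mode sidesteps that; a minimal sketch:

    from hashlib import md5

    def file_digest(path):
        # Binary mode: hash the exact on-disk bytes, immune to newline translation
        with open(path, 'rb') as f:
            return md5(f.read()).hexdigest()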
Example #3
    def testSpeed(self):
        # Pretend to be gmetad and serve a large piece of content
        original_file = Fixtures.file('ganglia.txt')
        subprocess.Popen("nc -l 8651 < %s" % original_file, shell=True)
        # Give the fake gmetad a second to start listening
        time.sleep(1)

        g = Ganglia(logging.getLogger(__file__))
        parsed = StringIO(g.check({'ganglia_host': 'localhost', 'ganglia_port': 8651}))
        original = Fixtures.file('ganglia.txt')
        x1 = tree.parse(parsed)
        x2 = tree.parse(original)
        # Cursory test
        self.assertEquals([c.tag for c in x1.getroot()], [c.tag for c in x2.getroot()])
Example #4
    def testSpeed(self):
        # Pretend to be gmetad and serve a large piece of content
        server = subprocess.Popen("nc -l 8651 < %s" % TEST_FN, shell=True)
        # Give the fake gmetad a second to start listening
        time.sleep(1)

        pfile = tempfile.NamedTemporaryFile()
        g = Ganglia(logging.getLogger(__file__))
        # Uncomment to profile the check:
        # profile.runctx("g.check({'ganglia_host': 'localhost', 'ganglia_port': 8651})", {}, {"g": g}, pfile.name)
        # p = pstats.Stats(pfile.name)
        # p.sort_stats('time').print_stats()
        parsed = StringIO(g.check({'ganglia_host': 'localhost', 'ganglia_port': 8651}))
        original = open(TEST_FN)
        x1 = tree.parse(parsed)
        x2 = tree.parse(original)
        # Cursory test
        self.assertEquals([c.tag for c in x1.getroot()], [c.tag for c in x2.getroot()])
Example #5
    def testSpeed(self):
        # Pretend to be gmetad and serve a large piece of content
        server = subprocess.Popen("nc -l 8651 < %s" % TEST_FN, shell=True)
        # Give the fake gmetad a second to start listening
        time.sleep(1)

        pfile = tempfile.NamedTemporaryFile()
        g = Ganglia(logging.getLogger(__file__))
        # Uncomment to profile the check:
        # profile.runctx("g.check({'ganglia_host': 'localhost', 'ganglia_port': 8651})", {}, {"g": g}, pfile.name)
        # p = pstats.Stats(pfile.name)
        # p.sort_stats('time').print_stats()
        self.assertEquals(
            md5(g.check({
                'ganglia_host': 'localhost',
                'ganglia_port': 8651
            })).hexdigest(),
            md5(open(TEST_FN).read()).hexdigest())
Example #6
    def __init__(self, agentConfig, emitters, systemStats):
        self.emit_duration = None
        self.agentConfig = agentConfig
        # system_stats is generated by config.get_system_stats
        self.agentConfig['system_stats'] = systemStats
        # agentConfig is used during the checks; system_stats can be accessed through the config
        self.os = get_os()
        self.plugins = None
        self.emitters = emitters
        self.metadata_interval = int(agentConfig.get('metadata_interval', 10 * 60))
        self.metadata_start = time.time()
        socket.setdefaulttimeout(15)
        self.run_count = 0
        self.continue_running = True
        self.metadata_cache = None
        self.initialized_checks_d = []
        self.init_failed_checks_d = []

        # Unix System Checks
        self._unix_system_checks = {
            'disk': u.Disk(log),
            'io': u.IO(log),
            'load': u.Load(log),
            'memory': u.Memory(log),
            'processes': u.Processes(log),
            'cpu': u.Cpu(log)
        }

        # Win32 System Checks
        self._win32_system_checks = {
            'disk': w32.Disk(log),
            'io': w32.IO(log),
            'proc': w32.Processes(log),
            'memory': w32.Memory(log),
            'network': w32.Network(log),
            'cpu': w32.Cpu(log)
        }

        # Old-style metric checks
        self._ganglia = Ganglia(log)
        self._dogstream = Dogstreams.init(log, self.agentConfig)
        self._ddforwarder = DdForwarder(log, self.agentConfig)

        # Agent Metrics
        self._agent_metrics = CollectorMetrics(log)

        self._metrics_checks = []

        # Custom metric checks
        for module_spec in [s.strip() for s in self.agentConfig.get('custom_checks', '').split(',')]:
            if len(module_spec) == 0: continue
            try:
                self._metrics_checks.append(modules.load(module_spec, 'Check')(log))
                log.info("Registered custom check %s" % module_spec)
                log.warning("Old format custom checks are deprecated. They should be moved to the checks.d interface as old custom checks will be removed in a next version")
            except Exception:
                log.exception('Unable to load custom check module %s' % module_spec)
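
The custom_checks loop above loads each module named in the comma-separated `custom_checks` config value and expects it to expose a class called `Check`, constructed with a logger; the collector's run loop (shown in Examples #9-#11) then calls check(agentConfig) on it and extends the metrics payload with whatever list comes back. A minimal module satisfying that contract might look like the following. This is a hypothetical sketch: the module and metric names are illustrative, and the (name, timestamp, value, attributes) tuple shape is inferred from how the collector records its own check_run_time metric:

    # my_custom_check.py -- named in the agent config as, e.g.,
    # custom_checks: my_custom_check
    import time

    class Check(object):
        def __init__(self, logger):
            # The collector constructs the class with its own logger
            self.logger = logger

        def check(self, agentConfig):
            # Return a list of (metric_name, timestamp, value, attributes)
            # tuples; the collector extends its metrics payload with them.
            self.logger.debug("Running my_custom_check")
            return [('custom.example.metric', time.time(), 42,
                     {'tags': ['source:custom']})]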
Example #7
    def testSpeed(self):
        # Pretend to be gmetad and serve a large piece of content
        original_file = Fixtures.file('ganglia.txt')
        subprocess.Popen("nc -l 8651 < %s" % original_file, shell=True)
        # Give the fake gmetad a second to start listening
        time.sleep(1)

        g = Ganglia(logging.getLogger(__file__))
        parsed = StringIO(
            g.check({
                'ganglia_host': 'localhost',
                'ganglia_port': 8651
            }))
        original = Fixtures.file('ganglia.txt')
        x1 = tree.parse(parsed)
        x2 = tree.parse(original)
        # Cursory test
        self.assertEquals([c.tag for c in x1.getroot()],
                          [c.tag for c in x2.getroot()])
Example #8
    def __init__(self, agentConfig, emitters):
        self.agentConfig = agentConfig
        self.plugins = None
        self.emitters = emitters
        self.os = None
        
        self.checksLogger = logging.getLogger('checks')
        socket.setdefaulttimeout(15)
        
        self._apache = Apache(self.checksLogger)
        self._nginx = Nginx(self.checksLogger)
        self._disk = Disk(self.checksLogger)
        self._io = IO()
        self._load = Load(self.checksLogger)
        self._memory = Memory(self.checksLogger)
        self._network = Network(self.checksLogger)
        self._processes = Processes()
        self._cpu = Cpu()
        self._couchdb = CouchDb(self.checksLogger)
        self._mongodb = MongoDb(self.checksLogger)
        self._mysql = MySql(self.checksLogger)
        self._pgsql = PostgreSql(self.checksLogger)
        self._rabbitmq = RabbitMq()
        self._ganglia = Ganglia(self.checksLogger)
        self._cassandra = Cassandra()
        self._redis = Redis(self.checksLogger)
        self._jvm = Jvm(self.checksLogger)
        self._tomcat = Tomcat(self.checksLogger)
        self._activemq = ActiveMQ(self.checksLogger)
        self._solr = Solr(self.checksLogger)
        self._memcache = Memcache(self.checksLogger)
        self._dogstream = Dogstreams.init(self.checksLogger, self.agentConfig)
        self._ddforwarder = DdForwarder(self.checksLogger, self.agentConfig)

        # All new checks should be metrics checks:
        self._metrics_checks = [
            Cacti(self.checksLogger),
            Redis(self.checksLogger),
            Varnish(self.checksLogger),
            ElasticSearch(self.checksLogger),
            ]
        self._event_checks = [Hudson(), Nagios(socket.gethostname())]
        self._resources_checks = [ResProcesses(self.checksLogger, self.agentConfig)]

        self._ec2 = EC2(self.checksLogger)
Example #9
class Collector(object):
    """
    The collector is responsible for collecting data from each check and
    passing it along to the emitters, who send it to their final destination.
    """
    def __init__(self, agentConfig, emitters, systemStats, hostname):
        self.emit_duration = None
        self.agentConfig = agentConfig
        self.hostname = hostname
        # system_stats is generated by config.get_system_stats
        self.agentConfig['system_stats'] = systemStats
        # agentConfig is used during the checks; system_stats can be accessed through the config
        self.os = get_os()
        self.plugins = None
        self.emitters = emitters
        self.check_timings = agentConfig.get('check_timings')
        self.push_times = {
            'host_metadata': {
                'start': time.time(),
                'interval': int(agentConfig.get('metadata_interval', 4 * 60 * 60))
            },
            'external_host_tags': {
                'start': time.time() - 3 * 60,  # Wait for the checks to init
                'interval': int(agentConfig.get('external_host_tags', 5 * 60))
            },
            'agent_checks': {
                'start': time.time(),
                'interval': int(agentConfig.get('agent_checks_interval', 10 * 60))
            },
            'processes': {
                'start': time.time(),
                'interval': int(agentConfig.get('processes_interval', 60))
            }
        }
        socket.setdefaulttimeout(15)
        self.run_count = 0
        self.continue_running = True
        self.hostname_metadata_cache = None
        self.initialized_checks_d = []
        self.init_failed_checks_d = {}

        # Unix System Checks
        self._unix_system_checks = {
            'io': u.IO(log),
            'load': u.Load(log),
            'memory': u.Memory(log),
            'processes': u.Processes(log),
            'cpu': u.Cpu(log),
            'system': u.System(log)
        }

        # Win32 System Checks
        self._win32_system_checks = {
            'io': w32.IO(log),
            'proc': w32.Processes(log),
            'memory': w32.Memory(log),
            'network': w32.Network(log),
            'cpu': w32.Cpu(log),
            'system': w32.System(log)
        }

        # Old-style metric checks
        self._ganglia = Ganglia(log)
        self._dogstream = Dogstreams.init(log, self.agentConfig)
        self._ddforwarder = DdForwarder(log, self.agentConfig)

        # Agent performance metrics check
        self._agent_metrics = None

        self._metrics_checks = []

        # Custom metric checks
        for module_spec in [s.strip() for s in self.agentConfig.get('custom_checks', '').split(',')]:
            if len(module_spec) == 0:
                continue
            try:
                self._metrics_checks.append(modules.load(module_spec, 'Check')(log))
                log.info("Registered custom check %s" % module_spec)
                log.warning("Old format custom checks are deprecated. They should be moved to the checks.d interface as old custom checks will be removed in a next version")
            except Exception:
                log.exception('Unable to load custom check module %s' % module_spec)

    def stop(self):
        """
        Tell the collector to stop at the next logical point.
        """
        # This is called when the process is being killed, so
        # try to stop the collector as soon as possible.
        # Most importantly, don't try to submit to the emitters
        # because the forwarder is quite possibly already killed
        # in which case we'll get a misleading error in the logs.
        # Best to not even try.
        self.continue_running = False
        for check in self.initialized_checks_d:
            check.stop()

    @staticmethod
    def _stats_for_display(raw_stats):
        return pprint.pformat(raw_stats, indent=4)

    @log_exceptions(log)
    def run(self, checksd=None, start_event=True, configs_reloaded=False):
        """
        Collect data from each check and submit their data.
        """
        log.debug("Found {num_checks} checks".format(num_checks=len(checksd['initialized_checks'])))
        timer = Timer()
        if not Platform.is_windows():
            cpu_clock = time.clock()
        self.run_count += 1
        log.debug("Starting collection run #%s" % self.run_count)

        if checksd:
            self.initialized_checks_d = checksd['initialized_checks']  # is a list of AgentCheck instances
            self.init_failed_checks_d = checksd['init_failed_checks']  # is of type {check_name: {error, traceback}}

        payload = AgentPayload()

        # Find the AgentMetrics check and pop it out
        # This check must run at the end of the loop to collect info on agent performance
        if not self._agent_metrics or configs_reloaded:
            for check in self.initialized_checks_d:
                if check.name == AGENT_METRICS_CHECK_NAME:
                    self._agent_metrics = check
                    self.initialized_checks_d.remove(check)
                    break

        # Initialize payload
        self._build_payload(payload)

        metrics = payload['metrics']
        events = payload['events']
        service_checks = payload['service_checks']

        # Run the system checks. Checks will depend on the OS
        if Platform.is_windows():
            # Win32 system checks
            try:
                metrics.extend(self._win32_system_checks['memory'].check(self.agentConfig))
                metrics.extend(self._win32_system_checks['cpu'].check(self.agentConfig))
                metrics.extend(self._win32_system_checks['network'].check(self.agentConfig))
                metrics.extend(self._win32_system_checks['io'].check(self.agentConfig))
                metrics.extend(self._win32_system_checks['proc'].check(self.agentConfig))
                metrics.extend(self._win32_system_checks['system'].check(self.agentConfig))
            except Exception:
                log.exception('Unable to fetch Windows system metrics.')
        else:
            # Unix system checks
            sys_checks = self._unix_system_checks

            load = sys_checks['load'].check(self.agentConfig)
            payload.update(load)

            system = sys_checks['system'].check(self.agentConfig)
            payload.update(system)

            memory = sys_checks['memory'].check(self.agentConfig)

            if memory:
                memstats = {
                    'memPhysUsed': memory.get('physUsed'),
                    'memPhysPctUsable': memory.get('physPctUsable'),
                    'memPhysFree': memory.get('physFree'),
                    'memPhysTotal': memory.get('physTotal'),
                    'memPhysUsable': memory.get('physUsable'),
                    'memSwapUsed': memory.get('swapUsed'),
                    'memSwapFree': memory.get('swapFree'),
                    'memSwapPctFree': memory.get('swapPctFree'),
                    'memSwapTotal': memory.get('swapTotal'),
                    'memCached': memory.get('physCached'),
                    'memBuffers': memory.get('physBuffers'),
                    'memShared': memory.get('physShared'),
                    'memSlab': memory.get('physSlab'),
                    'memPageTables': memory.get('physPageTables'),
                    'memSwapCached': memory.get('swapCached')
                }
                payload.update(memstats)

            ioStats = sys_checks['io'].check(self.agentConfig)
            if ioStats:
                payload['ioStats'] = ioStats

            processes = sys_checks['processes'].check(self.agentConfig)
            payload.update({'processes': processes})

            cpuStats = sys_checks['cpu'].check(self.agentConfig)
            if cpuStats:
                payload.update(cpuStats)

        # Run old-style checks
        gangliaData = self._ganglia.check(self.agentConfig)
        dogstreamData = self._dogstream.check(self.agentConfig)
        ddforwarderData = self._ddforwarder.check(self.agentConfig)

        if gangliaData is not False and gangliaData is not None:
            payload['ganglia'] = gangliaData

        # dogstream
        if dogstreamData:
            dogstreamEvents = dogstreamData.get('dogstreamEvents', None)
            if dogstreamEvents:
                if 'dogstream' in payload['events']:
                    events['dogstream'].extend(dogstreamEvents)
                else:
                    events['dogstream'] = dogstreamEvents
                del dogstreamData['dogstreamEvents']

            payload.update(dogstreamData)

        # metrics about the forwarder
        if ddforwarderData:
            payload['datadog'] = ddforwarderData

        # process collector of gohai (compliant with payload of legacy "resources checks")
        if not Platform.is_windows() and self._should_send_additional_data('processes'):
            gohai_processes = self._run_gohai_processes()
            if gohai_processes:
                try:
                    gohai_processes_json = json.loads(gohai_processes)
                    processes_payload = {
                        'snaps': [gohai_processes_json.get('processes')],
                        'format_version': 1
                    }
                    if self._is_first_run():
                        processes_payload['format_description'] = PROCESSES_FORMAT_DESCRIPTION

                    payload['resources'] = {
                        'processes': processes_payload,
                        'meta': {
                            'api_key': self.agentConfig['api_key'],
                            'host': payload['internalHostname'],
                        }
                    }
                except Exception:
                    log.exception("Error running gohai processes collection")

        # newer-style checks (not checks.d style)
        for metrics_check in self._metrics_checks:
            res = metrics_check.check(self.agentConfig)
            if res:
                metrics.extend(res)

        # checks.d checks
        check_statuses = []
        for check in self.initialized_checks_d:
            if not self.continue_running:
                return
            log.info("Running check %s" % check.name)
            instance_statuses = []
            metric_count = 0
            event_count = 0
            service_check_count = 0
            check_start_time = time.time()
            check_stats = None

            try:
                # Run the check.
                instance_statuses = check.run()

                # Collect the metrics and events.
                current_check_metrics = check.get_metrics()
                current_check_events = check.get_events()
                check_stats = check._get_internal_profiling_stats()

                # Collect metadata
                current_check_metadata = check.get_service_metadata()

                # Save metrics & events for the payload.
                metrics.extend(current_check_metrics)
                if current_check_events:
                    if check.name not in events:
                        events[check.name] = current_check_events
                    else:
                        events[check.name] += current_check_events

                # Save the status of the check.
                metric_count = len(current_check_metrics)
                event_count = len(current_check_events)

            except Exception:
                log.exception("Error running check %s" % check.name)

            check_status = CheckStatus(
                check.name, instance_statuses, metric_count,
                event_count, service_check_count, service_metadata=current_check_metadata,
                library_versions=check.get_library_info(),
                source_type_name=check.SOURCE_TYPE_NAME or check.name,
                check_stats=check_stats
            )

            # Service check for Agent checks failures
            service_check_tags = ["check:%s" % check.name]
            if check_status.status == STATUS_OK:
                status = AgentCheck.OK
            elif check_status.status == STATUS_ERROR:
                status = AgentCheck.CRITICAL
            else:
                # Guard against an unbound `status` if neither branch matches
                status = AgentCheck.UNKNOWN
            check.service_check('datadog.agent.check_status', status, tags=service_check_tags)

            # Collect the service checks and save them in the payload
            current_check_service_checks = check.get_service_checks()
            if current_check_service_checks:
                service_checks.extend(current_check_service_checks)
            service_check_count = len(current_check_service_checks)

            # Update the check status with the correct service_check_count
            check_status.service_check_count = service_check_count
            check_statuses.append(check_status)

            check_run_time = time.time() - check_start_time
            log.debug("Check %s ran in %.2f s" % (check.name, check_run_time))

            # Instrument check run timings if enabled.
            if self.check_timings:
                metric = 'datadog.agent.check_run_time'
                meta = {'tags': ["check:%s" % check.name]}
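                # Payload metrics are (name, timestamp, value, attributes) tuples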
                metrics.append((metric, time.time(), check_run_time, meta))

        for check_name, info in self.init_failed_checks_d.iteritems():
            if not self.continue_running:
                return
            check_status = CheckStatus(check_name, None, None, None, None,
                                       init_failed_error=info['error'],
                                       init_failed_traceback=info['traceback'])
            check_statuses.append(check_status)

        # Add a service check for the agent
        service_checks.append(create_service_check('datadog.agent.up', AgentCheck.OK,
                              hostname=self.hostname))

        # Store the metrics and events in the payload.
        payload['metrics'] = metrics
        payload['events'] = events
        payload['service_checks'] = service_checks

        # Populate metadata
        self._populate_payload_metadata(payload, check_statuses, start_event)

        collect_duration = timer.step()

        if self._agent_metrics:
            metric_context = {
                'collection_time': collect_duration,
                'emit_time': self.emit_duration,
            }
            if not Platform.is_windows():
                metric_context['cpu_time'] = time.clock() - cpu_clock

            self._agent_metrics.set_metric_context(payload, metric_context)
            self._agent_metrics.run()
            agent_stats = self._agent_metrics.get_metrics()
            payload['metrics'].extend(agent_stats)
            if self.agentConfig.get('developer_mode'):
                log.debug("\n Agent developer mode stats: \n {0}".format(
                    Collector._stats_for_display(agent_stats))
                )
            # Flush metadata for the Agent Metrics check. Otherwise they'll just accumulate and leak.
            self._agent_metrics.get_service_metadata()

        # Let's send our payload
        emitter_statuses = payload.emit(log, self.agentConfig, self.emitters,
                                        self.continue_running)
        self.emit_duration = timer.step()

        # Persist the status of the collection run.
        try:
            CollectorStatus(check_statuses, emitter_statuses,
                            self.hostname_metadata_cache).persist()
        except Exception:
            log.exception("Error persisting collector status")

        if self.run_count <= FLUSH_LOGGING_INITIAL or self.run_count % FLUSH_LOGGING_PERIOD == 0:
            log.info("Finished run #%s. Collection time: %ss. Emit time: %ss" %
                     (self.run_count, round(collect_duration, 2), round(self.emit_duration, 2)))
            if self.run_count == FLUSH_LOGGING_INITIAL:
                log.info("First flushes done, next flushes will be logged every %s flushes." %
                         FLUSH_LOGGING_PERIOD)
        else:
            log.debug("Finished run #%s. Collection time: %ss. Emit time: %ss" %
                      (self.run_count, round(collect_duration, 2), round(self.emit_duration, 2)))

        return payload

    @staticmethod
    def run_single_check(check, verbose=True):
        log.info("Running check %s" % check.name)
        instance_statuses = []
        metric_count = 0
        event_count = 0
        service_check_count = 0
        check_stats = None

        try:
            # Run the check.
            instance_statuses = check.run()

            # Collect the metrics and events.
            current_check_metrics = check.get_metrics()
            current_check_events = check.get_events()
            current_service_checks = check.get_service_checks()
            current_service_metadata = check.get_service_metadata()

            check_stats = check._get_internal_profiling_stats()

            # Save the status of the check.
            metric_count = len(current_check_metrics)
            event_count = len(current_check_events)
            service_check_count = len(current_service_checks)

            print "Metrics: \n{0}".format(pprint.pformat(current_check_metrics))
            print "Events: \n{0}".format(pprint.pformat(current_check_events))
            print "Service Checks: \n{0}".format(pprint.pformat(current_service_checks))
            print "Service Metadata: \n{0}".format(pprint.pformat(current_service_metadata))

        except Exception:
            log.exception("Error running check %s" % check.name)

        check_status = CheckStatus(
            check.name, instance_statuses, metric_count,
            event_count, service_check_count,
            library_versions=check.get_library_info(),
            source_type_name=check.SOURCE_TYPE_NAME or check.name,
            check_stats=check_stats
        )

        return check_status

    def _emit(self, payload):
        """ Send the payload via the emitters. """
        statuses = []
        for emitter in self.emitters:
            # Don't try to send to an emitter if we're stopping.
            if not self.continue_running:
                return statuses
            name = emitter.__name__
            emitter_status = EmitterStatus(name)
            try:
                emitter(payload, log, self.agentConfig)
            except Exception, e:
                log.exception("Error running emitter: %s" % emitter.__name__)
                emitter_status = EmitterStatus(name, e)
            statuses.append(emitter_status)
        return statuses
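
_emit (and AgentPayload.emit in run()) treats each emitter as a plain callable invoked as emitter(payload, log, agentConfig) and identified via __name__, so a bare function is enough to plug in. Below is a minimal conforming emitter, as a sketch only: the URL, the 'dd_url' fallback, and the assumption that the payload is JSON-serializable are all illustrative, and errors are allowed to propagate because _emit catches them and wraps them in an EmitterStatus:

    import json
    import urllib2

    def json_post_emitter(payload, log, agentConfig):
        # Matches the emitter(payload, log, agentConfig) calling convention
        url = agentConfig.get('dd_url', 'http://localhost:17123') + '/intake'
        body = json.dumps(payload)
        req = urllib2.Request(url, body, {'Content-Type': 'application/json'})
        resp = urllib2.urlopen(req, timeout=15)
        log.debug("Emitted %s bytes to %s (HTTP %s)" % (len(body), url, resp.getcode()))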
Example #10
class Collector(object):
    """
    The collector is responsible for collecting data from each check and
    passing it along to the emitters, who send it to their final destination.
    """
    def __init__(self, agentConfig, emitters, systemStats, hostname):
        self.emit_duration = None
        self.agentConfig = agentConfig
        self.hostname = hostname
        # system_stats is generated by config.get_system_stats
        self.agentConfig['system_stats'] = systemStats
        # agentConfig is used during the checks; system_stats can be accessed through the config
        self.os = get_os()
        self.plugins = None
        self.emitters = emitters
        self.check_timings = agentConfig.get('check_timings')
        self.push_times = {
            'host_metadata': {
                'start': time.time(),
                'interval':
                int(agentConfig.get('metadata_interval', 4 * 60 * 60))
            },
            'external_host_tags': {
                'start': time.time() - 3 * 60,  # Wait for the checks to init
                'interval': int(agentConfig.get('external_host_tags', 5 * 60))
            },
            'agent_checks': {
                'start': time.time(),
                'interval':
                int(agentConfig.get('agent_checks_interval', 10 * 60))
            },
        }
        socket.setdefaulttimeout(15)
        self.run_count = 0
        self.continue_running = True
        self.hostname_metadata_cache = None
        self.initialized_checks_d = []
        self.init_failed_checks_d = {}

        # Unix System Checks
        self._unix_system_checks = {
            'io': u.IO(log),
            'load': u.Load(log),
            'memory': u.Memory(log),
            'processes': u.Processes(log),
            'cpu': u.Cpu(log),
            'system': u.System(log)
        }

        # Win32 System Checks
        self._win32_system_checks = {
            'io': w32.IO(log),
            'proc': w32.Processes(log),
            'memory': w32.Memory(log),
            'network': w32.Network(log),
            'cpu': w32.Cpu(log),
            'system': w32.System(log)
        }

        # Old-style metric checks
        self._ganglia = Ganglia(log)
        self._dogstream = Dogstreams.init(log, self.agentConfig)
        self._ddforwarder = DdForwarder(log, self.agentConfig)

        # Agent performance metrics check
        self._agent_metrics = None

        self._metrics_checks = []

        # Custom metric checks
        for module_spec in [
                s.strip()
                for s in self.agentConfig.get('custom_checks', '').split(',')
        ]:
            if len(module_spec) == 0:
                continue
            try:
                self._metrics_checks.append(
                    modules.load(module_spec, 'Check')(log))
                log.info("Registered custom check %s" % module_spec)
                log.warning(
                    "Old format custom checks are deprecated. They should be moved to the checks.d interface, as old custom checks will be removed in a future version"
                )
            except Exception:
                log.exception('Unable to load custom check module %s' %
                              module_spec)

        # Resource Checks
        self._resources_checks = [ResProcesses(log, self.agentConfig)]

    def stop(self):
        """
        Tell the collector to stop at the next logical point.
        """
        # This is called when the process is being killed, so
        # try to stop the collector as soon as possible.
        # Most importantly, don't try to submit to the emitters
        # because the forwarder is quite possibly already killed
        # in which case we'll get a misleading error in the logs.
        # Best to not even try.
        self.continue_running = False
        for check in self.initialized_checks_d:
            check.stop()

    @staticmethod
    def _stats_for_display(raw_stats):
        return pprint.pformat(raw_stats, indent=4)

    @log_exceptions(log)
    def run(self, checksd=None, start_event=True, configs_reloaded=False):
        """
        Collect data from each check and submit their data.
        """
        log.debug("Found {num_checks} checks".format(
            num_checks=len(checksd['initialized_checks'])))
        timer = Timer()
        if not Platform.is_windows():
            cpu_clock = time.clock()
        self.run_count += 1
        log.debug("Starting collection run #%s" % self.run_count)

        if checksd:
            self.initialized_checks_d = checksd[
                'initialized_checks']  # is a list of AgentCheck instances
            self.init_failed_checks_d = checksd[
                'init_failed_checks']  # is of type {check_name: {error, traceback}}

        payload = AgentPayload()

        # Find the AgentMetrics check and pop it out
        # This check must run at the end of the loop to collect info on agent performance
        if not self._agent_metrics or configs_reloaded:
            for check in self.initialized_checks_d:
                if check.name == AGENT_METRICS_CHECK_NAME:
                    self._agent_metrics = check
                    self.initialized_checks_d.remove(check)
                    break

        # Initialize payload
        self._build_payload(payload)

        metrics = payload['metrics']
        events = payload['events']
        service_checks = payload['service_checks']

        # Run the system checks. Checks will depend on the OS
        if Platform.is_windows():
            # Win32 system checks
            try:
                metrics.extend(self._win32_system_checks['memory'].check(
                    self.agentConfig))
                metrics.extend(self._win32_system_checks['cpu'].check(
                    self.agentConfig))
                metrics.extend(self._win32_system_checks['network'].check(
                    self.agentConfig))
                metrics.extend(self._win32_system_checks['io'].check(
                    self.agentConfig))
                metrics.extend(self._win32_system_checks['proc'].check(
                    self.agentConfig))
                metrics.extend(self._win32_system_checks['system'].check(
                    self.agentConfig))
            except Exception:
                log.exception('Unable to fetch Windows system metrics.')
        else:
            # Unix system checks
            sys_checks = self._unix_system_checks

            load = sys_checks['load'].check(self.agentConfig)
            payload.update(load)

            system = sys_checks['system'].check(self.agentConfig)
            payload.update(system)

            memory = sys_checks['memory'].check(self.agentConfig)

            if memory:
                memstats = {
                    'memPhysUsed': memory.get('physUsed'),
                    'memPhysPctUsable': memory.get('physPctUsable'),
                    'memPhysFree': memory.get('physFree'),
                    'memPhysTotal': memory.get('physTotal'),
                    'memPhysUsable': memory.get('physUsable'),
                    'memSwapUsed': memory.get('swapUsed'),
                    'memSwapFree': memory.get('swapFree'),
                    'memSwapPctFree': memory.get('swapPctFree'),
                    'memSwapTotal': memory.get('swapTotal'),
                    'memCached': memory.get('physCached'),
                    'memBuffers': memory.get('physBuffers'),
                    'memShared': memory.get('physShared'),
                    'memSlab': memory.get('physSlab'),
                    'memPageTables': memory.get('physPageTables'),
                    'memSwapCached': memory.get('swapCached')
                }
                payload.update(memstats)

            ioStats = sys_checks['io'].check(self.agentConfig)
            if ioStats:
                payload['ioStats'] = ioStats

            processes = sys_checks['processes'].check(self.agentConfig)
            payload.update({'processes': processes})

            cpuStats = sys_checks['cpu'].check(self.agentConfig)
            if cpuStats:
                payload.update(cpuStats)

        # Run old-style checks
        gangliaData = self._ganglia.check(self.agentConfig)
        dogstreamData = self._dogstream.check(self.agentConfig)
        ddforwarderData = self._ddforwarder.check(self.agentConfig)

        if gangliaData is not False and gangliaData is not None:
            payload['ganglia'] = gangliaData

        # dogstream
        if dogstreamData:
            dogstreamEvents = dogstreamData.get('dogstreamEvents', None)
            if dogstreamEvents:
                if 'dogstream' in payload['events']:
                    events['dogstream'].extend(dogstreamEvents)
                else:
                    events['dogstream'] = dogstreamEvents
                del dogstreamData['dogstreamEvents']

            payload.update(dogstreamData)

        # metrics about the forwarder
        if ddforwarderData:
            payload['datadog'] = ddforwarderData

        # Resources checks
        if not Platform.is_windows():
            has_resource = False
            for resources_check in self._resources_checks:
                try:
                    resources_check.check()
                    snaps = resources_check.pop_snapshots()
                    if snaps:
                        has_resource = True
                        res_value = {
                            'snaps': snaps,
                            'format_version':
                            resources_check.get_format_version()
                        }
                        res_format = resources_check.describe_format_if_needed(
                        )
                        if res_format is not None:
                            res_value['format_description'] = res_format
                        payload['resources'][
                            resources_check.RESOURCE_KEY] = res_value
                except Exception:
                    log.exception("Error running resource check %s" %
                                  resources_check.RESOURCE_KEY)

            if has_resource:
                payload['resources']['meta'] = {
                    'api_key': self.agentConfig['api_key'],
                    'host': payload['internalHostname'],
                }

        # newer-style checks (not checks.d style)
        for metrics_check in self._metrics_checks:
            res = metrics_check.check(self.agentConfig)
            if res:
                metrics.extend(res)

        # checks.d checks
        check_statuses = []
        for check in self.initialized_checks_d:
            if not self.continue_running:
                return
            log.info("Running check %s" % check.name)
            instance_statuses = []
            metric_count = 0
            event_count = 0
            service_check_count = 0
            check_start_time = time.time()
            check_stats = None

            try:
                # Run the check.
                instance_statuses = check.run()

                # Collect the metrics and events.
                current_check_metrics = check.get_metrics()
                current_check_events = check.get_events()
                check_stats = check._get_internal_profiling_stats()

                # Collect metadata
                current_check_metadata = check.get_service_metadata()

                # Save metrics & events for the payload.
                metrics.extend(current_check_metrics)
                if current_check_events:
                    if check.name not in events:
                        events[check.name] = current_check_events
                    else:
                        events[check.name] += current_check_events

                # Save the status of the check.
                metric_count = len(current_check_metrics)
                event_count = len(current_check_events)

            except Exception:
                log.exception("Error running check %s" % check.name)

            check_status = CheckStatus(
                check.name,
                instance_statuses,
                metric_count,
                event_count,
                service_check_count,
                service_metadata=current_check_metadata,
                library_versions=check.get_library_info(),
                source_type_name=check.SOURCE_TYPE_NAME or check.name,
                check_stats=check_stats)

            # Service check for Agent checks failures
            service_check_tags = ["check:%s" % check.name]
            if check_status.status == STATUS_OK:
                status = AgentCheck.OK
            elif check_status.status == STATUS_ERROR:
                status = AgentCheck.CRITICAL
            else:
                # Guard against an unbound `status` if neither branch matches
                status = AgentCheck.UNKNOWN
            check.service_check('datadog.agent.check_status',
                                status,
                                tags=service_check_tags)

            # Collect the service checks and save them in the payload
            current_check_service_checks = check.get_service_checks()
            if current_check_service_checks:
                service_checks.extend(current_check_service_checks)
            service_check_count = len(current_check_service_checks)

            # Update the check status with the correct service_check_count
            check_status.service_check_count = service_check_count
            check_statuses.append(check_status)

            check_run_time = time.time() - check_start_time
            log.debug("Check %s ran in %.2f s" % (check.name, check_run_time))

            # Instrument check run timings if enabled.
            if self.check_timings:
                metric = 'datadog.agent.check_run_time'
                meta = {'tags': ["check:%s" % check.name]}
                metrics.append((metric, time.time(), check_run_time, meta))

        for check_name, info in self.init_failed_checks_d.iteritems():
            if not self.continue_running:
                return
            check_status = CheckStatus(check_name,
                                       None,
                                       None,
                                       None,
                                       None,
                                       init_failed_error=info['error'],
                                       init_failed_traceback=info['traceback'])
            check_statuses.append(check_status)

        # Add a service check for the agent
        service_checks.append(
            create_service_check('datadog.agent.up',
                                 AgentCheck.OK,
                                 hostname=self.hostname))

        # Store the metrics and events in the payload.
        payload['metrics'] = metrics
        payload['events'] = events
        payload['service_checks'] = service_checks

        # Populate metadata
        self._populate_payload_metadata(payload, check_statuses, start_event)

        collect_duration = timer.step()

        if self._agent_metrics:
            metric_context = {
                'collection_time': collect_duration,
                'emit_time': self.emit_duration,
            }
            if not Platform.is_windows():
                metric_context['cpu_time'] = time.clock() - cpu_clock

            self._agent_metrics.set_metric_context(payload, metric_context)
            self._agent_metrics.run()
            agent_stats = self._agent_metrics.get_metrics()
            payload['metrics'].extend(agent_stats)
            if self.agentConfig.get('developer_mode'):
                log.debug("\n Agent developer mode stats: \n {0}".format(
                    Collector._stats_for_display(agent_stats)))

        # Let's send our payload
        emitter_statuses = payload.emit(log, self.agentConfig, self.emitters,
                                        self.continue_running)
        self.emit_duration = timer.step()

        # Persist the status of the collection run.
        try:
            CollectorStatus(check_statuses, emitter_statuses,
                            self.hostname_metadata_cache).persist()
        except Exception:
            log.exception("Error persisting collector status")

        if self.run_count <= FLUSH_LOGGING_INITIAL or self.run_count % FLUSH_LOGGING_PERIOD == 0:
            log.info("Finished run #%s. Collection time: %ss. Emit time: %ss" %
                     (self.run_count, round(collect_duration,
                                            2), round(self.emit_duration, 2)))
            if self.run_count == FLUSH_LOGGING_INITIAL:
                log.info(
                    "First flushes done, next flushes will be logged every %s flushes."
                    % FLUSH_LOGGING_PERIOD)
        else:
            log.debug(
                "Finished run #%s. Collection time: %ss. Emit time: %ss" %
                (self.run_count, round(collect_duration,
                                       2), round(self.emit_duration, 2)))

        return payload

    @staticmethod
    def run_single_check(check, verbose=True):
        log.info("Running check %s" % check.name)
        instance_statuses = []
        metric_count = 0
        event_count = 0
        service_check_count = 0
        check_stats = None

        try:
            # Run the check.
            instance_statuses = check.run()

            # Collect the metrics and events.
            current_check_metrics = check.get_metrics()
            current_check_events = check.get_events()
            current_service_checks = check.get_service_checks()
            current_service_metadata = check.get_service_metadata()

            check_stats = check._get_internal_profiling_stats()

            # Save the status of the check.
            metric_count = len(current_check_metrics)
            event_count = len(current_check_events)
            service_check_count = len(current_service_checks)

            print "Metrics: \n{0}".format(
                pprint.pformat(current_check_metrics))
            print "Events: \n{0}".format(pprint.pformat(current_check_events))
            print "Service Checks: \n{0}".format(
                pprint.pformat(current_service_checks))
            print "Service Metadata: \n{0}".format(
                pprint.pformat(current_service_metadata))

        except Exception:
            log.exception("Error running check %s" % check.name)

        check_status = CheckStatus(check.name,
                                   instance_statuses,
                                   metric_count,
                                   event_count,
                                   service_check_count,
                                   library_versions=check.get_library_info(),
                                   source_type_name=check.SOURCE_TYPE_NAME
                                   or check.name,
                                   check_stats=check_stats)

        return check_status

    def _emit(self, payload):
        """ Send the payload via the emitters. """
        statuses = []
        for emitter in self.emitters:
            # Don't try to send to an emitter if we're stopping.
            if not self.continue_running:
                return statuses
            name = emitter.__name__
            emitter_status = EmitterStatus(name)
            try:
                emitter(payload, log, self.agentConfig)
            except Exception, e:
                log.exception("Error running emitter: %s" % emitter.__name__)
                emitter_status = EmitterStatus(name, e)
            statuses.append(emitter_status)
        return statuses
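
Example #10 is the only variant above that still drives the legacy resource-check interface: check() gathers data, pop_snapshots() drains whatever was gathered, and get_format_version(), describe_format_if_needed(), and RESOURCE_KEY describe the slot the data occupies under payload['resources']. A skeleton implementing just that surface, mirroring the calls made in run() rather than the real ResProcesses:

    class MinimalResourceCheck(object):
        # Payload slot: payload['resources'][RESOURCE_KEY]
        RESOURCE_KEY = 'example'
        FORMAT_VERSION = 1

        def __init__(self, logger, agentConfig):
            self.logger = logger
            self.agentConfig = agentConfig
            self._snaps = []
            self._format_described = False

        def check(self):
            # Gather one snapshot per collection run
            self._snaps.append({'value': 1})

        def pop_snapshots(self):
            snaps, self._snaps = self._snaps, []
            return snaps

        def get_format_version(self):
            return self.FORMAT_VERSION

        def describe_format_if_needed(self):
            # Describe the snapshot format only once, on the first payload
            if not self._format_described:
                self._format_described = True
                return {'version': self.FORMAT_VERSION}
            return None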
Example #11
class Collector(object):
    """
    The collector is responsible for collecting data from each check and
    passing it along to the emitters, who send it to their final destination.
    """
    def __init__(self, agentConfig, emitters, systemStats, hostname):
        self.emit_duration = None
        self.agentConfig = agentConfig
        self.hostname = hostname
        # system_stats is generated by config.get_system_stats
        self.agentConfig['system_stats'] = systemStats
        # agentConfig is used during the checks; system_stats can be accessed through the config
        self.os = get_os()
        self.plugins = None
        self.emitters = emitters
        self.check_timings = agentConfig.get('check_timings')
        self.push_times = {
            'host_metadata': {
                'start': time.time(),
                'interval': int(agentConfig.get('metadata_interval', 4 * 60 * 60))
            },
            'external_host_tags': {
                'start': time.time() - 3 * 60,  # Wait for the checks to init
                'interval': int(agentConfig.get('external_host_tags', 5 * 60))
            },
            'agent_checks': {
                'start': time.time(),
                'interval': int(agentConfig.get('agent_checks_interval', 10 * 60))
            },
            'processes': {
                'start': time.time(),
                'interval': int(agentConfig.get('processes_interval', 60))
            }
        }
        socket.setdefaulttimeout(15)
        self.run_count = 0
        self.continue_running = True
        self.hostname_metadata_cache = None
        self.initialized_checks_d = []
        self.init_failed_checks_d = {}

        # Unix System Checks
        self._unix_system_checks = {
            'io': u.IO(log),
            'load': u.Load(log),
            'memory': u.Memory(log),
            'processes': u.Processes(log),
            'cpu': u.Cpu(log),
            'system': u.System(log)
        }

        # Win32 System Checks
        self._win32_system_checks = {
            'io': w32.IO(log),
            'proc': w32.Processes(log),
            'memory': w32.Memory(log),
            'network': w32.Network(log),
            'cpu': w32.Cpu(log),
            'system': w32.System(log)
        }

        # Old-style metric checks
        self._ganglia = Ganglia(log)
        self._dogstream = Dogstreams.init(log, self.agentConfig)
        self._ddforwarder = DdForwarder(log, self.agentConfig)

        # Agent performance metrics check
        self._agent_metrics = None

        self._metrics_checks = []

        # Custom metric checks
        for module_spec in [s.strip() for s in self.agentConfig.get('custom_checks', '').split(',')]:
            if len(module_spec) == 0:
                continue
            try:
                self._metrics_checks.append(modules.load(module_spec, 'Check')(log))
                log.info("Registered custom check %s" % module_spec)
                log.warning("Old format custom checks are deprecated. They should be moved to the checks.d interface as old custom checks will be removed in a next version")
            except Exception:
                log.exception('Unable to load custom check module %s' % module_spec)

    def stop(self):
        """
        Tell the collector to stop at the next logical point.
        """
        # This is called when the process is being killed, so
        # try to stop the collector as soon as possible.
        # Most importantly, don't try to submit to the emitters
        # because the forwarder is quite possibly already killed
        # in which case we'll get a misleading error in the logs.
        # Best to not even try.
        self.continue_running = False
        for check in self.initialized_checks_d:
            check.stop()

    @staticmethod
    def _stats_for_display(raw_stats):
        return pprint.pformat(raw_stats, indent=4)

    @log_exceptions(log)
    def run(self, checksd=None, start_event=True, configs_reloaded=False):
        """
        Collect data from each check and submit their data.
        """
        log.debug("Found {num_checks} checks".format(num_checks=len(checksd['initialized_checks'])))
        timer = Timer()
        if not Platform.is_windows():
            cpu_clock = time.clock()
        self.run_count += 1
        log.debug("Starting collection run #%s" % self.run_count)

        if checksd:
            self.initialized_checks_d = checksd['initialized_checks']  # is a list of AgentCheck instances
            self.init_failed_checks_d = checksd['init_failed_checks']  # is of type {check_name: {error, traceback}}

        payload = AgentPayload()

        # Find the AgentMetrics check and pop it out
        # This check must run at the end of the loop to collect info on agent performance
        if not self._agent_metrics or configs_reloaded:
            for check in self.initialized_checks_d:
                if check.name == AGENT_METRICS_CHECK_NAME:
                    self._agent_metrics = check
                    self.initialized_checks_d.remove(check)
                    break

        # Initialize payload
        self._build_payload(payload)

        metrics = payload['metrics']
        events = payload['events']
        service_checks = payload['service_checks']

        # Run the system checks. Checks will depend on the OS
        if Platform.is_windows():
            # Win32 system checks
            try:
                metrics.extend(self._win32_system_checks['memory'].check(self.agentConfig))
                metrics.extend(self._win32_system_checks['cpu'].check(self.agentConfig))
                metrics.extend(self._win32_system_checks['network'].check(self.agentConfig))
                metrics.extend(self._win32_system_checks['io'].check(self.agentConfig))
                metrics.extend(self._win32_system_checks['proc'].check(self.agentConfig))
                metrics.extend(self._win32_system_checks['system'].check(self.agentConfig))
            except Exception:
                log.exception('Unable to fetch Windows system metrics.')
        else:
            # Unix system checks
            sys_checks = self._unix_system_checks

            load = sys_checks['load'].check(self.agentConfig)
            payload.update(load)

            system = sys_checks['system'].check(self.agentConfig)
            payload.update(system)

            memory = sys_checks['memory'].check(self.agentConfig)

            if memory:
                memstats = {
                    'memPhysUsed': memory.get('physUsed'),
                    'memPhysPctUsable': memory.get('physPctUsable'),
                    'memPhysFree': memory.get('physFree'),
                    'memPhysTotal': memory.get('physTotal'),
                    'memPhysUsable': memory.get('physUsable'),
                    'memSwapUsed': memory.get('swapUsed'),
                    'memSwapFree': memory.get('swapFree'),
                    'memSwapPctFree': memory.get('swapPctFree'),
                    'memSwapTotal': memory.get('swapTotal'),
                    'memCached': memory.get('physCached'),
                    'memBuffers': memory.get('physBuffers'),
                    'memShared': memory.get('physShared'),
                    'memSlab': memory.get('physSlab'),
                    'memPageTables': memory.get('physPageTables'),
                    'memSwapCached': memory.get('swapCached')
                }
                payload.update(memstats)

            ioStats = sys_checks['io'].check(self.agentConfig)
            if ioStats:
                payload['ioStats'] = ioStats

            processes = sys_checks['processes'].check(self.agentConfig)
            payload.update({'processes': processes})

            cpuStats = sys_checks['cpu'].check(self.agentConfig)
            if cpuStats:
                payload.update(cpuStats)

        # Run old-style checks
        gangliaData = self._ganglia.check(self.agentConfig)
        dogstreamData = self._dogstream.check(self.agentConfig)
        ddforwarderData = self._ddforwarder.check(self.agentConfig)

        if gangliaData is not False and gangliaData is not None:
            payload['ganglia'] = gangliaData

        # dogstream
        if dogstreamData:
            dogstreamEvents = dogstreamData.get('dogstreamEvents', None)
            if dogstreamEvents:
                if 'dogstream' in payload['events']:
                    events['dogstream'].extend(dogstreamEvents)
                else:
                    events['dogstream'] = dogstreamEvents
                del dogstreamData['dogstreamEvents']

            payload.update(dogstreamData)

        # metrics about the forwarder
        if ddforwarderData:
            payload['datadog'] = ddforwarderData

        # process collector of gohai (compliant with payload of legacy "resources checks")
        if not Platform.is_windows() and self._should_send_additional_data('processes'):
            gohai_processes = self._run_gohai_processes()
            if gohai_processes:
                try:
                    gohai_processes_json = json.loads(gohai_processes)
                    processes_payload = {
                        'snaps': [gohai_processes_json.get('processes')],
                        'format_version': 1
                    }
                    if self._is_first_run():
                        processes_payload['format_description'] = PROCESSES_FORMAT_DESCRIPTION

                    payload['resources'] = {
                        'processes': processes_payload,
                        'meta': {
                            'host': payload['internalHostname'],
                        }
                    }
                except Exception:
                    log.exception("Error running gohai processes collection")

        # newer-style checks (not checks.d style)
        for metrics_check in self._metrics_checks:
            res = metrics_check.check(self.agentConfig)
            if res:
                metrics.extend(res)

        # checks.d checks
        check_statuses = []
        for check in self.initialized_checks_d:
            if not self.continue_running:
                return
            log.info("Running check %s" % check.name)
            instance_statuses = []
            metric_count = 0
            event_count = 0
            service_check_count = 0
            check_start_time = time.time()
            check_stats = None
            current_check_metadata = None  # stays None if the check raises before metadata is collected

            try:
                # Run the check.
                instance_statuses = check.run()

                # Collect the metrics and events.
                current_check_metrics = check.get_metrics()
                current_check_events = check.get_events()
                check_stats = check._get_internal_profiling_stats()

                # Collect metadata
                current_check_metadata = check.get_service_metadata()

                # Save metrics & events for the payload.
                metrics.extend(current_check_metrics)
                if current_check_events:
                    if check.name not in events:
                        events[check.name] = current_check_events
                    else:
                        events[check.name] += current_check_events

                # Save the status of the check.
                metric_count = len(current_check_metrics)
                event_count = len(current_check_events)

            except Exception:
                log.exception("Error running check %s" % check.name)

            check_status = CheckStatus(
                check.name, instance_statuses, metric_count,
                event_count, service_check_count, service_metadata=current_check_metadata,
                library_versions=check.get_library_info(),
                source_type_name=check.SOURCE_TYPE_NAME or check.name,
                check_stats=check_stats
            )

            # Service check for Agent checks failures
            service_check_tags = ["check:%s" % check.name]
            if check_status.status == STATUS_OK:
                status = AgentCheck.OK
            elif check_status.status == STATUS_ERROR:
                status = AgentCheck.CRITICAL
            else:
                status = AgentCheck.UNKNOWN  # defensive default; avoids an unbound `status`
            check.service_check('datadog.agent.check_status', status, tags=service_check_tags)

            # Collect the service checks and save them in the payload
            current_check_service_checks = check.get_service_checks()
            if current_check_service_checks:
                service_checks.extend(current_check_service_checks)
            service_check_count = len(current_check_service_checks)

            # Update the check status with the correct service_check_count
            check_status.service_check_count = service_check_count
            check_statuses.append(check_status)

            check_run_time = time.time() - check_start_time
            log.debug("Check %s ran in %.2f s" % (check.name, check_run_time))

            # Instrument check run timings if enabled.
            if self.check_timings:
                metric = 'datadog.agent.check_run_time'
                meta = {'tags': ["check:%s" % check.name]}
                metrics.append((metric, time.time(), check_run_time, meta))

        for check_name, info in self.init_failed_checks_d.iteritems():
            if not self.continue_running:
                return
            check_status = CheckStatus(check_name, None, None, None, None,
                                       init_failed_error=info['error'],
                                       init_failed_traceback=info['traceback'])
            check_statuses.append(check_status)

        # Add a service check for the agent
        service_checks.append(create_service_check('datadog.agent.up', AgentCheck.OK,
                              hostname=self.hostname))

        # Store the metrics and events in the payload.
        payload['metrics'] = metrics
        payload['events'] = events
        payload['service_checks'] = service_checks

        # Populate metadata
        self._populate_payload_metadata(payload, check_statuses, start_event)

        collect_duration = timer.step()

        if self._agent_metrics:
            metric_context = {
                'collection_time': collect_duration,
                'emit_time': self.emit_duration,
            }
            if not Platform.is_windows():
                metric_context['cpu_time'] = time.clock() - cpu_clock

            self._agent_metrics.set_metric_context(payload, metric_context)
            self._agent_metrics.run()
            agent_stats = self._agent_metrics.get_metrics()
            payload['metrics'].extend(agent_stats)
            if self.agentConfig.get('developer_mode'):
                log.debug("\n Agent developer mode stats: \n {0}".format(
                    Collector._stats_for_display(agent_stats))
                )
            # Flush metadata for the Agent Metrics check; otherwise it just accumulates and leaks.
            self._agent_metrics.get_service_metadata()

        # Let's send our payload
        emitter_statuses = payload.emit(log, self.agentConfig, self.emitters,
                                        self.continue_running)
        self.emit_duration = timer.step()

        # Persist the status of the collection run.
        try:
            CollectorStatus(check_statuses, emitter_statuses,
                            self.hostname_metadata_cache).persist()
        except Exception:
            log.exception("Error persisting collector status")

        if self.run_count <= FLUSH_LOGGING_INITIAL or self.run_count % FLUSH_LOGGING_PERIOD == 0:
            log.info("Finished run #%s. Collection time: %ss. Emit time: %ss" %
                     (self.run_count, round(collect_duration, 2), round(self.emit_duration, 2)))
            if self.run_count == FLUSH_LOGGING_INITIAL:
                log.info("First flushes done, next flushes will be logged every %s flushes." %
                         FLUSH_LOGGING_PERIOD)
        else:
            log.debug("Finished run #%s. Collection time: %ss. Emit time: %ss" %
                      (self.run_count, round(collect_duration, 2), round(self.emit_duration, 2)))

        return payload

    @staticmethod
    def run_single_check(check, verbose=True):
        log.info("Running check %s" % check.name)
        instance_statuses = []
        metric_count = 0
        event_count = 0
        service_check_count = 0
        check_stats = None

        try:
            # Run the check.
            instance_statuses = check.run()

            # Collect the metrics and events.
            current_check_metrics = check.get_metrics()
            current_check_events = check.get_events()
            current_service_checks = check.get_service_checks()
            current_service_metadata = check.get_service_metadata()

            check_stats = check._get_internal_profiling_stats()

            # Save the status of the check.
            metric_count = len(current_check_metrics)
            event_count = len(current_check_events)
            service_check_count = len(current_service_checks)

            print "Metrics: \n{0}".format(pprint.pformat(current_check_metrics))
            print "Events: \n{0}".format(pprint.pformat(current_check_events))
            print "Service Checks: \n{0}".format(pprint.pformat(current_service_checks))
            print "Service Metadata: \n{0}".format(pprint.pformat(current_service_metadata))

        except Exception:
            log.exception("Error running check %s" % check.name)

        check_status = CheckStatus(
            check.name, instance_statuses, metric_count,
            event_count, service_check_count,
            library_versions=check.get_library_info(),
            source_type_name=check.SOURCE_TYPE_NAME or check.name,
            check_stats=check_stats
        )

        return check_status

    def _emit(self, payload):
        """ Send the payload via the emitters. """
        statuses = []
        for emitter in self.emitters:
            # Don't try to send to an emitter if we're stopping.
            if not self.continue_running:
                return statuses
            name = emitter.__name__
            emitter_status = EmitterStatus(name)
            try:
                emitter(payload, log, self.agentConfig)
            except Exception as e:
                log.exception("Error running emitter: %s" % emitter.__name__)
                emitter_status = EmitterStatus(name, e)
            statuses.append(emitter_status)
        return statuses

    def _is_first_run(self):
        return self.run_count <= 1

    def _build_payload(self, payload):
        """
        Build the payload skeleton, so it contains all of the generic payload data.
        """
        now = time.time()

        payload['collection_timestamp'] = now
        payload['os'] = self.os
        payload['python'] = sys.version
        payload['agentVersion'] = self.agentConfig['version']
        payload['apiKey'] = self.agentConfig['api_key']
        payload['events'] = {}
        payload['metrics'] = []
        payload['service_checks'] = []
        payload['resources'] = {}
        payload['internalHostname'] = self.hostname
        payload['uuid'] = get_uuid()
        payload['host-tags'] = {}
        payload['external_host_tags'] = {}

    def _populate_payload_metadata(self, payload, check_statuses, start_event=True):
        """
        Periodically populate the payload with metadata related to the system, host, and/or checks.
        """
        now = time.time()

        # Include system stats on first postback
        if start_event and self._is_first_run():
            payload['systemStats'] = self.agentConfig.get('system_stats', {})
            # Also post an event in the newsfeed
            payload['events']['System'] = [{
                'api_key': self.agentConfig['api_key'],
                'host': payload['internalHostname'],
                'timestamp': now,
                'event_type':'Agent Startup',
                'msg_text': 'Version %s' % get_version()
            }]

        # Periodically send the host metadata.
        if self._should_send_additional_data('host_metadata'):
            # gather metadata with gohai
            gohai_metadata = self._run_gohai_metadata()
            if gohai_metadata:
                payload['gohai'] = gohai_metadata

            payload['systemStats'] = get_system_stats(
                proc_path=self.agentConfig.get('procfs_path', '/proc').rstrip('/')
            )
            payload['meta'] = self._get_hostname_metadata()

            self.hostname_metadata_cache = payload['meta']
            # Add static tags from the configuration file
            host_tags = []
            if self.agentConfig['tags'] is not None:
                host_tags.extend([unicode(tag.strip())
                                 for tag in self.agentConfig['tags'].split(",")])

            if self.agentConfig['collect_ec2_tags']:
                host_tags.extend(EC2.get_tags(self.agentConfig))

            if host_tags:
                payload['host-tags']['system'] = host_tags

            # If required by the user, let's create the dd_check:xxx host tags
            if self.agentConfig['create_dd_check_tags']:
                app_tags_list = [DD_CHECK_TAG.format(c.name) for c in self.initialized_checks_d]
                app_tags_list.extend([DD_CHECK_TAG.format(cname) for cname
                                      in JMXFiles.get_jmx_appnames()])

                if 'system' not in payload['host-tags']:
                    payload['host-tags']['system'] = []

                payload['host-tags']['system'].extend(app_tags_list)

            GCE_tags = GCE.get_tags(self.agentConfig)
            if GCE_tags is not None:
                payload['host-tags'][GCE.SOURCE_TYPE_NAME] = GCE_tags

            # Log the metadata on the first run
            if self._is_first_run():
                log.info("Hostnames: %s, tags: %s" %
                         (repr(self.hostname_metadata_cache), payload['host-tags']))

        # Periodically send extra host metadata (e.g. vsphere): metadata for
        # hosts other than the one the agent runs on. Not all checks provide it.
        external_host_tags = []
        if self._should_send_additional_data('external_host_tags'):
            for check in self.initialized_checks_d:
                try:
                    check_tags = check.get_external_host_tags()
                    external_host_tags.extend(check_tags)
                except AttributeError:
                    # The check doesn't implement get_external_host_tags; skip it.
                    pass

        if external_host_tags:
            payload['external_host_tags'] = external_host_tags

        # Periodically send agent_checks metadata
        if self._should_send_additional_data('agent_checks'):
            # Add agent checks statuses and error/warning messages
            agent_checks = []
            for check in check_statuses:
                if check.instance_statuses is not None:
                    for i, instance_status in enumerate(check.instance_statuses):
                        agent_checks.append(
                            (
                                check.name, check.source_type_name,
                                instance_status.instance_id,
                                instance_status.status,
                                # put error message or list of warning messages in the same field
                                # it will be handled by the UI
                                instance_status.error or instance_status.warnings or "",
                                check.service_metadata[i]
                            )
                        )
                else:
                    agent_checks.append(
                        (
                            check.name, check.source_type_name,
                            "initialization",
                            check.status, repr(check.init_failed_error)
                        )
                    )
            payload['agent_checks'] = agent_checks
            payload['meta'] = self.hostname_metadata_cache  # add hostname metadata

    def _get_hostname_metadata(self):
        """
        Returns a dictionary that contains hostname metadata.
        """
        metadata = EC2.get_metadata(self.agentConfig)
        if metadata.get('hostname'):
            metadata['ec2-hostname'] = metadata.get('hostname')
            del metadata['hostname']

        if self.agentConfig.get('hostname'):
            metadata['agent-hostname'] = self.agentConfig.get('hostname')
        else:
            try:
                metadata["socket-hostname"] = socket.gethostname()
            except Exception:
                pass
        try:
            metadata["socket-fqdn"] = socket.getfqdn()
        except Exception:
            pass

        metadata["hostname"] = self.hostname
        metadata["timezones"] = sanitize_tzname(time.tzname)

        # Add cloud provider aliases
        host_aliases = GCE.get_host_aliases(self.agentConfig)
        if host_aliases:
            metadata['host_aliases'] = host_aliases

        return metadata

    def _should_send_additional_data(self, data_name):
        if self._is_first_run():
            return True
        # If the interval has passed, send the metadata again
        now = time.time()
        if now - self.push_times[data_name]['start'] >= self.push_times[data_name]['interval']:
            log.debug('%s interval has passed. Sending it.' % data_name)
            self.push_times[data_name]['start'] = now
            return True

        return False

    def _run_gohai_metadata(self):
        return self._run_gohai(['--exclude', 'processes'])

    def _run_gohai_processes(self):
        return self._run_gohai(['--only', 'processes'])

    def _run_gohai(self, options):
        output = None
        try:
            if not Platform.is_windows():
                command = "gohai"
            else:
                command = "gohai\gohai.exe"
            output, err, _ = get_subprocess_output([command] + options, log)
            if err:
                log.warning("GOHAI LOG | {0}".format(err))
        except OSError as e:
            if e.errno == 2:  # file not found; expected when installed from source
                log.info("gohai file not found")
            else:
                log.warning("Unexpected OSError when running gohai %s", e)
        except Exception as e:
            log.warning("gohai command failed with error %s", e)

        return output
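
A minimal standalone sketch of the interval gating that _should_send_additional_data implements above, assuming only the push_times layout shown in these examples (the module-level names below are illustrative, not part of the agent):

import time

PUSH_TIMES = {'host_metadata': {'start': time.time(),
                                'interval': 4 * 60 * 60}}

def should_send(data_name, first_run=False):
    # Always send on the first run, then again each time the interval elapses.
    if first_run:
        return True
    entry = PUSH_TIMES[data_name]
    if time.time() - entry['start'] >= entry['interval']:
        entry['start'] = time.time()  # restart the window on every send
        return True
    return False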
Example #12
0
    def __init__(self, agentConfig, emitters, systemStats, hostname):
        self.emit_duration = None
        self.agentConfig = agentConfig
        self.hostname = hostname
        # system stats is generated by config.get_system_stats
        self.agentConfig["system_stats"] = systemStats
        # agent config is used during checks, system_stats can be accessed through the config
        self.os = get_os()
        self.plugins = None
        self.emitters = emitters
        self.check_timings = agentConfig.get("check_timings")
        self.push_times = {
            "host_metadata": {"start": time.time(), "interval": int(agentConfig.get("metadata_interval", 4 * 60 * 60))},
            "external_host_tags": {
                "start": time.time() - 3 * 60,  # Wait for the checks to init
                "interval": int(agentConfig.get("external_host_tags", 5 * 60)),
            },
            "agent_checks": {"start": time.time(), "interval": int(agentConfig.get("agent_checks_interval", 10 * 60))},
            "processes": {"start": time.time(), "interval": int(agentConfig.get("processes_interval", 60))},
        }
        socket.setdefaulttimeout(15)
        self.run_count = 0
        self.continue_running = True
        self.hostname_metadata_cache = None
        self.initialized_checks_d = []
        self.init_failed_checks_d = {}

        # Unix System Checks
        self._unix_system_checks = {
            "io": u.IO(log),
            "load": u.Load(log),
            "memory": u.Memory(log),
            "processes": u.Processes(log),
            "cpu": u.Cpu(log),
            "system": u.System(log),
        }

        # Win32 System Checks
        self._win32_system_checks = {
            "io": w32.IO(log),
            "proc": w32.Processes(log),
            "memory": w32.Memory(log),
            "network": w32.Network(log),
            "cpu": w32.Cpu(log),
            "system": w32.System(log),
        }

        # Old-style metric checks
        self._ganglia = Ganglia(log) if self.agentConfig.get("ganglia_host", "") != "" else None
        self._dogstream = None if self.agentConfig.get("dogstreams") is None else Dogstreams.init(log, self.agentConfig)

        # Agent performance metrics check
        self._agent_metrics = None

        self._metrics_checks = []

        # Custom metric checks
        for module_spec in [s.strip() for s in self.agentConfig.get("custom_checks", "").split(",")]:
            if len(module_spec) == 0:
                continue
            try:
                self._metrics_checks.append(modules.load(module_spec, "Check")(log))
                log.info("Registered custom check %s" % module_spec)
                log.warning(
                    "Old format custom checks are deprecated. They should be moved to the checks.d interface as old custom checks will be removed in a future version"
                )
            except Exception:
                log.exception("Unable to load custom check module %s" % module_spec)
Example #13
0
    def __init__(self, agentConfig, emitters, systemStats):
        self.agentConfig = agentConfig
        # system stats is generated by config.get_system_stats
        self.agentConfig['system_stats'] = systemStats
        # agent config is used during checks, system_stats can be accessed through the config
        self.os = getOS()
        self.plugins = None
        self.emitters = emitters
        self.metadata_interval = int(
            agentConfig.get('metadata_interval', 10 * 60))
        self.metadata_start = time.time()
        socket.setdefaulttimeout(15)
        self.run_count = 0
        self.continue_running = True

        # Unix System Checks
        self._unix_system_checks = {
            'disk': u.Disk(checks_logger),
            'io': u.IO(),
            'load': u.Load(checks_logger),
            'memory': u.Memory(checks_logger),
            'network': u.Network(checks_logger),
            'processes': u.Processes(),
            'cpu': u.Cpu(checks_logger)
        }

        # Win32 System Checks
        self._win32_system_checks = {
            'disk': w32.Disk(checks_logger),
            'io': w32.IO(checks_logger),
            'proc': w32.Processes(checks_logger),
            'memory': w32.Memory(checks_logger),
            'network': w32.Network(checks_logger),
            'cpu': w32.Cpu(checks_logger)
        }

        # Old-style metric checks
        self._couchdb = CouchDb(checks_logger)
        self._mongodb = MongoDb(checks_logger)
        self._mysql = MySql(checks_logger)
        self._rabbitmq = RabbitMq()
        self._ganglia = Ganglia(checks_logger)
        self._cassandra = Cassandra()
        self._dogstream = Dogstreams.init(checks_logger, self.agentConfig)
        self._ddforwarder = DdForwarder(checks_logger, self.agentConfig)
        self._ec2 = EC2(checks_logger)

        # Metric Checks
        self._metrics_checks = [
            ElasticSearch(checks_logger),
            Jvm(checks_logger),
            Tomcat(checks_logger),
            ActiveMQ(checks_logger),
            Solr(checks_logger),
            WMICheck(checks_logger),
            Memcache(checks_logger),
        ]

        # Custom metric checks
        for module_spec in [
                s.strip()
                for s in self.agentConfig.get('custom_checks', '').split(',')
        ]:
            if len(module_spec) == 0:
                continue
            try:
                self._metrics_checks.append(
                    modules.load(module_spec, 'Check')(checks_logger))
                logger.info("Registered custom check %s" % module_spec)
            except Exception:
                logger.exception('Unable to load custom check module %s' %
                                 module_spec)
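
Every entry appended to _metrics_checks in these examples is assumed to follow the same old-style Check interface: constructed with a logger and exposing check(agentConfig), which returns a list of metric tuples. A minimal conforming custom check (illustrative only, not an agent built-in) could look like:

import time

class UptimeCheck(object):
    def __init__(self, logger):
        self.logger = logger
        self.started = time.time()

    def check(self, agentConfig):
        # Legacy 4-tuple metric format: (name, timestamp, value, attributes).
        uptime = time.time() - self.started
        return [('custom.uptime', time.time(), uptime, {'tags': []})]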
Example #14
0
class checks(object):
    def __init__(self, agentConfig, emitters):
        self.agentConfig = agentConfig
        self.plugins = None
        self.emitters = emitters
        self.os = None
        
        self.checksLogger = logging.getLogger('checks')
        socket.setdefaulttimeout(15)
        
        self._apache = Apache(self.checksLogger)
        self._nginx = Nginx(self.checksLogger)
        self._disk = Disk(self.checksLogger)
        self._io = IO()
        self._load = Load(self.checksLogger)
        self._memory = Memory(self.checksLogger)
        self._network = Network(self.checksLogger)
        self._processes = Processes()
        self._cpu = Cpu()
        self._couchdb = CouchDb(self.checksLogger)
        self._mongodb = MongoDb(self.checksLogger)
        self._mysql = MySql(self.checksLogger)
        self._pgsql = PostgreSql(self.checksLogger)
        self._rabbitmq = RabbitMq()
        self._ganglia = Ganglia(self.checksLogger)
        self._cassandra = Cassandra()
        self._redis = Redis(self.checksLogger)
        self._jvm = Jvm(self.checksLogger)
        self._tomcat = Tomcat(self.checksLogger)
        self._activemq = ActiveMQ(self.checksLogger)
        self._solr = Solr(self.checksLogger)
        self._memcache = Memcache(self.checksLogger)
        self._dogstream = Dogstreams.init(self.checksLogger, self.agentConfig)
        self._ddforwarder = DdForwarder(self.checksLogger, self.agentConfig)

        # All new checks should be metrics checks:
        self._metrics_checks = [
            Cacti(self.checksLogger),
            Redis(self.checksLogger),
            Varnish(self.checksLogger),
            ElasticSearch(self.checksLogger),
            ]
        self._event_checks = [Hudson(), Nagios(socket.gethostname())]
        self._resources_checks = [ResProcesses(self.checksLogger,self.agentConfig)]

        self._ec2 = EC2(self.checksLogger)
    
    #
    # Checks - FIXME migrating to the new Check interface is a WIP
    #
    @recordsize 
    def getApacheStatus(self):
        return self._apache.check(self.agentConfig)

    @recordsize 
    def getCouchDBStatus(self):
        return self._couchdb.check(self.agentConfig)
    
    @recordsize
    def getDiskUsage(self):
        return self._disk.check(self.agentConfig)

    @recordsize
    def getIOStats(self):
        return self._io.check(self.checksLogger, self.agentConfig)
            
    @recordsize
    def getLoadAvrgs(self):
        return self._load.check(self.agentConfig)

    @recordsize 
    def getMemoryUsage(self):
        return self._memory.check(self.agentConfig)
        
    @recordsize     
    def getMongoDBStatus(self):
        return self._mongodb.check(self.agentConfig)

    @recordsize
    def getMySQLStatus(self):
        return self._mysql.check(self.agentConfig)
   
    @recordsize
    def getPgSQLStatus(self):
        return self._pgsql.check(self.agentConfig)
 
    @recordsize
    def getNetworkTraffic(self):
        return self._network.check(self.agentConfig)
    
    @recordsize
    def getNginxStatus(self):
        return self._nginx.check(self.agentConfig)
        
    @recordsize
    def getProcesses(self):
        return self._processes.check(self.checksLogger, self.agentConfig)
        
    @recordsize
    def getRabbitMQStatus(self):
        return self._rabbitmq.check(self.checksLogger, self.agentConfig)

    @recordsize
    def getGangliaData(self):
        return self._ganglia.check(self.agentConfig)

    @recordsize
    def getCassandraData(self):
        return self._cassandra.check(self.checksLogger, self.agentConfig)

    @recordsize
    def getJvmData(self):
        return self._jvm.check(self.agentConfig)

    @recordsize
    def getTomcatData(self):
        return self._tomcat.check(self.agentConfig)

    @recordsize
    def getActiveMQData(self):
        return self._activemq.check(self.agentConfig)

    @recordsize
    def getSolrData(self):
        return self._solr.check(self.agentConfig)

    @recordsize
    def getMemcacheData(self):
        return self._memcache.check(self.agentConfig)

    @recordsize
    def getDogstreamData(self):
        return self._dogstream.check(self.agentConfig)

    @recordsize
    def getDdforwarderData(self):
        return self._ddforwarder.check(self.agentConfig)

    @recordsize
    def getCPUStats(self):
        return self._cpu.check(self.checksLogger, self.agentConfig)

    @recordsize
    def get_metadata(self):
        metadata = self._ec2.get_metadata()
        if metadata.get('hostname'):
            metadata['ec2-hostname'] = metadata.get('hostname')

        if self.agentConfig.get('hostname'):
            metadata['agent-hostname'] = self.agentConfig.get('hostname')

        try:
            metadata["hostname"] = socket.gethostname()
        except Exception:
            pass
        try:
            metadata["fqdn"] = socket.getfqdn()
        except Exception:
            pass

        return metadata

    def doChecks(self, firstRun=False, systemStats=False):
        """Actual work
        """
        self.checksLogger.info("Starting checks")

        apacheStatus = self.getApacheStatus()
        diskUsage = self.getDiskUsage()
        loadAvrgs = self.getLoadAvrgs()
        memory = self.getMemoryUsage()
        mysqlStatus = self.getMySQLStatus()
        pgsqlStatus = self.getPgSQLStatus()
        networkTraffic = self.getNetworkTraffic()
        nginxStatus = self.getNginxStatus()
        processes = self.getProcesses()
        rabbitmq = self.getRabbitMQStatus()
        mongodb = self.getMongoDBStatus()
        couchdb = self.getCouchDBStatus()
        ioStats = self.getIOStats()
        cpuStats = self.getCPUStats()
        gangliaData = self.getGangliaData()
        cassandraData = self.getCassandraData()
        jvmData = self.getJvmData()
        tomcatData = self.getTomcatData()
        activeMQData = self.getActiveMQData()
        solrData = self.getSolrData()
        memcacheData = self.getMemcacheData()
        dogstreamData = self.getDogstreamData()
        ddforwarderData = self.getDdforwarderData()

        checksData = {
            'collection_timestamp': time.time(),
            'os' : self.os,
            'python': sys.version,
            'agentVersion' : self.agentConfig['version'], 
            'loadAvrg1' : loadAvrgs['1'], 
            'loadAvrg5' : loadAvrgs['5'], 
            'loadAvrg15' : loadAvrgs['15'], 
            'memPhysUsed' : memory.get('physUsed'), 
            'memPhysFree' : memory.get('physFree'), 
            'memPhysTotal' : memory.get('physTotal'), 
            'memPhysUsable' : memory.get('physUsable'), 
            'memSwapUsed' : memory.get('swapUsed'), 
            'memSwapFree' : memory.get('swapFree'), 
            'memSwapTotal' : memory.get('swapTotal'), 
            'memCached' : memory.get('physCached'), 
            'memBuffers': memory.get('physBuffers'),
            'memShared': memory.get('physShared'),
            'networkTraffic' : networkTraffic, 
            'processes' : processes,
            'apiKey': self.agentConfig['apiKey'],
            'events': {},
            'resources': {},
        }

        if diskUsage is not False and len(diskUsage) == 2:
            checksData["diskUsage"] = diskUsage[0]
            checksData["inodes"] = diskUsage[1]
            
        if cpuStats is not False and cpuStats is not None:
            checksData.update(cpuStats)

        if gangliaData is not False and gangliaData is not None:
            checksData['ganglia'] = gangliaData
           
        if cassandraData is not False and cassandraData is not None:
            checksData['cassandra'] = cassandraData
 
        # Apache Status
        if apacheStatus: 
            checksData.update(apacheStatus)
            
        # MySQL Status
        if mysqlStatus:
            checksData.update(mysqlStatus)
       
        # PostgreSQL status
        if pgsqlStatus: 
            checksData['postgresql'] = pgsqlStatus

        # Nginx Status
        if nginxStatus:
            checksData.update(nginxStatus)
            
        # RabbitMQ
        if rabbitmq:
            checksData['rabbitMQ'] = rabbitmq
        
        # MongoDB
        if mongodb:
            if 'events' in mongodb:
                checksData['events']['Mongo'] = mongodb['events']['Mongo']
                del mongodb['events']
            checksData['mongoDB'] = mongodb
            
        # CouchDB
        if couchdb:
            checksData['couchDB'] = couchdb
        
        if ioStats:
            checksData['ioStats'] = ioStats
            
        if jvmData:
            checksData['jvm'] = jvmData

        if tomcatData:
            checksData['tomcat'] = tomcatData

        if activeMQData:
            checksData['activemq'] = activeMQData

        if solrData:
            checksData['solr'] = solrData

        if memcacheData:
            checksData['memcache'] = memcacheData
        
        if dogstreamData:
            dogstreamEvents = dogstreamData.get('dogstreamEvents', None)
            if dogstreamEvents:
                if 'dogstream' in checksData['events']:
                    checksData['events']['dogstream'].extend(dogstreamEvents)
                else:
                    checksData['events']['dogstream'] = dogstreamEvents
                del dogstreamData['dogstreamEvents']

            checksData.update(dogstreamData)

        if ddforwarderData:
            checksData['datadog'] = ddforwarderData
 
        # Include server identifiers
        checksData['internalHostname'] = gethostname(self.agentConfig)
        checksData['uuid'] = getUuid()
        self.checksLogger.debug('doChecks: added uuid %s' % checksData['uuid'])
        
        # Process the event checks. 
        for event_check in self._event_checks:
            event_data = event_check.check(self.checksLogger, self.agentConfig)
            if event_data:
                checksData['events'][event_check.key] = event_data
       
        # Include system stats on first postback
        if firstRun:
            checksData['systemStats'] = systemStats
            # Add static tags from the configuration file
            if self.agentConfig['tags'] is not None:
                checksData['tags'] = self.agentConfig['tags']
            # Also post an event in the newsfeed
            checksData['events']['System'] = [{'api_key': self.agentConfig['apiKey'],
                                               'host': checksData['internalHostname'],
                                               'timestamp': int(time.mktime(datetime.datetime.now().timetuple())),
                                               'event_type':'Agent Startup',
                                               'msg_text': 'Version %s' % get_version()
                                            }]

            # Collect metadata
            checksData['meta'] = self.get_metadata()

        # Resources checks
        has_resource = False
        for resources_check in self._resources_checks:
            resources_check.check()
            snaps = resources_check.pop_snapshots()
            if snaps:
                has_resource = True
                res_value = {'snaps': snaps,
                             'format_version': resources_check.get_format_version()}
                res_format = resources_check.describe_format_if_needed()
                if res_format is not None:
                    res_value['format_description'] = res_format
                checksData['resources'][resources_check.RESOURCE_KEY] = res_value
 
        if has_resource:
            checksData['resources']['meta'] = {
                        'api_key': self.agentConfig['apiKey'],
                        'host': checksData['internalHostname'],
                    }

        metrics = []
        for metrics_check in self._metrics_checks:
            res = metrics_check.check(self.agentConfig)
            if res:
                metrics.extend(res)
        checksData['metrics'] = metrics

        # Send back data
        self.checksLogger.debug("checksData: %s" % checksData)
        for emitter in self.emitters:
            emitter(checksData, self.checksLogger, self.agentConfig)
        self.checksLogger.info("Checks done")
Example #15
0
    def __init__(self, agentConfig, emitters, systemStats, hostname):
        self.emit_duration = None
        self.agentConfig = agentConfig
        self.hostname = hostname
        # system stats is generated by config.get_system_stats
        self.agentConfig['system_stats'] = systemStats
        # agent config is used during checks, system_stats can be accessed through the config
        self.os = get_os()
        self.plugins = None
        self.emitters = emitters
        self.check_timings = agentConfig.get('check_timings')
        self.push_times = {
            'host_metadata': {
                'start': time.time(),
                'interval': int(agentConfig.get('metadata_interval', 4 * 60 * 60))
            },
            'external_host_tags': {
                'start': time.time() - 3 * 60,  # Wait for the checks to init
                'interval': int(agentConfig.get('external_host_tags', 5 * 60))
            },
            'agent_checks': {
                'start': time.time(),
                'interval': int(agentConfig.get('agent_checks_interval', 10 * 60))
            },
            'processes': {
                'start': time.time(),
                'interval': int(agentConfig.get('processes_interval', 60))
            }
        }
        socket.setdefaulttimeout(15)
        self.run_count = 0
        self.continue_running = True
        self.hostname_metadata_cache = None
        self.initialized_checks_d = []
        self.init_failed_checks_d = {}

        if Platform.is_linux() and psutil is not None:
            procfs_path = agentConfig.get('procfs_path', '/proc').rstrip('/')
            psutil.PROCFS_PATH = procfs_path

        # Unix System Checks
        self._unix_system_checks = {
            'io': u.IO(log),
            'load': u.Load(log),
            'memory': u.Memory(log),
            'processes': u.Processes(log),
            'cpu': u.Cpu(log),
            'system': u.System(log)
        }

        # Win32 System Checks
        self._win32_system_checks = {
            'io': w32.IO(log),
            'proc': w32.Processes(log),
            'memory': w32.Memory(log),
            'network': w32.Network(log),
            'cpu': w32.Cpu(log),
            'system': w32.System(log)
        }

        # Old-style metric checks
        self._ganglia = Ganglia(log) if self.agentConfig.get('ganglia_host', '') != '' else None
        self._dogstream = None if self.agentConfig.get('dogstreams') is None else Dogstreams.init(log, self.agentConfig)

        # Agent performance metrics check
        self._agent_metrics = None

        self._metrics_checks = []

        # Custom metric checks
        for module_spec in [s.strip() for s in self.agentConfig.get('custom_checks', '').split(',')]:
            if len(module_spec) == 0:
                continue
            try:
                self._metrics_checks.append(modules.load(module_spec, 'Check')(log))
                log.info("Registered custom check %s" % module_spec)
                log.warning("Old format custom checks are deprecated. They should be moved to the checks.d interface as old custom checks will be removed in a next version")
            except Exception:
                log.exception('Unable to load custom check module %s' % module_spec)
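
The psutil.PROCFS_PATH override near the top of this example is what lets the agent read a non-standard procfs; a minimal illustration (the /host/proc mount point is an assumption, e.g. for a containerized agent on Linux):

import psutil

# Point psutil at a host-mounted procfs instead of the container's own /proc.
psutil.PROCFS_PATH = '/host/proc'
print psutil.cpu_times()  # now sourced from /host/proc (Linux only)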
Example #16
0
class Collector(object):
    def __init__(self, agentConfig, emitters, systemStats, hostname):
        self.ip = get_ip(agentConfig)
        self.emit_duration = None
        self.agentConfig = agentConfig
        self.hostname = hostname
        self.agentConfig['system_stats'] = systemStats
        self.os = get_os()
        self.plugins = None
        self.emitters = emitters
        self.check_timings = agentConfig.get('check_timings')
        self.push_times = {
            'host_metadata': {
                'start': time.time(),
                'interval': int(agentConfig.get('metadata_interval', 4 * 60 * 60))
            },
            'external_host_tags': {
                'start': time.time() - 3 * 60,
                'interval': int(agentConfig.get('external_host_tags', 5 * 60))
            },
            'agent_checks': {
                'start': time.time(),
                'interval': int(agentConfig.get('agent_checks_interval', 10 * 60))
            },
        }
        socket.setdefaulttimeout(15)
        self.run_count = 0
        self.continue_running = True
        self.hostname_metadata_cache = None
        self.initialized_checks_d = []
        self.init_failed_checks_d = {}

        self._unix_system_checks = {
            'io': u.IO(log),
            'load': u.Load(log),
            'memory': u.Memory(log),
            'processes': u.Processes(log),
            'cpu': u.Cpu(log),
            'system': u.System(log)
        }

        self._win32_system_checks = {
            'io': w32.IO(log),
            'proc': w32.Processes(log),
            'memory': w32.Memory(log),
            'network': w32.Network(log),
            'cpu': w32.Cpu(log),
            'system': w32.System(log)
        }

        self._ganglia = Ganglia(log)
        self._monitorstream = monitorstreams.init(log, self.agentConfig)
        self._ddforwarder = DdForwarder(log, self.agentConfig)

        self._agent_metrics = None

        self._metrics_checks = []

        for module_spec in [s.strip() for s in self.agentConfig.get('custom_checks', '').split(',')]:
            if len(module_spec) == 0:
                continue
            try:
                self._metrics_checks.append(modules.load(module_spec, 'Check')(log))
                log.info("Registered custom check %s" % module_spec)
                log.warning(
                    "Old format custom checks are deprecated. They should be moved to the checks.d interface as old custom checks will be removed in a future version")
            except Exception:
                log.exception('Unable to load custom check module %s' % module_spec)

    def stop(self):
        self.continue_running = False
        for check in self.initialized_checks_d:
            check.stop()

    @staticmethod
    def _stats_for_display(raw_stats):
        return pprint.pformat(raw_stats, indent=4)

    @log_exceptions(log)
    def run(self, checksd=None, start_event=True, configs_reloaded=False):
        log.debug("Found {num_checks} checks".format(num_checks=len(checksd['initialized_checks'])))
        timer = Timer()
        if not Platform.is_windows():
            cpu_clock = time.clock()
        self.run_count += 1
        log.debug("Starting collection run #%s" % self.run_count)

        if checksd:
            self.initialized_checks_d = checksd['initialized_checks']
            self.init_failed_checks_d = checksd['init_failed_checks']

        payload = AgentPayload()

        if not self._agent_metrics or configs_reloaded:
            for check in self.initialized_checks_d:
                if check.name == AGENT_METRICS_CHECK_NAME:
                    self._agent_metrics = check
                    self.initialized_checks_d.remove(check)
                    break

        self._build_payload(payload)

        metrics = payload['metrics']
        events = payload['events']
        service_checks = payload['service_checks']

        if Platform.is_windows():
            try:
                metrics.extend(self._win32_system_checks['memory'].check(self.agentConfig))
                metrics.extend(self._win32_system_checks['cpu'].check(self.agentConfig))
                metrics.extend(self._win32_system_checks['network'].check(self.agentConfig))
                metrics.extend(self._win32_system_checks['io'].check(self.agentConfig))
                metrics.extend(self._win32_system_checks['proc'].check(self.agentConfig))
                metrics.extend(self._win32_system_checks['system'].check(self.agentConfig))
            except Exception:
                log.exception('Unable to fetch Windows system metrics.')
        else:
            sys_checks = self._unix_system_checks

            load = sys_checks['load'].check(self.agentConfig)
            payload.update(load)

            system = sys_checks['system'].check(self.agentConfig)
            payload.update(system)

            memory = sys_checks['memory'].check(self.agentConfig)

            if memory:
                memstats = {
                    'memPhysUsed': memory.get('physUsed'),
                    'memPhysPctUsage': memory.get('physPctUsage'),
                    'memPhysFree': memory.get('physFree'),
                    'memPhysTotal': memory.get('physTotal'),
                    'memPhysUsable': memory.get('physUsable'),
                    'memSwapUsed': memory.get('swapUsed'),
                    'memSwapFree': memory.get('swapFree'),
                    'memSwapPctFree': memory.get('swapPctFree'),
                    'memSwapTotal': memory.get('swapTotal'),
                    'memCached': memory.get('physCached'),
                    'memBuffers': memory.get('physBuffers'),
                    'memShared': memory.get('physShared'),
                    'memSlab': memory.get('physSlab'),
                    'memPageTables': memory.get('physPageTables'),
                    'memSwapCached': memory.get('swapCached')
                }
                payload.update(memstats)

            ioStats = sys_checks['io'].check(self.agentConfig)
            if ioStats:
                payload['ioStats'] = ioStats

            processes = sys_checks['processes'].check(self.agentConfig)
            payload.update({'processes': processes})

            cpuStats = sys_checks['cpu'].check(self.agentConfig)
            if cpuStats:
                payload.update(cpuStats)

        gangliaData = self._ganglia.check(self.agentConfig)
        monitorstreamData = self._monitorstream.check(self.agentConfig)
        ddforwarderData = self._ddforwarder.check(self.agentConfig)

        if gangliaData is not False and gangliaData is not None:
            payload['ganglia'] = gangliaData

        if monitorstreamData:
            monitorstreamEvents = monitorstreamData.get('monitorstreamEvents', None)
            if monitorstreamEvents:
                if 'monitorstream' in events:
                    events['monitorstream'].extend(monitorstreamEvents)
                else:
                    events['monitorstream'] = monitorstreamEvents
                del monitorstreamData['monitorstreamEvents']

            payload.update(monitorstreamData)

        if ddforwarderData:
            payload['datamonitor'] = ddforwarderData

        for metrics_check in self._metrics_checks:
            res = metrics_check.check(self.agentConfig)
            if res:
                metrics.extend(res)

        check_statuses = []
        for check in self.initialized_checks_d:
            if not self.continue_running:
                return
            log.info("Running check %s" % check.name)
            instance_statuses = []
            metric_count = 0
            event_count = 0
            service_check_count = 0
            check_start_time = time.time()
            check_stats = None
            current_check_metadata = None  # stays None if the check raises before metadata is collected

            try:
                instance_statuses = check.run()

                current_check_metrics = check.get_metrics()
                current_check_events = check.get_events()
                check_stats = check._get_internal_profiling_stats()

                current_check_metadata = check.get_service_metadata()

                metrics.extend(current_check_metrics)
                if current_check_events:
                    if check.name not in events:
                        events[check.name] = current_check_events
                    else:
                        events[check.name] += current_check_events

                metric_count = len(current_check_metrics)
                event_count = len(current_check_events)

            except Exception:
                log.exception("Error running check %s" % check.name)

            check_status = CheckStatus(
                check.name, instance_statuses, metric_count,
                event_count, service_check_count, service_metadata=current_check_metadata,
                library_versions=check.get_library_info(),
                source_type_name=check.SOURCE_TYPE_NAME or check.name,
                check_stats=check_stats
            )

            service_check_tags = ["check:%s" % check.name]
            if check_status.status == STATUS_OK:
                status = AgentCheck.OK
            elif check_status.status == STATUS_ERROR:
                status = AgentCheck.CRITICAL
            else:
                status = AgentCheck.UNKNOWN  # defensive default; avoids an unbound `status`
            check.service_check('datamonitor.agent.check_status', status, tags=service_check_tags)

            current_check_service_checks = check.get_service_checks()
            if current_check_service_checks:
                service_checks.extend(current_check_service_checks)
            service_check_count = len(current_check_service_checks)

            check_status.service_check_count = service_check_count
            check_statuses.append(check_status)

            check_run_time = time.time() - check_start_time
            log.debug("Check %s ran in %.2f s" % (check.name, check_run_time))

            if self.check_timings:
                metric = 'datamonitor.agent.check_run_time'
                meta = {'tags': ["check:%s" % check.name]}
                metrics.append((metric, time.time(), check_run_time, meta))

        for check_name, info in self.init_failed_checks_d.iteritems():
            if not self.continue_running:
                return
            check_status = CheckStatus(check_name, None, None, None, None,
                                       init_failed_error=info['error'],
                                       init_failed_traceback=info['traceback'])
            check_statuses.append(check_status)

        service_checks.append(create_service_check('datamonitor.agent.up', AgentCheck.OK,
                                                   hostname=self.hostname))

        payload['metrics'] = metrics
        payload['events'] = events
        payload['service_checks'] = service_checks

        self._populate_payload_metadata(payload, check_statuses, start_event)

        collect_duration = timer.step()

        if self._agent_metrics:
            metric_context = {
                'collection_time': collect_duration,
                'emit_time': self.emit_duration,
            }
            if not Platform.is_windows():
                metric_context['cpu_time'] = time.clock() - cpu_clock

            self._agent_metrics.set_metric_context(payload, metric_context)
            self._agent_metrics.run()
            agent_stats = self._agent_metrics.get_metrics()
            payload['metrics'].extend(agent_stats)
            if self.agentConfig.get('developer_mode'):
                log.debug("\n Agent developer mode stats: \n {0}".format(
                    Collector._stats_for_display(agent_stats))
                )
            self._agent_metrics.get_service_metadata()

        emitter_statuses = payload.emit(log, self.agentConfig, self.emitters,
                                        self.continue_running)
        self.emit_duration = timer.step()

        try:
            CollectorStatus(check_statuses, emitter_statuses,
                            self.hostname_metadata_cache).persist()
        except Exception:
            log.exception("Error persisting collector status")

        if self.run_count <= FLUSH_LOGGING_INITIAL or self.run_count % FLUSH_LOGGING_PERIOD == 0:
            log.info("Finished run #%s. Collection time: %ss. Emit time: %ss" %
                     (self.run_count, round(collect_duration, 2), round(self.emit_duration, 2)))
            if self.run_count == FLUSH_LOGGING_INITIAL:
                log.info("First flushes done, next flushes will be logged every %s flushes." %
                         FLUSH_LOGGING_PERIOD)
        else:
            log.debug("Finished run #%s. Collection time: %ss. Emit time: %ss" %
                      (self.run_count, round(collect_duration, 2), round(self.emit_duration, 2)))

        return payload

    @staticmethod
    def run_single_check(check, verbose=True):
        log.info("Running check %s" % check.name)
        instance_statuses = []
        metric_count = 0
        event_count = 0
        service_check_count = 0
        check_stats = None

        try:
            instance_statuses = check.run()

            current_check_metrics = check.get_metrics()
            current_check_events = check.get_events()
            current_service_checks = check.get_service_checks()
            current_service_metadata = check.get_service_metadata()

            check_stats = check._get_internal_profiling_stats()

            metric_count = len(current_check_metrics)
            event_count = len(current_check_events)
            service_check_count = len(current_service_checks)

            print "Metrics: \n{0}".format(pprint.pformat(current_check_metrics))
            print "Events: \n{0}".format(pprint.pformat(current_check_events))
            print "Service Checks: \n{0}".format(pprint.pformat(current_service_checks))
            print "Service Metadata: \n{0}".format(pprint.pformat(current_service_metadata))

        except Exception:
            log.exception("Error running check %s" % check.name)

        check_status = CheckStatus(
            check.name, instance_statuses, metric_count,
            event_count, service_check_count,
            library_versions=check.get_library_info(),
            source_type_name=check.SOURCE_TYPE_NAME or check.name,
            check_stats=check_stats
        )

        return check_status

    def _emit(self, payload):
        statuses = []
        for emitter in self.emitters:
            if not self.continue_running:
                return statuses
            name = emitter.__name__
            emitter_status = EmitterStatus(name)
            try:
                emitter(payload, log, self.agentConfig)
            except Exception as e:
                log.exception("Error running emitter: %s" % emitter.__name__)
                emitter_status = EmitterStatus(name, e)
            statuses.append(emitter_status)
        return statuses
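
As _emit above shows, emitters are plain callables with the signature emitter(payload, logger, agentConfig). A minimal illustrative emitter, assuming a dict-like payload (not one of the agent's real emitters), might be:

import json

def logging_emitter(payload, logger, agentConfig):
    # Summarize instead of POSTing the payload to an intake endpoint.
    logger.info("would emit %d metrics, %d event groups",
                len(payload.get('metrics', [])),
                len(payload.get('events', {})))
    logger.debug(json.dumps(payload)[:500])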
Example #17
0
    def __init__(self, agentConfig, emitters, systemStats, hostname):
        self.ip = get_ip(agentConfig)
        self.emit_duration = None
        self.agentConfig = agentConfig
        self.hostname = hostname
        self.agentConfig['system_stats'] = systemStats
        self.os = get_os()
        self.plugins = None
        self.emitters = emitters
        self.check_timings = agentConfig.get('check_timings')
        self.push_times = {
            'host_metadata': {
                'start': time.time(),
                'interval': int(agentConfig.get('metadata_interval', 4 * 60 * 60))
            },
            'external_host_tags': {
                'start': time.time() - 3 * 60,
                'interval': int(agentConfig.get('external_host_tags', 5 * 60))
            },
            'agent_checks': {
                'start': time.time(),
                'interval': int(agentConfig.get('agent_checks_interval', 10 * 60))
            },
        }
        socket.setdefaulttimeout(15)
        self.run_count = 0
        self.continue_running = True
        self.hostname_metadata_cache = None
        self.initialized_checks_d = []
        self.init_failed_checks_d = {}

        self._unix_system_checks = {
            'io': u.IO(log),
            'load': u.Load(log),
            'memory': u.Memory(log),
            'processes': u.Processes(log),
            'cpu': u.Cpu(log),
            'system': u.System(log)
        }

        self._win32_system_checks = {
            'io': w32.IO(log),
            'proc': w32.Processes(log),
            'memory': w32.Memory(log),
            'network': w32.Network(log),
            'cpu': w32.Cpu(log),
            'system': w32.System(log)
        }

        self._ganglia = Ganglia(log)
        self._monitorstream = monitorstreams.init(log, self.agentConfig)
        self._ddforwarder = DdForwarder(log, self.agentConfig)

        self._agent_metrics = None

        self._metrics_checks = []

        for module_spec in [s.strip() for s in self.agentConfig.get('custom_checks', '').split(',')]:
            if len(module_spec) == 0:
                continue
            try:
                self._metrics_checks.append(modules.load(module_spec, 'Check')(log))
                log.info("Registered custom check %s" % module_spec)
                log.warning(
                    "Old format custom checks are deprecated. They should be moved to the checks.d interface as old custom checks will be removed in a next version")
            except Exception:
                log.exception('Unable to load custom check module %s' % module_spec)
Example #18
0
    def __init__(self, agentConfig, emitters, systemStats, hostname):
        self.emit_duration = None
        self.agentConfig = agentConfig
        self.hostname = hostname
        # system stats is generated by config.get_system_stats
        self.agentConfig['system_stats'] = systemStats
        # agent config is used during checks, system_stats can be accessed through the config
        self.os = get_os()
        self.plugins = None
        self.emitters = emitters
        self.check_timings = agentConfig.get('check_timings')
        self.push_times = {
            'host_metadata': {
                'start': time.time(),
                'interval': int(agentConfig.get('metadata_interval', 4 * 60 * 60))
            },
            'external_host_tags': {
                'start': time.time() - 3 * 60,  # Wait for the checks to init
                'interval': int(agentConfig.get('external_host_tags', 5 * 60))
            },
            'agent_checks': {
                'start': time.time(),
                'interval': int(agentConfig.get('agent_checks_interval', 10 * 60))
            },
            'processes': {
                'start': time.time(),
                'interval': int(agentConfig.get('processes_interval', 60))
            }
        }
        socket.setdefaulttimeout(15)
        self.run_count = 0
        self.continue_running = True
        self.hostname_metadata_cache = None
        self.initialized_checks_d = []
        self.init_failed_checks_d = {}

        # Unix System Checks
        self._unix_system_checks = {
            'io': u.IO(log),
            'load': u.Load(log),
            'memory': u.Memory(log),
            'processes': u.Processes(log),
            'cpu': u.Cpu(log),
            'system': u.System(log)
        }

        # Win32 System Checks
        self._win32_system_checks = {
            'io': w32.IO(log),
            'proc': w32.Processes(log),
            'memory': w32.Memory(log),
            'network': w32.Network(log),
            'cpu': w32.Cpu(log),
            'system': w32.System(log)
        }

        # Old-style metric checks
        self._ganglia = Ganglia(log)
        self._dogstream = Dogstreams.init(log, self.agentConfig)
        self._ddforwarder = DdForwarder(log, self.agentConfig)

        # Agent performance metrics check
        self._agent_metrics = None

        self._metrics_checks = []

        # Custom metric checks
        for module_spec in [s.strip() for s in self.agentConfig.get('custom_checks', '').split(',')]:
            if len(module_spec) == 0:
                continue
            try:
                self._metrics_checks.append(modules.load(module_spec, 'Check')(log))
                log.info("Registered custom check %s" % module_spec)
                log.warning("Old format custom checks are deprecated. They should be moved to the checks.d interface as old custom checks will be removed in a next version")
            except Exception:
                log.exception('Unable to load custom check module %s' % module_spec)
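The push_times table above drives how often the slower payload sections (host metadata, external host tags, agent check statuses, processes) are attached to a run: each entry records the time of the last push and an interval in seconds. Note that external_host_tags is seeded three minutes in the past, so its first push comes roughly two minutes after startup rather than after the full five-minute interval, once the checks have initialized. The constructors never show the consumer of this table, so here is a hedged sketch of how such a gate might work; the helper name should_push is an illustration, not the agent's actual method:

import time

def should_push(push_times, name, now=None):
    # Return True when `interval` seconds have passed since the last push
    # recorded in `start`, and reset the timer so the next window starts now.
    now = now if now is not None else time.time()
    entry = push_times[name]
    if now - entry['start'] >= entry['interval']:
        entry['start'] = now
        return True
    return False

push_times = {
    'host_metadata': {'start': time.time(), 'interval': 4 * 60 * 60},
    'agent_checks': {'start': time.time(), 'interval': 10 * 60},
}

# Usage: a collection run would attach host metadata only when its
# interval has elapsed since the previous push.
if should_push(push_times, 'host_metadata'):
    print("attach host metadata to this payload")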
Example #19
0
class Collector(object):
    """
    The collector is responsible for collecting data from each check and
    passing it along to the emitters, who send it to their final destination.
    """

    def __init__(self, agentConfig, emitters, systemStats, hostname):
        self.emit_duration = None
        self.agentConfig = agentConfig
        self.hostname = hostname
        # System stats are generated by config.get_system_stats
        self.agentConfig["system_stats"] = systemStats
        # The agent config is used during checks; system_stats can be accessed through it
        self.os = get_os()
        self.plugins = None
        self.emitters = emitters
        self.check_timings = agentConfig.get("check_timings")
        self.push_times = {
            "host_metadata": {"start": time.time(), "interval": int(agentConfig.get("metadata_interval", 4 * 60 * 60))},
            "external_host_tags": {
                "start": time.time() - 3 * 60,  # Wait for the checks to init
                "interval": int(agentConfig.get("external_host_tags", 5 * 60)),
            },
            "agent_checks": {"start": time.time(), "interval": int(agentConfig.get("agent_checks_interval", 10 * 60))},
        }
        socket.setdefaulttimeout(15)
        self.run_count = 0
        self.continue_running = True
        self.hostname_metadata_cache = None
        self.initialized_checks_d = []
        self.init_failed_checks_d = {}

        # Unix System Checks
        self._unix_system_checks = {
            "io": u.IO(log),
            "load": u.Load(log),
            "memory": u.Memory(log),
            "processes": u.Processes(log),
            "cpu": u.Cpu(log),
            "system": common.System(log),
        }

        # Win32 System Checks
        self._win32_system_checks = {
            "io": w32.IO(log),
            "proc": w32.Processes(log),
            "memory": w32.Memory(log),
            "network": w32.Network(log),
            "cpu": w32.Cpu(log),
            "system": common.System(log),
        }

        # Old-style metric checks
        self._ganglia = Ganglia(log)
        self._dogstream = Dogstreams.init(log, self.agentConfig)
        self._ddforwarder = DdForwarder(log, self.agentConfig)

        # Agent performance metrics check
        self._agent_metrics = None

        self._metrics_checks = []

        # Custom metric checks
        for module_spec in [s.strip() for s in self.agentConfig.get("custom_checks", "").split(",")]:
            if len(module_spec) == 0:
                continue
            try:
                self._metrics_checks.append(modules.load(module_spec, "Check")(log))
                log.info("Registered custom check %s" % module_spec)
                log.warning(
                    "Old-format custom checks are deprecated. They should be moved to the checks.d interface, as old custom checks will be removed in a future version"
                )
            except Exception:
                log.exception("Unable to load custom check module %s" % module_spec)

        # Resource Checks
        self._resources_checks = [ResProcesses(log, self.agentConfig)]

    def stop(self):
        """
        Tell the collector to stop at the next logical point.
        """
        # This is called when the process is being killed, so
        # try to stop the collector as soon as possible.
        # Most importantly, don't try to submit to the emitters,
        # because the forwarder is quite possibly already killed,
        # in which case we'd only get a misleading error in the logs.
        # Best not to even try.
        self.continue_running = False
        for check in self.initialized_checks_d:
            check.stop()

    @staticmethod
    def _stats_for_display(raw_stats):
        return pprint.pformat(raw_stats, indent=4)

    @log_exceptions(log)
    def run(self, checksd=None, start_event=True, configs_reloaded=False):
        """
        Collect data from each check and submit their data.
        """
        log.debug("Found {num_checks} checks".format(num_checks=len(checksd["initialized_checks"])))
        timer = Timer()
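        # On POSIX we snapshot time.clock() (process CPU time) so the run's
        # CPU cost can be reported as cpu_time in the agent metrics context.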
        if not Platform.is_windows():
            cpu_clock = time.clock()
        self.run_count += 1
        log.debug("Starting collection run #%s" % self.run_count)

        if checksd:
            self.initialized_checks_d = checksd["initialized_checks"]  # is a list of AgentCheck instances
            self.init_failed_checks_d = checksd["init_failed_checks"]  # is of type {check_name: {error, traceback}}

        payload = AgentPayload()

        # Find the AgentMetrics check and pop it out
        # This check must run at the end of the loop to collect info on agent performance
        if not self._agent_metrics or configs_reloaded:
            for check in self.initialized_checks_d:
                if check.name == AGENT_METRICS_CHECK_NAME:
                    self._agent_metrics = check
                    self.initialized_checks_d.remove(check)
                    break

        # Initialize payload
        self._build_payload(payload)

        metrics = payload["metrics"]
        events = payload["events"]
        service_checks = payload["service_checks"]

        # Run the system checks. Checks will depend on the OS
        if Platform.is_windows():
            # Win32 system checks
            try:
                metrics.extend(self._win32_system_checks["memory"].check(self.agentConfig))
                metrics.extend(self._win32_system_checks["cpu"].check(self.agentConfig))
                metrics.extend(self._win32_system_checks["network"].check(self.agentConfig))
                metrics.extend(self._win32_system_checks["io"].check(self.agentConfig))
                metrics.extend(self._win32_system_checks["proc"].check(self.agentConfig))
            except Exception:
                log.exception("Unable to fetch Windows system metrics.")
        else:
            # Unix system checks
            sys_checks = self._unix_system_checks

            load = sys_checks["load"].check(self.agentConfig)
            payload.update(load)

            system = sys_checks["system"].check(self.agentConfig)
            payload.update(system)

            memory = sys_checks["memory"].check(self.agentConfig)

            if memory:
                memstats = {
                    "memPhysUsed": memory.get("physUsed"),
                    "memPhysPctUsable": memory.get("physPctUsable"),
                    "memPhysFree": memory.get("physFree"),
                    "memPhysTotal": memory.get("physTotal"),
                    "memPhysUsable": memory.get("physUsable"),
                    "memSwapUsed": memory.get("swapUsed"),
                    "memSwapFree": memory.get("swapFree"),
                    "memSwapPctFree": memory.get("swapPctFree"),
                    "memSwapTotal": memory.get("swapTotal"),
                    "memCached": memory.get("physCached"),
                    "memBuffers": memory.get("physBuffers"),
                    "memShared": memory.get("physShared"),
                    "memSlab": memory.get("physSlab"),
                    "memPageTables": memory.get("physPageTables"),
                    "memSwapCached": memory.get("swapCached"),
                }
                payload.update(memstats)

            ioStats = sys_checks["io"].check(self.agentConfig)
            if ioStats:
                payload["ioStats"] = ioStats

            processes = sys_checks["processes"].check(self.agentConfig)
            payload.update({"processes": processes})

            cpuStats = sys_checks["cpu"].check(self.agentConfig)
            if cpuStats:
                payload.update(cpuStats)

        # Run old-style checks
        gangliaData = self._ganglia.check(self.agentConfig)
        dogstreamData = self._dogstream.check(self.agentConfig)
        ddforwarderData = self._ddforwarder.check(self.agentConfig)

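        # The ganglia check can return False or None instead of an XML payload,
        # so test both sentinels explicitly rather than relying on truthiness.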
        if gangliaData is not False and gangliaData is not None:
            payload["ganglia"] = gangliaData

        # dogstream
        if dogstreamData:
            dogstreamEvents = dogstreamData.get("dogstreamEvents", None)
            if dogstreamEvents:
                if "dogstream" in payload["events"]:
                    events["dogstream"].extend(dogstreamEvents)
                else:
                    events["dogstream"] = dogstreamEvents
                del dogstreamData["dogstreamEvents"]

            payload.update(dogstreamData)

        # metrics about the forwarder
        if ddforwarderData:
            payload["datadog"] = ddforwarderData

        # Resources checks
        if not Platform.is_windows():
            has_resource = False
            for resources_check in self._resources_checks:
                try:
                    resources_check.check()
                    snaps = resources_check.pop_snapshots()
                    if snaps:
                        has_resource = True
                        res_value = {"snaps": snaps, "format_version": resources_check.get_format_version()}
                        res_format = resources_check.describe_format_if_needed()
                        if res_format is not None:
                            res_value["format_description"] = res_format
                        payload["resources"][resources_check.RESOURCE_KEY] = res_value
                except Exception:
                    log.exception("Error running resource check %s" % resources_check.RESOURCE_KEY)

            if has_resource:
                payload["resources"]["meta"] = {
                    "api_key": self.agentConfig["api_key"],
                    "host": payload["internalHostname"],
                }

        # newer-style checks (not checks.d style)
        for metrics_check in self._metrics_checks:
            res = metrics_check.check(self.agentConfig)
            if res:
                metrics.extend(res)

        # checks.d checks
        check_statuses = []
        for check in self.initialized_checks_d:
            if not self.continue_running:
                return
            log.info("Running check %s" % check.name)
            instance_statuses = []
            metric_count = 0
            event_count = 0
            service_check_count = 0
            check_start_time = time.time()
            check_stats = None
            # Initialize so the CheckStatus below never sees an unbound name
            # if check.run() raises before the metadata is collected.
            current_check_metadata = None

            try:
                # Run the check.
                instance_statuses = check.run()

                # Collect the metrics and events.
                current_check_metrics = check.get_metrics()
                current_check_events = check.get_events()
                check_stats = check._get_internal_profiling_stats()

                # Collect metadata
                current_check_metadata = check.get_service_metadata()

                # Save metrics & events for the payload.
                metrics.extend(current_check_metrics)
                if current_check_events:
                    if check.name not in events:
                        events[check.name] = current_check_events
                    else:
                        events[check.name] += current_check_events

                # Save the status of the check.
                metric_count = len(current_check_metrics)
                event_count = len(current_check_events)

            except Exception:
                log.exception("Error running check %s" % check.name)

            check_status = CheckStatus(
                check.name,
                instance_statuses,
                metric_count,
                event_count,
                service_check_count,
                service_metadata=current_check_metadata,
                library_versions=check.get_library_info(),
                source_type_name=check.SOURCE_TYPE_NAME or check.name,
                check_stats=check_stats,
            )

            # Service check reporting the status of each Agent check
            service_check_tags = ["check:%s" % check.name]
            if check_status.status == STATUS_OK:
                status = AgentCheck.OK
            elif check_status.status == STATUS_ERROR:
                status = AgentCheck.CRITICAL
            else:
                status = AgentCheck.UNKNOWN
            check.service_check("datadog.agent.check_status", status, tags=service_check_tags)

            # Collect the service checks and save them in the payload
            current_check_service_checks = check.get_service_checks()
            if current_check_service_checks:
                service_checks.extend(current_check_service_checks)
            service_check_count = len(current_check_service_checks)

            # Update the check status with the correct service_check_count
            check_status.service_check_count = service_check_count
            check_statuses.append(check_status)

            check_run_time = time.time() - check_start_time
            log.debug("Check %s ran in %.2f s" % (check.name, check_run_time))

            # Instrument check run timings if enabled.
            if self.check_timings:
                metric = "datadog.agent.check_run_time"
                meta = {"tags": ["check:%s" % check.name]}
                metrics.append((metric, time.time(), check_run_time, meta))

        for check_name, info in self.init_failed_checks_d.iteritems():
            if not self.continue_running:
                return
            check_status = CheckStatus(
                check_name,
                None,
                None,
                None,
                None,
                init_failed_error=info["error"],
                init_failed_traceback=info["traceback"],
            )
            check_statuses.append(check_status)

        # Add a service check for the agent
        service_checks.append(create_service_check("datadog.agent.up", AgentCheck.OK, hostname=self.hostname))

        # Store the metrics and events in the payload.
        payload["metrics"] = metrics
        payload["events"] = events
        payload["service_checks"] = service_checks

        # Populate metadata
        self._populate_payload_metadata(payload, check_statuses, start_event)

        collect_duration = timer.step()

        if self._agent_metrics:
            metric_context = {"collection_time": collect_duration, "emit_time": self.emit_duration}
            if not Platform.is_windows():
                metric_context["cpu_time"] = time.clock() - cpu_clock

            self._agent_metrics.set_metric_context(payload, metric_context)
            self._agent_metrics.run()
            agent_stats = self._agent_metrics.get_metrics()
            payload["metrics"].extend(agent_stats)
            if self.agentConfig.get("developer_mode"):
                log.debug("\n Agent developer mode stats: \n {0}".format(Collector._stats_for_display(agent_stats)))

        # Let's send our payload
        emitter_statuses = payload.emit(log, self.agentConfig, self.emitters, self.continue_running)
        self.emit_duration = timer.step()

        # Persist the status of the collection run.
        try:
            CollectorStatus(check_statuses, emitter_statuses, self.hostname_metadata_cache).persist()
        except Exception:
            log.exception("Error persisting collector status")

        if self.run_count <= FLUSH_LOGGING_INITIAL or self.run_count % FLUSH_LOGGING_PERIOD == 0:
            log.info(
                "Finished run #%s. Collection time: %ss. Emit time: %ss"
                % (self.run_count, round(collect_duration, 2), round(self.emit_duration, 2))
            )
            if self.run_count == FLUSH_LOGGING_INITIAL:
                log.info("First flushes done, next flushes will be logged every %s flushes." % FLUSH_LOGGING_PERIOD)
        else:
            log.debug(
                "Finished run #%s. Collection time: %ss. Emit time: %ss"
                % (self.run_count, round(collect_duration, 2), round(self.emit_duration, 2))
            )

        return payload

    @staticmethod
    def run_single_check(check, verbose=True):
        log.info("Running check %s" % check.name)
        instance_statuses = []
        metric_count = 0
        event_count = 0
        service_check_count = 0
        check_stats = None

        try:
            # Run the check.
            instance_statuses = check.run()

            # Collect the metrics and events.
            current_check_metrics = check.get_metrics()
            current_check_events = check.get_events()
            current_service_checks = check.get_service_checks()
            current_service_metadata = check.get_service_metadata()

            check_stats = check._get_internal_profiling_stats()

            # Save the status of the check.
            metric_count = len(current_check_metrics)
            event_count = len(current_check_events)
            service_check_count = len(current_service_checks)

            print "Metrics: \n{0}".format(pprint.pformat(current_check_metrics))
            print "Events: \n{0}".format(pprint.pformat(current_check_events))
            print "Service Checks: \n{0}".format(pprint.pformat(current_service_checks))
            print "Service Metadata: \n{0}".format(pprint.pformat(current_service_metadata))

        except Exception:
            log.exception("Error running check %s" % check.name)

        check_status = CheckStatus(
            check.name,
            instance_statuses,
            metric_count,
            event_count,
            service_check_count,
            library_versions=check.get_library_info(),
            source_type_name=check.SOURCE_TYPE_NAME or check.name,
            check_stats=check_stats,
        )

        return check_status

    def _emit(self, payload):
        """ Send the payload via the emitters. """
        statuses = []
        for emitter in self.emitters:
            # Don't try to send to an emitter if we're stopping.
            if not self.continue_running:
                return statuses
            name = emitter.__name__
            emitter_status = EmitterStatus(name)
            try:
                emitter(payload, log, self.agentConfig)
            except Exception, e:
                log.exception("Error running emitter: %s" % emitter.__name__)
                emitter_status = EmitterStatus(name, e)
            statuses.append(emitter_status)
        return statuses
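_emit above treats each emitter as a plain callable invoked as emitter(payload, log, agentConfig), using emitter.__name__ for status reporting and wrapping any exception in an EmitterStatus. A minimal, hypothetical emitter satisfying that contract (real emitters forward the payload to the intake over HTTP; this one only logs):

import logging

def print_emitter(payload, log, agent_config):
    # Any callable with this (payload, log, agentConfig) signature can sit
    # in the collector's emitters list; exceptions are caught by _emit.
    log.info("Emitting payload with %d top-level keys", len(payload))

if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    print_emitter({'metrics': [], 'events': {}}, logging.getLogger('emitter'), {})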