Example #1
    def __init__(self,
                 port,
                 agentConfig,
                 watchdog=True,
                 skip_ssl_validation=False):
        self._port = int(port)
        self._agentConfig = agentConfig
        self._metrics = {}
        MetricTransaction.set_application(self)
        MetricTransaction.set_endpoints()
        self._tr_manager = TransactionManager(MAX_WAIT_FOR_REPLAY,
                                              MAX_QUEUE_SIZE, THROTTLING_DELAY)
        MetricTransaction.set_tr_manager(self._tr_manager)

        self._watchdog = None
        self.skip_ssl_validation = skip_ssl_validation or agentConfig.get(
            'skip_ssl_validation', False)
        if self.skip_ssl_validation:
            log.info(
                "Skipping SSL hostname validation, useful when using a transparent proxy"
            )

        if watchdog:
            watchdog_timeout = TRANSACTION_FLUSH_INTERVAL * WATCHDOG_INTERVAL_MULTIPLIER
            self._watchdog = Watchdog(watchdog_timeout,
                                      max_mem_mb=agentConfig.get(
                                          'limit_memory_consumption', None))
Example #2
    def __init__(self,
                 interval,
                 metrics_aggregator,
                 api_host,
                 api_key=None,
                 use_watchdog=False):
        threading.Thread.__init__(self)
        self.daemon = True
        self.interval = int(interval)
        self.finished = threading.Event()
        self.metrics_aggregator = metrics_aggregator
        self.flush_count = 0

        self.watchdog = None
        if use_watchdog:
            from util import Watchdog
            self.watchdog = Watchdog(WATCHDOG_TIMEOUT)

        self.api_key = api_key
        self.api_host = api_host

        self.http_conn_cls = http_client.HTTPSConnection

        match = re.match('^(https?)://(.*)', api_host)

        if match:
            self.api_host = match.group(2)
            if match.group(1) == 'http':
                self.http_conn_cls = http_client.HTTPConnection
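The scheme-sniffing block at the end of this constructor recurs in most Reporter variants below: it strips an explicit http:// or https:// prefix off api_host and downgrades the connection class for plain HTTP. A quick illustration of the regex itself (standard re semantics; the host value is illustrative):

    import re

    match = re.match('^(https?)://(.*)', 'http://localhost:17123')
    assert match.group(1) == 'http'             # plain HTTP -> use HTTPConnection
    assert match.group(2) == 'localhost:17123'  # scheme stripped from the host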
Example #3
    def _get_watchdog(self, check_freq, agentConfig):
        watchdog = None
        if agentConfig.get("watchdog", True):
            watchdog = Watchdog(check_freq * WATCHDOG_MULTIPLIER,
                                max_mem_mb=agentConfig.get('limit_memory_consumption', None))
            watchdog.reset()
        return watchdog
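For context, a call site for this helper would presumably sit in a collector main loop like Example #5's. The sketch below is only illustrative: check_freq and agentConfig are the snippet's own names, while run_forever and the loop body mirror Example #5.

    # Hypothetical driver loop for the helper above (assumes it runs inside
    # the same class, with time imported at module level).
    watchdog = self._get_watchdog(check_freq, agentConfig)
    while run_forever:
        if watchdog is not None:
            watchdog.reset()  # prove liveness once per cycle
        time.sleep(check_freq)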
Example #4
    def __init__(self, port, agentConfig, watchdog=True):
        self._port = int(port)
        self._agentConfig = agentConfig
        self._metrics = {}
        MetricTransaction.set_application(self)
        MetricTransaction.set_endpoints()
        self._tr_manager = TransactionManager(MAX_WAIT_FOR_REPLAY,
                                              MAX_QUEUE_SIZE, THROTTLING_DELAY)
        MetricTransaction.set_tr_manager(self._tr_manager)

        self._watchdog = None
        if watchdog:
            watchdog_timeout = TRANSACTION_FLUSH_INTERVAL * WATCHDOG_INTERVAL_MULTIPLIER
            self._watchdog = Watchdog(watchdog_timeout)
Example #5
    def run(self, agentConfig=None, run_forever=True):
        """Main loop of the collector"""
        agentLogger = logging.getLogger('agent')
        systemStats = get_system_stats()
        agentLogger.debug('System Properties: ' + str(systemStats))

        if agentConfig is None:
            agentConfig = get_config()

        # Load the checks.d checks
        checksd = load_check_directory(agentConfig)

        # Try to fetch the instance id from EC2 if no hostname has been set
        # in the config file.
        # DEPRECATED
        if agentConfig.get('hostname') is None and agentConfig.get('use_ec2_instance_id'):
            instanceId = EC2.get_instance_id()
            if instanceId is not None:
                agentLogger.info("Running on EC2, instanceId: %s" % instanceId)
                agentConfig['hostname'] = instanceId
            else:
                agentLogger.info('Not running on EC2, using hostname to identify this server')

        emitters = [http_emitter]
        for emitter_spec in [s.strip() for s in agentConfig.get('custom_emitters', '').split(',')]:
            if not emitter_spec:
                continue
            emitters.append(modules.load(emitter_spec, 'emitter'))

        check_freq = int(agentConfig['check_freq'])

        # Checks instance
        c = checks(agentConfig, emitters)

        # Watchdog
        watchdog = None
        if agentConfig.get("watchdog", True):
            watchdog = Watchdog(check_freq * WATCHDOG_MULTIPLIER)
            watchdog.reset()

        # Run checks once, to get once-in-a-run data
        c.doChecks(True, systemStats, checksd)

        # Main loop
        while run_forever:
            if watchdog is not None:
                watchdog.reset()
            time.sleep(check_freq)
            c.doChecks(checksd=checksd)
Example #6
    def __init__(self, interval, metrics_aggregator, api_host, api_key=None, use_watchdog=False, event_chunk_size=None):
        threading.Thread.__init__(self)
        self.interval = int(interval)
        self.finished = threading.Event()
        self.metrics_aggregator = metrics_aggregator
        self.flush_count = 0
        self.log_count = 0

        self.watchdog = None
        if use_watchdog:
            from util import Watchdog
            self.watchdog = Watchdog(WATCHDOG_TIMEOUT)

        self.api_key = api_key
        self.api_host = api_host
        self.event_chunk_size = event_chunk_size or EVENT_CHUNK_SIZE

        self.http_conn_cls = http_client.HTTPSConnection

        match = re.match('^(https?)://(.*)', api_host)

        if match:
            self.api_host = match.group(2)
            if match.group(1) == 'http':
                self.http_conn_cls = http_client.HTTPConnection
Example #7
    def __init__(self, port, agentConfig, watchdog=True,
                 skip_ssl_validation=False, use_simple_http_client=False):
        self._port = int(port)
        self._agentConfig = agentConfig
        self._metrics = {}
        AgentTransaction.set_application(self)
        AgentTransaction.set_endpoints(agentConfig['endpoints'])
        AgentTransaction.set_request_timeout(agentConfig['forwarder_timeout'])

        max_parallelism = self.NO_PARALLELISM
        # Multiple endpoints => enable parallelism
        if len(agentConfig['endpoints']) > 1:
            max_parallelism = self.DEFAULT_PARALLELISM

        self._tr_manager = TransactionManager(MAX_WAIT_FOR_REPLAY,
                                              MAX_QUEUE_SIZE, THROTTLING_DELAY,
                                              max_parallelism=max_parallelism)
        AgentTransaction.set_tr_manager(self._tr_manager)

        self._watchdog = None
        self.skip_ssl_validation = skip_ssl_validation or agentConfig.get('skip_ssl_validation', False)
        self.use_simple_http_client = use_simple_http_client
        if self.skip_ssl_validation:
            log.info("Skipping SSL hostname validation, useful when using a transparent proxy")

        # Monitor activity
        if watchdog:
            watchdog_timeout = TRANSACTION_FLUSH_INTERVAL * WATCHDOG_INTERVAL_MULTIPLIER / 1000
            self._watchdog = Watchdog(
                watchdog_timeout,
                max_mem_mb=agentConfig.get('limit_memory_consumption', None),
                max_resets=WATCHDOG_HIGH_ACTIVITY_THRESHOLD
            )
Example #8
    def __init__(self, interval, metrics_aggregator, api_host, api_key=None, use_watchdog=False, event_chunk_size=None):
        threading.Thread.__init__(self)
        self.interval = int(interval)
        self.finished = threading.Event()
        self.metrics_aggregator = metrics_aggregator
        self.flush_count = 0
        self.log_count = 0

        self.watchdog = None
        if use_watchdog:
            from util import Watchdog
            self.watchdog = Watchdog(WATCHDOG_TIMEOUT)

        self.api_key = api_key
        self.api_host = api_host
        self.event_chunk_size = event_chunk_size or EVENT_CHUNK_SIZE
Example #9
    def __init__(self,
                 port,
                 agentConfig,
                 watchdog=True,
                 skip_ssl_validation=False,
                 use_simple_http_client=False):
        self._port = int(port)
        self._agentConfig = agentConfig
        self._metrics = {}
        AgentTransaction.set_application(self)
        AgentTransaction.set_endpoints()
        self._tr_manager = TransactionManager(MAX_WAIT_FOR_REPLAY,
                                              MAX_QUEUE_SIZE, THROTTLING_DELAY)
        AgentTransaction.set_tr_manager(self._tr_manager)

        self._watchdog = None
        self.skip_ssl_validation = skip_ssl_validation or agentConfig.get(
            'skip_ssl_validation', False)
        self.use_simple_http_client = use_simple_http_client
        if self.skip_ssl_validation:
            log.info(
                "Skipping SSL hostname validation, useful when using a transparent proxy"
            )

        # Monitor activity
        if watchdog:
            watchdog_timeout = TRANSACTION_FLUSH_INTERVAL * WATCHDOG_INTERVAL_MULTIPLIER / 1000
            self._watchdog = Watchdog(
                watchdog_timeout,
                max_mem_mb=agentConfig.get('limit_memory_consumption', None),
                max_resets=WATCHDOG_HIGH_ACTIVITY_THRESHOLD)
Example #10
    def use_lots_of_memory(self):
        # Skip this step on Travis
        if os.environ.get('TRAVIS', False):
            return
        a = Application(12345, {})
        a._watchdog = Watchdog(30, 50)  # 30s timeout, 50 MB limit (positional max_mem_mb)
        a._tr_manager = MemoryHogTxManager()
        a.run()
Example #11
    def __init__(self, port, agentConfig):
        self._port = port
        self._agentConfig = agentConfig
        self._metrics = {}
        self._watchdog = Watchdog(TRANSACTION_FLUSH_INTERVAL * WATCHDOG_INTERVAL_MULTIPLIER)
        MetricTransaction.set_application(self)
        self._tr_manager = TransactionManager(MAX_WAIT_FOR_REPLAY,
                                              MAX_QUEUE_SIZE, THROTTLING_DELAY)
        MetricTransaction.set_tr_manager(self._tr_manager)
Example #12
    def __init__(self,
                 port,
                 agentConfig,
                 watchdog=True,
                 skip_ssl_validation=False,
                 use_simple_http_client=False):
        self._port = int(port)
        self._agentConfig = agentConfig
        self._metrics = {}
        AgentTransaction.set_application(self)
        AgentTransaction.set_endpoints(agentConfig['endpoints'])
        if agentConfig['endpoints'] == {}:
            log.warning(
                u"No valid endpoint found. Forwarder will drop all incoming payloads."
            )
        AgentTransaction.set_request_timeout(agentConfig['forwarder_timeout'])

        max_parallelism = self.NO_PARALLELISM
        # Multiple endpoints => enable parallelism
        if len(agentConfig['endpoints']) > 1:
            max_parallelism = self.DEFAULT_PARALLELISM

        self._tr_manager = TransactionManager(MAX_WAIT_FOR_REPLAY,
                                              MAX_QUEUE_SIZE,
                                              THROTTLING_DELAY,
                                              max_parallelism=max_parallelism)
        AgentTransaction.set_tr_manager(self._tr_manager)

        self._watchdog = None
        self.skip_ssl_validation = skip_ssl_validation or agentConfig.get(
            'skip_ssl_validation', False)
        self.use_simple_http_client = use_simple_http_client
        if self.skip_ssl_validation:
            log.info(
                "Skipping SSL hostname validation, useful when using a transparent proxy"
            )

        # Monitor activity
        if watchdog:
            watchdog_timeout = TRANSACTION_FLUSH_INTERVAL * WATCHDOG_INTERVAL_MULTIPLIER / 1000
            self._watchdog = Watchdog(
                watchdog_timeout,
                max_mem_mb=agentConfig.get('limit_memory_consumption', None),
                max_resets=WATCHDOG_HIGH_ACTIVITY_THRESHOLD)
Example #13
    def __init__(self, port, agentConfig, watchdog=True):
        self._port = int(port)
        self._agentConfig = agentConfig
        self._metrics = {}
        MetricTransaction.set_application(self)
        MetricTransaction.set_endpoints()
        self._tr_manager = TransactionManager(MAX_WAIT_FOR_REPLAY, MAX_QUEUE_SIZE, THROTTLING_DELAY)
        MetricTransaction.set_tr_manager(self._tr_manager)

        self._watchdog = None
        if watchdog:
            watchdog_timeout = TRANSACTION_FLUSH_INTERVAL * WATCHDOG_INTERVAL_MULTIPLIER
            self._watchdog = Watchdog(watchdog_timeout)
Example #14
    def __init__(self, interval, metrics_aggregator, api_host, api_key=None, use_watchdog=False, event_chunk_size=None):
        threading.Thread.__init__(self)
        self.interval = int(interval)
        self.finished = threading.Event()
        self.metrics_aggregator = metrics_aggregator
        self.flush_count = 0
        self.log_count = 0

        self.watchdog = None
        if use_watchdog:
            from util import Watchdog
            self.watchdog = Watchdog(WATCHDOG_TIMEOUT)

        self.api_key = api_key
        self.api_host = api_host
        self.event_chunk_size = event_chunk_size or EVENT_CHUNK_SIZE
Example #15
    def __init__(self, port, agentConfig, watchdog=True, skip_ssl_validation=False):
        self._port = int(port)
        self._agentConfig = agentConfig
        self._metrics = {}
        MetricTransaction.set_application(self)
        MetricTransaction.set_endpoints()
        self._tr_manager = TransactionManager(MAX_WAIT_FOR_REPLAY,
            MAX_QUEUE_SIZE, THROTTLING_DELAY)
        MetricTransaction.set_tr_manager(self._tr_manager)

        self._watchdog = None
        self.skip_ssl_validation = skip_ssl_validation or agentConfig.get('skip_ssl_validation', False)
        if self.skip_ssl_validation:
            log.info("Skipping SSL hostname validation, useful when using a transparent proxy")

        if watchdog:
            watchdog_timeout = TRANSACTION_FLUSH_INTERVAL * WATCHDOG_INTERVAL_MULTIPLIER
            self._watchdog = Watchdog(watchdog_timeout,
                max_mem_mb=agentConfig.get('limit_memory_consumption', None))
Example #16
    def run(self, agentConfig=None, run_forever=True):
        """Main loop of the collector"""
        agentLogger = logging.getLogger('agent')
        systemStats = get_system_stats()

        if agentConfig is None:
            agentConfig = get_config()

        # Load the checks.d checks
        checksd = load_check_directory(agentConfig)

        # Try to fetch the instance id from EC2 if no hostname has been set
        # in the config file.
        # DEPRECATED
        if agentConfig.get('hostname') is None and agentConfig.get(
                'use_ec2_instance_id'):
            instanceId = EC2.get_instance_id()
            if instanceId is not None:
                agentLogger.info("Running on EC2, instanceId: %s" % instanceId)
                agentConfig['hostname'] = instanceId
            else:
                agentLogger.info(
                    'Not running on EC2, using hostname to identify this server'
                )

        emitters = [http_emitter]
        for emitter_spec in [
                s.strip()
                for s in agentConfig.get('custom_emitters', '').split(',')
        ]:
            if not emitter_spec:
                continue
            emitters.append(modules.load(emitter_spec, 'emitter'))

        check_freq = int(agentConfig['check_freq'])

        # Checks instance
        collector = Collector(agentConfig, emitters, systemStats)

        # Watchdog
        watchdog = None
        if agentConfig.get("watchdog", True):
            watchdog = Watchdog(check_freq * WATCHDOG_MULTIPLIER)
            watchdog.reset()

        # Main loop
        while run_forever:
            collector.run(checksd=checksd)
            if watchdog is not None:
                watchdog.reset()
            time.sleep(check_freq)
Example #17
    def test_watchdog_frenesy_detection(self, mock_restarted):
        """
        Watchdog restarts the process on suspicious high activity.
        """
        # Limit the restart timeframe for test purpose
        Watchdog._RESTART_TIMEFRAME = 1

        # Create a watchdog with a low activity tolerance
        process_watchdog = Watchdog(10, max_resets=3)
        ping_watchdog = process_watchdog.reset

        with self.set_time(1):
            # Can be reset 3 times within the watchdog timeframe
            for x in xrange(0, 3):
                ping_watchdog()

            # On the 4th attempt, the watchdog detects suspiciously high activity
            self.assertRaises(WatchdogKill, ping_watchdog)

        with self.set_time(3):
            # Gets back to normal when the activity timeframe expires.
            ping_watchdog()
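Taken together, these examples exercise a small Watchdog contract: construct it with a timeout in seconds (plus optional max_mem_mb and max_resets), then call reset() periodically; going silent past the timeout, or resetting too often within the restart timeframe, kills the process (the WatchdogKill asserted above). A minimal sketch of the ping pattern, with do_work() standing in for a hypothetical unit of work:

    from util import Watchdog  # the examples above import Watchdog from util

    watchdog = Watchdog(10, max_resets=3)  # values illustrative
    while do_work():       # do_work() is a hypothetical work function
        watchdog.reset()   # ping; a missed deadline or too many resets
                           # per timeframe kills the process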
Example #18
class Reporter(threading.Thread):
    """
    The reporter periodically sends the aggregated metrics to the
    server.
    """
    def __init__(self,
                 interval,
                 metrics_aggregator,
                 api_host,
                 api_key=None,
                 use_watchdog=False,
                 event_chunk_size=None):
        threading.Thread.__init__(self)
        self.interval = int(interval)
        self.finished = threading.Event()
        self.metrics_aggregator = metrics_aggregator
        self.flush_count = 0
        self.log_count = 0

        self.watchdog = None
        if use_watchdog:
            from util import Watchdog
            self.watchdog = Watchdog(WATCHDOG_TIMEOUT)

        self.api_key = api_key
        self.api_host = api_host
        self.event_chunk_size = event_chunk_size or EVENT_CHUNK_SIZE

        self.http_conn_cls = http_client.HTTPSConnection

        match = re.match('^(https?)://(.*)', api_host)

        if match:
            self.api_host = match.group(2)
            if match.group(1) == 'http':
                self.http_conn_cls = http_client.HTTPConnection

    def stop(self):
        log.info("Stopping reporter")
        self.finished.set()

    def run(self):

        log.info("Reporting to %s every %ss" % (self.api_host, self.interval))
        log.debug("Watchdog enabled: %s" % bool(self.watchdog))

        # Persist a start-up message.
        DogstatsdStatus().persist()

        while not self.finished.isSet():  # Use camel case isSet for 2.4 support.
            self.finished.wait(self.interval)
            self.metrics_aggregator.send_packet_count(
                'datadog.dogstatsd.packet.count')
            self.flush()
            if self.watchdog:
                self.watchdog.reset()

        # Clean up the status messages.
        log.debug("Stopped reporter")
        DogstatsdStatus.remove_latest_status()

    def flush(self):
        try:
            self.flush_count += 1
            self.log_count += 1
            packets_per_second = self.metrics_aggregator.packets_per_second(
                self.interval)
            packet_count = self.metrics_aggregator.total_count

            metrics = self.metrics_aggregator.flush()
            count = len(metrics)
            if self.flush_count % FLUSH_LOGGING_PERIOD == 0:
                self.log_count = 0
            if count:
                self.submit(metrics)

            events = self.metrics_aggregator.flush_events()
            event_count = len(events)
            if event_count:
                self.submit_events(events)

            should_log = self.flush_count <= FLUSH_LOGGING_INITIAL or self.log_count <= FLUSH_LOGGING_COUNT
            log_func = log.info
            if not should_log:
                log_func = log.debug
            log_func("Flush #%s: flushed %s metric%s and %s event%s" %
                     (self.flush_count, count, plural(count), event_count,
                      plural(event_count)))
            if self.flush_count == FLUSH_LOGGING_INITIAL:
                log.info(
                    "First flushes done, %s flushes will be logged every %s flushes."
                    % (FLUSH_LOGGING_COUNT, FLUSH_LOGGING_PERIOD))

            # Persist a status message.
            packet_count = self.metrics_aggregator.total_count
            DogstatsdStatus(
                flush_count=self.flush_count,
                packet_count=packet_count,
                packets_per_second=packets_per_second,
                metric_count=count,
                event_count=event_count,
            ).persist()

        except Exception:
            if self.finished.isSet():
                log.debug(
                    "Couldn't flush metrics, but that's expected as we're stopping"
                )
            else:
                log.exception("Error flushing metrics")

    def submit(self, metrics):
        # Copied and pasted from dogapi, because it's a bit of a pain to distribute python
        # dependencies with the agent.
        body, headers = serialize_metrics(metrics)
        method = 'POST'

        params = {}
        if self.api_key:
            params['api_key'] = self.api_key
        url = '/api/v1/series?%s' % urlencode(params)

        start_time = time()
        status = None
        conn = self.http_conn_cls(self.api_host)
        try:
            conn.request(method, url, body, headers)

            #FIXME: add timeout handling code here

            response = conn.getresponse()
            status = response.status
            response.close()
        finally:
            conn.close()
        duration = round((time() - start_time) * 1000.0, 4)
        log.debug("%s %s %s%s (%sms)" %
                  (status, method, self.api_host, url, duration))
        return duration

    def submit_events(self, events):
        headers = {'Content-Type': 'application/json'}
        method = 'POST'

        events_len = len(events)
        event_chunk_size = self.event_chunk_size

        for chunk in chunks(events, event_chunk_size):
            payload = {
                'apiKey': self.api_key,
                'events': {
                    'api': chunk
                },
                'uuid': get_uuid(),
                'internalHostname': get_hostname()
            }
            params = {}
            if self.api_key:
                params['api_key'] = self.api_key
            url = '/intake?%s' % urlencode(params)

            status = None
            conn = self.http_conn_cls(self.api_host)
            try:
                start_time = time()
                conn.request(method, url, json.dumps(payload), headers)

                response = conn.getresponse()
                status = response.status
                response.close()
                duration = round((time() - start_time) * 1000.0, 4)
                log.debug("%s %s %s%s (%sms)" %
                          (status, method, self.api_host, url, duration))

            finally:
                conn.close()
Example #19
    def fast_tornado(self):
        a = Application(12345, {})
        a._watchdog = Watchdog(6)  # 6s watchdog, presumably long enough for the mocked run to finish
        a._tr_manager = MockTxManager()
        a.run()
Example #20
class Reporter(threading.Thread):
    """
    The reporter periodically sends the aggregated metrics to the
    server.
    """

    def __init__(self, interval, metrics_aggregator, api_host, api_key=None,
                 use_watchdog=False, event_chunk_size=None):
        threading.Thread.__init__(self)
        self.interval = int(interval)
        self.finished = threading.Event()
        self.metrics_aggregator = metrics_aggregator
        self.flush_count = 0
        self.log_count = 0

        self.watchdog = None
        if use_watchdog:
            from util import Watchdog
            self.watchdog = Watchdog(WATCHDOG_TIMEOUT)

        self.api_key = api_key
        self.api_host = api_host
        self.event_chunk_size = event_chunk_size or EVENT_CHUNK_SIZE

    def stop(self):
        log.info("Stopping reporter")
        self.finished.set()

    def run(self):

        log.info("Reporting to %s every %ss" % (self.api_host, self.interval))
        log.debug("Watchdog enabled: %s" % bool(self.watchdog))

        # Persist a start-up message.
        DogstatsdStatus().persist()

        while not self.finished.isSet():  # Use camel case isSet for 2.4 support.
            self.finished.wait(self.interval)
            self.metrics_aggregator.send_packet_count('datadog.dogstatsd.packet.count')
            self.flush()
            if self.watchdog:
                self.watchdog.reset()

        # Clean up the status messages.
        log.debug("Stopped reporter")
        DogstatsdStatus.remove_latest_status()

    def flush(self):
        try:
            self.flush_count += 1
            self.log_count += 1
            packets_per_second = self.metrics_aggregator.packets_per_second(self.interval)
            packet_count = self.metrics_aggregator.total_count

            metrics = self.metrics_aggregator.flush()
            count = len(metrics)
            if self.flush_count % FLUSH_LOGGING_PERIOD == 0:
                self.log_count = 0
            if count:
                self.submit(metrics)

            events = self.metrics_aggregator.flush_events()
            event_count = len(events)
            if event_count:
                self.submit_events(events)

            service_checks = self.metrics_aggregator.flush_service_checks()
            check_count = len(service_checks)
            if check_count:
                self.submit_service_checks(service_checks)

            should_log = self.flush_count <= FLUSH_LOGGING_INITIAL or self.log_count <= FLUSH_LOGGING_COUNT
            log_func = log.info
            if not should_log:
                log_func = log.debug
            log_func("Flush #%s: flushed %s metric%s, %s event%s, and %s service check run%s" % (self.flush_count, count, plural(count), event_count, plural(event_count), check_count, plural(check_count)))
            if self.flush_count == FLUSH_LOGGING_INITIAL:
                log.info("First flushes done, %s flushes will be logged every %s flushes." % (FLUSH_LOGGING_COUNT, FLUSH_LOGGING_PERIOD))

            # Persist a status message.
            packet_count = self.metrics_aggregator.total_count
            DogstatsdStatus(
                flush_count=self.flush_count,
                packet_count=packet_count,
                packets_per_second=packets_per_second,
                metric_count=count,
                event_count=event_count,
            ).persist()

        except Exception:
            if self.finished.isSet():
                log.debug("Couldn't flush metrics, but that's expected as we're stopping")
            else:
                log.exception("Error flushing metrics")

    def submit(self, metrics):
        body, headers = serialize_metrics(metrics)
        params = {}
        if self.api_key:
            params['api_key'] = self.api_key
        url = '%s/api/v1/series?%s' % (self.api_host, urlencode(params))
        self.submit_http(url, body, headers)

    def submit_events(self, events):
        headers = {'Content-Type':'application/json'}
        event_chunk_size = self.event_chunk_size

        for chunk in chunks(events, event_chunk_size):
            payload = {
                'apiKey': self.api_key,
                'events': {
                    'api': chunk
                },
                'uuid': get_uuid(),
                'internalHostname': get_hostname()
            }
            params = {}
            if self.api_key:
                params['api_key'] = self.api_key
            url = '%s/intake?%s' % (self.api_host, urlencode(params))

            self.submit_http(url, json.dumps(payload), headers)

    def submit_http(self, url, data, headers):
        headers["DD-Dogstatsd-Version"] = get_version()
        log.debug("Posting payload to %s" % url)
        try:
            start_time = time()
            r = requests.post(url, data=data, timeout=5, headers=headers)
            r.raise_for_status()

            if r.status_code >= 200 and r.status_code < 205:
                log.debug("Payload accepted")

            status = r.status_code
            duration = round((time() - start_time) * 1000.0, 4)
            log.debug("%s POST %s (%sms)" % (status, url, duration))
        except Exception:
            log.exception("Unable to post payload.")
            try:
                log.error("Received status code: {0}".format(r.status_code))
            except Exception:
                pass

    def submit_service_checks(self, service_checks):
        headers = {'Content-Type':'application/json'}

        params = {}
        if self.api_key:
            params['api_key'] = self.api_key

        url = '{0}/api/v1/check_run?{1}'.format(self.api_host, urlencode(params))
        self.submit_http(url, json.dumps(service_checks), headers)
Example #21
class Reporter(threading.Thread):
    """
    The reporter periodically sends the aggregated metrics to the
    server.
    """
    def __init__(self,
                 interval,
                 metrics_aggregator,
                 api_host,
                 api_key=None,
                 use_watchdog=False):
        threading.Thread.__init__(self)
        self.daemon = True
        self.interval = int(interval)
        self.finished = threading.Event()
        self.metrics_aggregator = metrics_aggregator
        self.flush_count = 0

        self.watchdog = None
        if use_watchdog:
            from util import Watchdog
            self.watchdog = Watchdog(WATCHDOG_TIMEOUT)

        self.api_key = api_key
        self.api_host = api_host

        self.http_conn_cls = http_client.HTTPSConnection

        match = re.match('^(https?)://(.*)', api_host)

        if match:
            self.api_host = match.group(2)
            if match.group(1) == 'http':
                self.http_conn_cls = http_client.HTTPConnection

    def stop(self):
        self.finished.set()

    def run(self):
        logger.info("Reporting to %s every %ss" %
                    (self.api_host, self.interval))
        logger.debug("Watchdog enabled: %s" % bool(self.watchdog))
        while True:
            if self.finished.isSet():  # Use camel case version for 2.4 support.
                break
            self.finished.wait(self.interval)
            self.metrics_aggregator.send_packet_count(
                'datadog.dogstatsd.packet.count')
            self.flush()
            if self.watchdog:
                self.watchdog.reset()

    def flush(self):
        try:
            self.flush_count += 1
            metrics = self.metrics_aggregator.flush()
            count = len(metrics)
            if not count:
                logger.info("Flush #%s: No metrics to flush." %
                            self.flush_count)
                return
            logger.info("Flush #%s: flushing %s metrics" %
                        (self.flush_count, count))
            self.submit(metrics)
        except Exception:
            logger.exception("Error flushing metrics")

    def submit(self, metrics):

        # HACK - Copied and pasted from dogapi, because it's a bit of a pain to distribute python
        # dependencies with the agent.
        conn = self.http_conn_cls(self.api_host)
        body = json.dumps({"series": metrics})
        headers = {'Content-Type': 'application/json'}
        method = 'POST'

        params = {}
        if self.api_key:
            params['api_key'] = self.api_key
        url = '/api/v1/series?%s' % urlencode(params)

        start_time = time()
        conn.request(method, url, body, headers)

        #FIXME: add timeout handling code here

        response = conn.getresponse()
        duration = round((time() - start_time) * 1000.0, 4)
        logger.info("%s %s %s%s (%sms)" %
                    (response.status, method, self.api_host, url, duration))
Example #22
class Application(tornado.web.Application):
    def __init__(self, port, agentConfig, watchdog=True):
        self._port = int(port)
        self._agentConfig = agentConfig
        self._metrics = {}
        MetricTransaction.set_application(self)
        MetricTransaction.set_endpoints()
        self._tr_manager = TransactionManager(MAX_WAIT_FOR_REPLAY, MAX_QUEUE_SIZE, THROTTLING_DELAY)
        MetricTransaction.set_tr_manager(self._tr_manager)

        self._watchdog = None
        if watchdog:
            watchdog_timeout = TRANSACTION_FLUSH_INTERVAL * WATCHDOG_INTERVAL_MULTIPLIER
            self._watchdog = Watchdog(watchdog_timeout)

    def appendMetric(self, prefix, name, host, device, ts, value):

        if prefix in self._metrics:
            metrics = self._metrics[prefix]
        else:
            metrics = {}
            self._metrics[prefix] = metrics

        if name in metrics:
            metrics[name].append([host, device, ts, value])
        else:
            metrics[name] = [[host, device, ts, value]]

    def _postMetrics(self):

        if len(self._metrics) > 0:
            self._metrics["uuid"] = getUuid()
            self._metrics["internalHostname"] = gethostname(self._agentConfig)
            self._metrics["apiKey"] = self._agentConfig["api_key"]
            MetricTransaction(self._metrics, {})
            self._metrics = {}

    def run(self):

        handlers = [
            (r"/intake/?", AgentInputHandler),
            (r"/api/v1/series/?", ApiInputHandler),
            (r"/status/?", StatusHandler),
        ]

        settings = dict(cookie_secret="12oETzKXQAGaYdkL5gEmGeJJFuYh7EQnp2XdTP1o/Vo=", xsrf_cookies=False, debug=True)

        tornado.web.Application.__init__(self, handlers, **settings)
        http_server = tornado.httpserver.HTTPServer(self)
        http_server.listen(self._port)
        logging.info("Listening on port %d" % self._port)

        # Register callbacks
        self.mloop = tornado.ioloop.IOLoop.instance()

        def flush_trs():
            if self._watchdog:
                self._watchdog.reset()
            self._postMetrics()
            self._tr_manager.flush()

        tr_sched = tornado.ioloop.PeriodicCallback(flush_trs, TRANSACTION_FLUSH_INTERVAL, io_loop=self.mloop)

        # Register optional Graphite listener
        gport = self._agentConfig.get("graphite_listen_port", None)
        if gport is not None:
            logging.info("Starting graphite listener on port %s" % gport)
            from graphite import GraphiteServer

            gs = GraphiteServer(self, gethostname(self._agentConfig), io_loop=self.mloop)
            gs.listen(gport)

        # Start everything
        if self._watchdog:
            self._watchdog.reset()
        tr_sched.start()
        self.mloop.start()

    def stop(self):
        self.mloop.stop()
Example #23
class Application(tornado.web.Application):

    NO_PARALLELISM = 1
    DEFAULT_PARALLELISM = 5

    def __init__(self,
                 port,
                 agentConfig,
                 watchdog=True,
                 skip_ssl_validation=False,
                 use_simple_http_client=False):
        self._port = int(port)
        self._agentConfig = agentConfig
        self._metrics = {}
        AgentTransaction.set_application(self)
        AgentTransaction.set_endpoints(agentConfig['endpoints'])
        if agentConfig['endpoints'] == {}:
            log.warning(
                u"No valid endpoint found. Forwarder will drop all incoming payloads."
            )
        AgentTransaction.set_request_timeout(agentConfig['forwarder_timeout'])

        max_parallelism = self.NO_PARALLELISM
        # Multiple endpoints => enable parallelism
        if len(agentConfig['endpoints']) > 1:
            max_parallelism = self.DEFAULT_PARALLELISM

        self._tr_manager = TransactionManager(MAX_WAIT_FOR_REPLAY,
                                              MAX_QUEUE_SIZE,
                                              THROTTLING_DELAY,
                                              max_parallelism=max_parallelism)
        AgentTransaction.set_tr_manager(self._tr_manager)

        self._watchdog = None
        self.skip_ssl_validation = skip_ssl_validation or agentConfig.get(
            'skip_ssl_validation', False)
        self.use_simple_http_client = use_simple_http_client
        if self.skip_ssl_validation:
            log.info(
                "Skipping SSL hostname validation, useful when using a transparent proxy"
            )

        # Monitor activity
        if watchdog:
            # Flush interval is in ms (it feeds tornado's PeriodicCallback); Watchdog expects seconds
            watchdog_timeout = TRANSACTION_FLUSH_INTERVAL * WATCHDOG_INTERVAL_MULTIPLIER / 1000
            self._watchdog = Watchdog(
                watchdog_timeout,
                max_mem_mb=agentConfig.get('limit_memory_consumption', None),
                max_resets=WATCHDOG_HIGH_ACTIVITY_THRESHOLD)

    def log_request(self, handler):
        """ Override the tornado logging method.
        If everything goes well, log level is DEBUG.
        Otherwise it's WARNING or ERROR depending on the response code. """
        if handler.get_status() < 400:
            log_method = log.debug
        elif handler.get_status() < 500:
            log_method = log.warning
        else:
            log_method = log.error

        request_time = 1000.0 * handler.request.request_time()
        log_method(u"%d %s %.2fms", handler.get_status(),
                   handler._request_summary(), request_time)

    def appendMetric(self, prefix, name, host, device, ts, value):

        if prefix in self._metrics:
            metrics = self._metrics[prefix]
        else:
            metrics = {}
            self._metrics[prefix] = metrics

        if name in metrics:
            metrics[name].append([host, device, ts, value])
        else:
            metrics[name] = [[host, device, ts, value]]

    def _postMetrics(self):

        if len(self._metrics) > 0:
            self._metrics['uuid'] = get_uuid()
            self._metrics['internalHostname'] = get_hostname(self._agentConfig)
            self._metrics['apiKey'] = self._agentConfig['api_key']
            MetricTransaction(json.dumps(self._metrics),
                              headers={'Content-Type': 'application/json'})
            self._metrics = {}

    def run(self):
        handlers = [
            (r"/intake/?", AgentInputHandler),
            (r"/intake/metrics?", MetricsAgentInputHandler),
            (r"/intake/metadata?", MetadataAgentInputHandler),
            (r"/api/v1/series/?", ApiInputHandler),
            (r"/api/v1/check_run/?", ApiCheckRunHandler),
            (r"/status/?", StatusHandler),
        ]

        settings = dict(
            cookie_secret="12oETzKXQAGaYdkL5gEmGeJJFuYh7EQnp2XdTP1o/Vo=",
            xsrf_cookies=False,
            debug=False,
            log_function=self.log_request)

        non_local_traffic = self._agentConfig.get("non_local_traffic", False)

        tornado.web.Application.__init__(self, handlers, **settings)
        http_server = tornado.httpserver.HTTPServer(self)

        try:
            # non_local_traffic must be == True to match, not just some non-false value
            if non_local_traffic is True:
                http_server.listen(self._port)
            else:
                # localhost in lieu of 127.0.0.1 to support IPv6
                try:
                    http_server.listen(self._port,
                                       address=self._agentConfig['bind_host'])
                except gaierror:
                    log.warning(
                        "localhost seems undefined in your host file, using 127.0.0.1 instead"
                    )
                    http_server.listen(self._port, address="127.0.0.1")
                except socket_error as e:
                    if "Errno 99" in str(e):
                        log.warning(
                            "IPv6 doesn't seem to be fully supported. Falling back to IPv4"
                        )
                        http_server.listen(self._port, address="127.0.0.1")
                    else:
                        raise
        except socket_error as e:
            log.exception(
                "Socket error %s. Is another application listening on the same port ? Exiting",
                e)
            sys.exit(1)
        except Exception:
            log.exception("Uncaught exception. Forwarder is exiting.")
            sys.exit(1)

        log.info("Listening on port %d" % self._port)

        # Register callbacks
        self.mloop = tornado.ioloop.IOLoop.current()

        logging.getLogger().setLevel(get_logging_config()['log_level']
                                     or logging.INFO)

        def flush_trs():
            if self._watchdog:
                self._watchdog.reset()
            self._postMetrics()
            self._tr_manager.flush()

        tr_sched = tornado.ioloop.PeriodicCallback(flush_trs,
                                                   TRANSACTION_FLUSH_INTERVAL,
                                                   io_loop=self.mloop)

        # Register optional Graphite listener
        gport = self._agentConfig.get("graphite_listen_port", None)
        if gport is not None:
            log.info("Starting graphite listener on port %s" % gport)
            from graphite import GraphiteServer
            gs = GraphiteServer(self,
                                get_hostname(self._agentConfig),
                                io_loop=self.mloop)
            if non_local_traffic is True:
                gs.listen(gport)
            else:
                gs.listen(gport, address="localhost")

        # Start everything
        if self._watchdog:
            self._watchdog.reset()
        tr_sched.start()

        self.mloop.start()
        log.info("Stopped")

    def stop(self):
        self.mloop.stop()
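One detail worth calling out in these Application variants: TRANSACTION_FLUSH_INTERVAL feeds tornado's PeriodicCallback, whose interval is in milliseconds, which is presumably why the watchdog timeout divides by 1000. A hedged sketch of the unit conversion, with illustrative constant values:

    # PeriodicCallback wants milliseconds; Watchdog (per the examples) wants seconds.
    TRANSACTION_FLUSH_INTERVAL = 5000   # illustrative, in ms
    WATCHDOG_INTERVAL_MULTIPLIER = 10   # illustrative
    watchdog_timeout = TRANSACTION_FLUSH_INTERVAL * WATCHDOG_INTERVAL_MULTIPLIER / 1000  # -> 50 s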
Example #24
    def slow_tornado(self):
        a = Application(12345, self.AGENT_CONFIG)
        a._watchdog = Watchdog(4)  # 4s watchdog, presumably short enough to fire during run()
        a._tr_manager = MockTxManager()
        a.run()
Example #25
class Reporter(threading.Thread):
    """
    The reporter periodically sends the aggregated metrics to the
    server.
    """

    def __init__(self, interval, metrics_aggregator, api_host, api_key=None, use_watchdog=False):
        threading.Thread.__init__(self)
        self.interval = int(interval)
        self.finished = threading.Event()
        self.metrics_aggregator = metrics_aggregator
        self.flush_count = 0

        self.watchdog = None
        if use_watchdog:
            from util import Watchdog
            self.watchdog = Watchdog(WATCHDOG_TIMEOUT)

        self.api_key = api_key
        self.api_host = api_host

        self.http_conn_cls = http_client.HTTPSConnection

        match = re.match('^(https?)://(.*)', api_host)

        if match:
            self.api_host = match.group(2)
            if match.group(1) == 'http':
                self.http_conn_cls = http_client.HTTPConnection

    def stop(self):
        log.info("Stopping reporter")
        self.finished.set()

    def run(self):

        log.info("Reporting to %s every %ss" % (self.api_host, self.interval))
        log.debug("Watchdog enabled: %s" % bool(self.watchdog))

        # Persist a start-up message.
        DogstatsdStatus().persist()

        while not self.finished.isSet(): # Use camel case isSet for 2.4 support.
            self.finished.wait(self.interval)
            self.metrics_aggregator.send_packet_count('datadog.dogstatsd.packet.count')
            self.flush()
            if self.watchdog:
                self.watchdog.reset()

        # Clean up the status messages.
        log.debug("Stopped reporter")
        DogstatsdStatus.remove_latest_status()

    def flush(self):
        try:
            self.flush_count += 1
            packets_per_second = self.metrics_aggregator.packets_per_second(self.interval)
            packet_count = self.metrics_aggregator.total_count

            metrics = self.metrics_aggregator.flush()
            count = len(metrics)
            should_log = self.flush_count < LOGGING_INTERVAL or self.flush_count % LOGGING_INTERVAL == 0
            if not count:
                if should_log:
                    log.info("Flush #%s: No metrics to flush." % self.flush_count)
            else:
                if should_log:
                    log.info("Flush #%s: flushing %s metrics" % (self.flush_count, count))
                self.submit(metrics)

            # Persist a status message.
            packet_count = self.metrics_aggregator.total_count
            DogstatsdStatus(
                flush_count=self.flush_count,
                packet_count=packet_count,
                packets_per_second=packets_per_second,
                metric_count=count).persist()

        except:
            log.exception("Error flushing metrics")

    def submit(self, metrics):
        # HACK - Copied and pasted from dogapi, because it's a bit of a pain to distribute python
        # dependencies with the agent.
        body = serialize(metrics)
        headers = {'Content-Type':'application/json'}
        method = 'POST'

        params = {}
        if self.api_key:
            params['api_key'] = self.api_key
        url = '/api/v1/series?%s' % urlencode(params)

        start_time = time()
        status = None
        conn = self.http_conn_cls(self.api_host)
        try:
            conn.request(method, url, body, headers)

            #FIXME: add timeout handling code here

            response = conn.getresponse()
            status = response.status
            response.close()
        finally:
            conn.close()
        duration = round((time() - start_time) * 1000.0, 4)
        log.debug("%s %s %s%s (%sms)" % (
                        status, method, self.api_host, url, duration))
        return duration
Example #26
class Reporter(threading.Thread):
    """
    The reporter periodically sends the aggregated metrics to the
    server.
    """

    def __init__(self, interval, metrics_aggregator, api_host, api_key=None, use_watchdog=False):
        threading.Thread.__init__(self)
        self.interval = int(interval)
        self.finished = threading.Event()
        self.metrics_aggregator = metrics_aggregator
        self.flush_count = 0

        self.watchdog = None
        if use_watchdog:
            from util import Watchdog
            self.watchdog = Watchdog(WATCHDOG_TIMEOUT)

        self.api_key = api_key
        self.api_host = api_host

        self.http_conn_cls = http_client.HTTPSConnection

        match = re.match('^(https?)://(.*)', api_host)

        if match:
            self.api_host = match.group(2)
            if match.group(1) == 'http':
                self.http_conn_cls = http_client.HTTPConnection

    def stop(self):
        log.info("Stopping reporter")
        self.finished.set()

    def run(self):

        log.info("Reporting to %s every %ss" % (self.api_host, self.interval))
        log.debug("Watchdog enabled: %s" % bool(self.watchdog))

        # Persist a start-up message.
        DogstatsdStatus().persist()

        while not self.finished.isSet(): # Use camel case isSet for 2.4 support.
            self.finished.wait(self.interval)
            self.metrics_aggregator.send_packet_count('datadog.dogstatsd.packet.count')
            self.flush()
            if self.watchdog:
                self.watchdog.reset()

        # Clean up the status messages.
        log.debug("Stopped reporter")
        DogstatsdStatus.remove_latest_status()

    def flush(self):
        try:
            self.flush_count += 1
            packets_per_second = self.metrics_aggregator.packets_per_second(self.interval)
            packet_count = self.metrics_aggregator.total_count

            metrics = self.metrics_aggregator.flush()
            count = len(metrics)
            should_log = self.flush_count < LOGGING_INTERVAL or self.flush_count % LOGGING_INTERVAL == 0
            if not count:
                if should_log:
                    log.info("Flush #%s: No metrics to flush." % self.flush_count)
            else:
                if should_log:
                    log.info("Flush #%s: flushing %s metrics" % (self.flush_count, count))
                self.submit(metrics)

            events = self.metrics_aggregator.flush_events()
            event_count = len(events)
            if not event_count:
                if should_log:
                    log.info("Flush #%s: No events to flush." % self.flush_count)
                else:
                    log.debug("Flush #%s: No events to flush." % self.flush_count)
            else:
                if should_log:
                    log.info("Flush #%s: flushing %s events" % (self.flush_count, len(events)))
                else:
                    log.debug("Flush #%s: flushing %s events" % (self.flush_count, len(events)))
                self.submit_events(events)

            # Persist a status message.
            packet_count = self.metrics_aggregator.total_count
            DogstatsdStatus(
                flush_count=self.flush_count,
                packet_count=packet_count,
                packets_per_second=packets_per_second,
                metric_count=count,
                event_count=event_count,
            ).persist()

        except Exception:
            log.exception("Error flushing metrics")
Example #27
    def busy_run(self):
        w = Watchdog(5)
        w.reset()
        x = 0
        while True:
            x = random()
Example #28
    def hanging_net(self):
        w = Watchdog(5)
        w.reset()
        x = url.urlopen("http://localhost:31834")
        print("ERROR Net call returned %s" % x)
        return True
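Helpers like busy_run and hanging_net only make sense when driven from a parent process that expects the child to be killed. A hedged sketch of such a driver, with TestWatchdog as a hypothetical host class for the helpers above:

    import multiprocessing

    # Run the hanging helper in a child process; its 5s watchdog should kill
    # the child well before the 15s grace period expires.
    p = multiprocessing.Process(target=TestWatchdog().hanging_net)
    p.start()
    p.join(15)
    assert not p.is_alive()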
Example #29
class Application(tornado.web.Application):

    NO_PARALLELISM = 1
    DEFAULT_PARALLELISM = 5

    def __init__(self, port, agentConfig, watchdog=True,
                 skip_ssl_validation=False, use_simple_http_client=False):
        self._port = int(port)
        self._agentConfig = agentConfig
        self._metrics = {}
        AgentTransaction.set_application(self)
        AgentTransaction.set_endpoints(agentConfig['endpoints'])
        AgentTransaction.set_request_timeout(agentConfig['forwarder_timeout'])

        max_parallelism = self.NO_PARALLELISM
        # Multiple endpoints => enable parallelism
        if len(agentConfig['endpoints']) > 1:
            max_parallelism = self.DEFAULT_PARALLELISM

        self._tr_manager = TransactionManager(MAX_WAIT_FOR_REPLAY,
                                              MAX_QUEUE_SIZE, THROTTLING_DELAY,
                                              max_parallelism=max_parallelism)
        AgentTransaction.set_tr_manager(self._tr_manager)

        self._watchdog = None
        self.skip_ssl_validation = skip_ssl_validation or agentConfig.get('skip_ssl_validation', False)
        self.use_simple_http_client = use_simple_http_client
        if self.skip_ssl_validation:
            log.info("Skipping SSL hostname validation, useful when using a transparent proxy")

        # Monitor activity
        if watchdog:
            watchdog_timeout = TRANSACTION_FLUSH_INTERVAL * WATCHDOG_INTERVAL_MULTIPLIER / 1000
            self._watchdog = Watchdog(
                watchdog_timeout,
                max_mem_mb=agentConfig.get('limit_memory_consumption', None),
                max_resets=WATCHDOG_HIGH_ACTIVITY_THRESHOLD
            )

    def log_request(self, handler):
        """ Override the tornado logging method.
        If everything goes well, log level is DEBUG.
        Otherwise it's WARNING or ERROR depending on the response code. """
        if handler.get_status() < 400:
            log_method = log.debug
        elif handler.get_status() < 500:
            log_method = log.warning
        else:
            log_method = log.error

        request_time = 1000.0 * handler.request.request_time()
        log_method(
            u"%d %s %.2fms",
            handler.get_status(),
            handler._request_summary(), request_time
        )

    def appendMetric(self, prefix, name, host, device, ts, value):

        if prefix in self._metrics:
            metrics = self._metrics[prefix]
        else:
            metrics = {}
            self._metrics[prefix] = metrics

        if name in metrics:
            metrics[name].append([host, device, ts, value])
        else:
            metrics[name] = [[host, device, ts, value]]

    def _postMetrics(self):

        if len(self._metrics) > 0:
            self._metrics['uuid'] = get_uuid()
            self._metrics['internalHostname'] = get_hostname(self._agentConfig)
            self._metrics['apiKey'] = self._agentConfig['api_key']
            MetricTransaction(json.dumps(self._metrics),
                              headers={'Content-Type': 'application/json'})
            self._metrics = {}

    def run(self):
        handlers = [
            (r"/intake/?", AgentInputHandler),
            (r"/intake/metrics?", MetricsAgentInputHandler),
            (r"/intake/metadata?", MetadataAgentInputHandler),
            (r"/api/v1/series/?", ApiInputHandler),
            (r"/api/v1/check_run/?", ApiCheckRunHandler),
            (r"/status/?", StatusHandler),
        ]

        settings = dict(
            cookie_secret="12oETzKXQAGaYdkL5gEmGeJJFuYh7EQnp2XdTP1o/Vo=",
            xsrf_cookies=False,
            debug=False,
            log_function=self.log_request
        )

        non_local_traffic = self._agentConfig.get("non_local_traffic", False)

        tornado.web.Application.__init__(self, handlers, **settings)
        http_server = tornado.httpserver.HTTPServer(self)

        try:
            # non_local_traffic must be == True to match, not just some non-false value
            if non_local_traffic is True:
                http_server.listen(self._port)
            else:
                # localhost in lieu of 127.0.0.1 to support IPv6
                try:
                    http_server.listen(self._port, address=self._agentConfig['bind_host'])
                except gaierror:
                    log.warning("localhost seems undefined in your host file, using 127.0.0.1 instead")
                    http_server.listen(self._port, address="127.0.0.1")
                except socket_error as e:
                    if "Errno 99" in str(e):
                        log.warning("IPv6 doesn't seem to be fully supported. Falling back to IPv4")
                        http_server.listen(self._port, address="127.0.0.1")
                    else:
                        raise
        except socket_error as e:
            log.exception("Socket error %s. Is another application listening on the same port ? Exiting", e)
            sys.exit(1)
        except Exception:
            log.exception("Uncaught exception. Forwarder is exiting.")
            sys.exit(1)

        log.info("Listening on port %d" % self._port)

        # Register callbacks
        self.mloop = get_tornado_ioloop()

        logging.getLogger().setLevel(get_logging_config()['log_level'] or logging.INFO)

        def flush_trs():
            if self._watchdog:
                self._watchdog.reset()
            self._postMetrics()
            self._tr_manager.flush()

        tr_sched = tornado.ioloop.PeriodicCallback(flush_trs, TRANSACTION_FLUSH_INTERVAL,
                                                   io_loop=self.mloop)

        # Register optional Graphite listener
        gport = self._agentConfig.get("graphite_listen_port", None)
        if gport is not None:
            log.info("Starting graphite listener on port %s" % gport)
            from graphite import GraphiteServer
            gs = GraphiteServer(self, get_hostname(self._agentConfig), io_loop=self.mloop)
            if non_local_traffic is True:
                gs.listen(gport)
            else:
                gs.listen(gport, address="localhost")

        # Start everything
        if self._watchdog:
            self._watchdog.reset()
        tr_sched.start()

        self.mloop.start()
        log.info("Stopped")

    def stop(self):
        self.mloop.stop()
Example #30
    def _get_watchdog(self, check_freq, agentConfig):
        watchdog = None
        if agentConfig.get("watchdog", True):
            watchdog = Watchdog(check_freq * WATCHDOG_MULTIPLIER)
            watchdog.reset()
        return watchdog
Example #31
class Reporter(threading.Thread):
    """
    The reporter periodically sends the aggregated metrics to the
    server.
    """
    def __init__(self,
                 interval,
                 metrics_aggregator,
                 api_host,
                 api_key=None,
                 use_watchdog=False):
        threading.Thread.__init__(self)
        self.interval = int(interval)
        self.finished = threading.Event()
        self.metrics_aggregator = metrics_aggregator
        self.flush_count = 0

        self.watchdog = None
        if use_watchdog:
            from util import Watchdog
            self.watchdog = Watchdog(WATCHDOG_TIMEOUT)

        self.api_key = api_key
        self.api_host = api_host

        self.http_conn_cls = http_client.HTTPSConnection

        match = re.match('^(https?)://(.*)', api_host)

        if match:
            self.api_host = match.group(2)
            if match.group(1) == 'http':
                self.http_conn_cls = http_client.HTTPConnection

    def stop(self):
        log.info("Stopping reporter")
        self.finished.set()

    def run(self):

        log.info("Reporting to %s every %ss" % (self.api_host, self.interval))
        log.debug("Watchdog enabled: %s" % bool(self.watchdog))

        # Persist a start-up message.
        DogstatsdStatus().persist()

        while not self.finished.isSet():  # Use camel case isSet for 2.4 support.
            self.finished.wait(self.interval)
            self.metrics_aggregator.send_packet_count(
                'datadog.dogstatsd.packet.count')
            self.flush()
            if self.watchdog:
                self.watchdog.reset()

        # Clean up the status messages.
        log.debug("Stopped reporter")
        DogstatsdStatus.remove_latest_status()

    def flush(self):
        try:
            self.flush_count += 1
            packets_per_second = self.metrics_aggregator.packets_per_second(
                self.interval)
            packet_count = self.metrics_aggregator.total_count

            metrics = self.metrics_aggregator.flush()
            count = len(metrics)
            should_log = self.flush_count < LOGGING_INTERVAL or self.flush_count % LOGGING_INTERVAL == 0
            if not count:
                if should_log:
                    log.info("Flush #%s: No metrics to flush." %
                             self.flush_count)
            else:
                if should_log:
                    log.info("Flush #%s: flushing %s metrics" %
                             (self.flush_count, count))
                self.submit(metrics)

            events = self.metrics_aggregator.flush_events()
            event_count = len(events)
            if not event_count:
                log.info("Flush #%s: No events to flush." % self.flush_count)
            else:
                log.info("Flush #%s: flushing %s events" %
                         (self.flush_count, len(events)))
                self.submit_events(events)

            # Persist a status message.
            packet_count = self.metrics_aggregator.total_count
            DogstatsdStatus(flush_count=self.flush_count,
                            packet_count=packet_count,
                            packets_per_second=packets_per_second,
                            metric_count=count,
                            event_count=event_count).persist()

        except Exception:
            log.exception("Error flushing metrics")

    def submit(self, metrics):
        # HACK - Copied and pasted from dogapi, because it's a bit of a pain to distribute python
        # dependencies with the agent.
        body = serialize_metrics(metrics)
        headers = {'Content-Type': 'application/json'}
        method = 'POST'

        params = {}
        if self.api_key:
            params['api_key'] = self.api_key
        url = '/api/v1/series?%s' % urlencode(params)

        start_time = time()
        status = None
        conn = self.http_conn_cls(self.api_host)
        try:
            conn.request(method, url, body, headers)

            #FIXME: add timeout handling code here

            response = conn.getresponse()
            status = response.status
            response.close()
        finally:
            conn.close()
        duration = round((time() - start_time) * 1000.0, 4)
        log.debug("%s %s %s%s (%sms)" %
                  (status, method, self.api_host, url, duration))
        return duration

    def submit_events(self, events):
        headers = {'Content-Type': 'application/json'}
        method = 'POST'

        params = {}
        if self.api_key:
            params['api_key'] = self.api_key
        url = '/api/v1/events?%s' % urlencode(params)

        status = None
        conn = self.http_conn_cls(self.api_host)
        try:
            for event in events:
                start_time = time()
                body = serialize_event(event)
                log.debug('Sending event: %s' % body)
                conn.request(method, url, body, headers)

                response = conn.getresponse()
                status = response.status
                response.close()
                duration = round((time() - start_time) * 1000.0, 4)
                log.debug("%s %s %s%s (%sms)" %
                          (status, method, self.api_host, url, duration))
        finally:
            conn.close()
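
The "#FIXME: add timeout handling code here" in submit is a real gap: without a timeout, a hung endpoint blocks the reporter thread until the watchdog kills the process. A minimal sketch of the missing handling, assuming Python 2.6+ where the httplib connection classes accept a timeout keyword (SUBMIT_TIMEOUT and submit_with_timeout are illustrative names, not from the source):

import socket

SUBMIT_TIMEOUT = 10  # seconds; illustrative value, not from the original

def submit_with_timeout(conn_cls, host, method, url, body, headers):
    # conn_cls is e.g. http_client.HTTPSConnection; the timeout kwarg makes
    # the underlying socket raise socket.timeout instead of blocking forever.
    conn = conn_cls(host, timeout=SUBMIT_TIMEOUT)
    try:
        conn.request(method, url, body, headers)
        response = conn.getresponse()
        status = response.status
        response.close()
        return status
    except socket.timeout:
        log.error("Request to %s timed out after %ss", host, SUBMIT_TIMEOUT)
        return None
    finally:
        conn.close()
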
Example #32
0
 def normal_run(self):
     w = Watchdog(2)
     w.reset()
     for i in range(5):
         time.sleep(1)
         w.reset()
Example #33
0
class Application(tornado.web.Application):

    def __init__(self, port, agentConfig, watchdog=True):
        self._port = int(port)
        self._agentConfig = agentConfig
        self._metrics = {}
        MetricTransaction.set_application(self)
        MetricTransaction.set_endpoints()
        self._tr_manager = TransactionManager(MAX_WAIT_FOR_REPLAY,
            MAX_QUEUE_SIZE, THROTTLING_DELAY)
        MetricTransaction.set_tr_manager(self._tr_manager)

        self._watchdog = None
        if watchdog:
            watchdog_timeout = TRANSACTION_FLUSH_INTERVAL * WATCHDOG_INTERVAL_MULTIPLIER
            self._watchdog = Watchdog(watchdog_timeout,
                max_mem_mb=agentConfig.get('limit_memory_consumption', None))

    def log_request(self, handler):
        """ Override the tornado logging method.
        If everything goes well, log level is DEBUG.
        Otherwise it's WARNING or ERROR depending on the response code. """
        if handler.get_status() < 400:
            log_method = log.debug
        elif handler.get_status() < 500:
            log_method = log.warning
        else:
            log_method = log.error
        request_time = 1000.0 * handler.request.request_time()
        log_method("%d %s %.2fms", handler.get_status(),
                   handler._request_summary(), request_time)

    def appendMetric(self, prefix, name, host, device, ts, value):
        if prefix in self._metrics:
            metrics = self._metrics[prefix]
        else:
            metrics = {}
            self._metrics[prefix] = metrics

        if name in metrics:
            metrics[name].append([host, device, ts, value])
        else:
            metrics[name] = [[host, device, ts, value]]

    def _postMetrics(self):

        if len(self._metrics) > 0:
            self._metrics['uuid'] = get_uuid()
            self._metrics['internalHostname'] = get_hostname(self._agentConfig)
            self._metrics['apiKey'] = self._agentConfig['api_key']
            MetricTransaction(json.dumps(self._metrics),
                headers={'Content-Type': 'application/json'})
            self._metrics = {}

    def run(self):
        handlers = [
            (r"/intake/?", AgentInputHandler),
            (r"/api/v1/series/?", ApiInputHandler),
            (r"/status/?", StatusHandler),
        ]

        settings = dict(
            cookie_secret="12oETzKXQAGaYdkL5gEmGeJJFuYh7EQnp2XdTP1o/Vo=",
            xsrf_cookies=False,
            debug=False,
            log_function=self.log_request
        )

        non_local_traffic = self._agentConfig.get("non_local_traffic", False)

        tornado.web.Application.__init__(self, handlers, **settings)
        http_server = tornado.httpserver.HTTPServer(self)

        # non_local_traffic must be == True to match, not just some non-false value
        if non_local_traffic is True:
            http_server.listen(self._port)
        else:
            # localhost in lieu of 127.0.0.1 to support IPv6
            try:
                http_server.listen(self._port, address = "localhost")
            except gaierror:
                log.warning("Warning localhost seems undefined in your host file, using 127.0.0.1 instead")
                http_server.listen(self._port, address = "127.0.0.1")

        log.info("Listening on port %d" % self._port)

        # Register callbacks
        self.mloop = tornado.ioloop.IOLoop.instance()

        logging.getLogger().setLevel(get_logging_config()['log_level'] or logging.INFO)

        def flush_trs():
            if self._watchdog:
                self._watchdog.reset()
            self._postMetrics()
            self._tr_manager.flush()

        tr_sched = tornado.ioloop.PeriodicCallback(flush_trs, TRANSACTION_FLUSH_INTERVAL,
                                                   io_loop=self.mloop)

        # Register optional Graphite listener
        gport = self._agentConfig.get("graphite_listen_port", None)
        if gport is not None:
            log.info("Starting graphite listener on port %s" % gport)
            from graphite import GraphiteServer
            gs = GraphiteServer(self, get_hostname(self._agentConfig), io_loop=self.mloop)
            if non_local_traffic is True:
                gs.listen(gport)
            else:
                gs.listen(gport, address = "localhost")

        # Start everything
        if self._watchdog:
            self._watchdog.reset()
        tr_sched.start()

        self.mloop.start()
        log.info("Stopped")

    def stop(self):
        self.mloop.stop()
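
For context, a minimal way to stand the forwarder above up; the config keys are the ones this class actually reads, and 17123 is the agent's customary forwarder port. The snippet is a hypothetical launch, not taken from the source:

config = {
    "api_key": "YOUR_API_KEY",        # read by _postMetrics
    "non_local_traffic": False,       # bind to localhost only
    "graphite_listen_port": None,     # skip the Graphite listener
}
app = Application(17123, config, watchdog=True)
try:
    app.run()   # blocks in the tornado IOLoop until stop() is called
except KeyboardInterrupt:
    app.stop()
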
Example #34
0
 def busy_run(self):
     w = Watchdog(5)
     w.reset()
     x = 0
     while True:
         x = random()
Example #35
0
class Reporter(threading.Thread):
    """
    The reporter periodically sends the aggregated metrics to the
    server.
    """

    def __init__(self, interval, metrics_aggregator, api_host, api_key=None, use_watchdog=False):
        threading.Thread.__init__(self)
        self.interval = int(interval)
        self.finished = threading.Event()
        self.metrics_aggregator = metrics_aggregator
        self.flush_count = 0

        self.watchdog = None
        if use_watchdog:
            from util import Watchdog
            self.watchdog = Watchdog(WATCHDOG_TIMEOUT)

        self.api_key = api_key
        self.api_host = api_host

        self.http_conn_cls = http_client.HTTPSConnection

        match = re.match('^(https?)://(.*)', api_host)

        if match:
            self.api_host = match.group(2)
            if match.group(1) == 'http':
                self.http_conn_cls = http_client.HTTPConnection

    def stop(self):
        log.info("Stopping reporter")
        self.finished.set()

    def run(self):

        log.info("Reporting to %s every %ss" % (self.api_host, self.interval))
        log.debug("Watchdog enabled: %s" % bool(self.watchdog))

        # Persist a start-up message.
        DogstatsdStatus().persist()

        while not self.finished.isSet(): # Use camel case isSet for 2.4 support.
            self.finished.wait(self.interval)
            self.metrics_aggregator.send_packet_count('datadog.dogstatsd.packet.count')
            self.flush()
            if self.watchdog:
                self.watchdog.reset()

        # Clean up the status messages.
        log.debug("Stopped reporter")
        DogstatsdStatus.remove_latest_status()

    def flush(self):
        try:
            self.flush_count += 1
            packets_per_second = self.metrics_aggregator.packets_per_second(self.interval)
            packet_count = self.metrics_aggregator.total_count

            metrics = self.metrics_aggregator.flush()
            count = len(metrics)
            should_log = self.flush_count < LOGGING_INTERVAL or self.flush_count % LOGGING_INTERVAL == 0
            if not count:
                if should_log:
                    log.info("Flush #%s: No metrics to flush." % self.flush_count)
            else:
                if should_log:
                    log.info("Flush #%s: flushing %s metrics" % (self.flush_count, count))
                self.submit(metrics)

            events = self.metrics_aggregator.flush_events()
            event_count = len(events)
            if not event_count:
                if should_log:
                    log.info("Flush #%s: No events to flush." % self.flush_count)
                else:
                    log.debug("Flush #%s: No events to flush." % self.flush_count)
            else:
                if should_log:
                    log.info("Flush #%s: flushing %s events" % (self.flush_count, len(events)))
                else:
                    log.debug("Flush #%s: flushing %s events" % (self.flush_count, len(events)))
                self.submit_events(events)

            # Persist a status message.
            packet_count = self.metrics_aggregator.total_count
            DogstatsdStatus(
                flush_count=self.flush_count,
                packet_count=packet_count,
                packets_per_second=packets_per_second,
                metric_count=count,
                event_count=event_count
            ).persist()

        except Exception:
            log.exception("Error flushing metrics")
Example #36
0
 def hanging_net(self):
     w = Watchdog(5)
     w.reset()
     x = url.urlopen("http://localhost:31834")  # "url" is presumably urllib2 imported under that alias
     print "ERROR Net call returned", x
     return True
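
Examples #32, #34 and #36 exercise the Watchdog from three angles: a well-behaved loop that resets in time, a busy loop that never resets, and a network call that hangs. The Watchdog itself is not shown; a minimal POSIX stand-in with the same reset() contract, built on signal.alarm (the real implementation may differ), is:

import os
import signal

class Watchdog(object):
    # Stand-in: if reset() is not called again within duration seconds,
    # SIGALRM fires and the process kills itself.
    def __init__(self, duration):
        self._duration = int(duration)
        signal.signal(signal.SIGALRM, self._self_destruct)

    def _self_destruct(self, signum, frame):
        os.kill(os.getpid(), signal.SIGKILL)

    def reset(self):
        signal.alarm(self._duration)
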
Example #37
0
 def fast_tornado(self):
     a = Application(12345, {"bind_host": "localhost"})
     a._watchdog = Watchdog(6)
     a._tr_manager = MockTxManager()
     a.run()
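
MockTxManager in Example #37 is not shown. The test only needs a transaction manager whose flush() is a no-op, since flush_trs() calls it on every PeriodicCallback tick, so a sufficient stub (hypothetical) is:

class MockTxManager(object):
    # Stub: the forwarder's flush loop calls flush(); doing nothing lets
    # the test exercise only the IOLoop and the watchdog.
    def flush(self):
        pass
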
Example #38
0
class Application(tornado.web.Application):

    def __init__(self, port, agentConfig, watchdog=True):
        self._port = int(port)
        self._agentConfig = agentConfig
        self._metrics = {}
        MetricTransaction.set_application(self)
        MetricTransaction.set_endpoints()
        self._tr_manager = TransactionManager(MAX_WAIT_FOR_REPLAY,
            MAX_QUEUE_SIZE, THROTTLING_DELAY)
        MetricTransaction.set_tr_manager(self._tr_manager)

        self._watchdog = None
        if watchdog:
            watchdog_timeout = TRANSACTION_FLUSH_INTERVAL * WATCHDOG_INTERVAL_MULTIPLIER
            self._watchdog = Watchdog(watchdog_timeout)

    def appendMetric(self, prefix, name, host, device, ts, value):
        if prefix in self._metrics:
            metrics = self._metrics[prefix]
        else:
            metrics = {}
            self._metrics[prefix] = metrics

        if name in metrics:
            metrics[name].append([host, device, ts, value])
        else:
            metrics[name] = [[host, device, ts, value]]

    def _postMetrics(self):

        if len(self._metrics) > 0:
            self._metrics['uuid'] = get_uuid()
            self._metrics['internalHostname'] = gethostname(self._agentConfig)
            self._metrics['apiKey'] = self._agentConfig['api_key']
            MetricTransaction(self._metrics, {})
            self._metrics = {}

    def run(self):
        handlers = [
            (r"/intake/?", AgentInputHandler),
            (r"/api/v1/series/?", ApiInputHandler),
            (r"/status/?", StatusHandler),
        ]

        settings = dict(
            cookie_secret="12oETzKXQAGaYdkL5gEmGeJJFuYh7EQnp2XdTP1o/Vo=",
            xsrf_cookies=False,
            debug=False,
        )

        non_local_traffic = self._agentConfig.get("non_local_traffic", False)

        tornado.web.Application.__init__(self, handlers, **settings)
        http_server = tornado.httpserver.HTTPServer(self)

        # set the root logger to warn so tornado is less chatty
        logging.getLogger().setLevel(logging.WARNING)

        # but keep the forwarder logger at the original level
        forwarder_logger = logging.getLogger('forwarder')
        log_config = get_logging_config()
        forwarder_logger.setLevel(log_config['log_level'] or logging.INFO)

        # non_local_traffic must be == True to match, not just some non-false value
        if non_local_traffic is True:
            http_server.listen(self._port)
        else:
            # localhost in lieu of 127.0.0.1 to support IPv6
            try:
                http_server.listen(self._port, address = "localhost")
            except gaierror:
                log.warning("Warning localhost seems undefined in your host file, using 127.0.0.1 instead")
                http_server.listen(self._port, address = "127.0.0.1")

        log.info("Listening on port %d" % self._port)

        # Register callbacks
        self.mloop = tornado.ioloop.IOLoop.instance()

        def flush_trs():
            if self._watchdog:
                self._watchdog.reset()
            self._postMetrics()
            self._tr_manager.flush()

        tr_sched = tornado.ioloop.PeriodicCallback(flush_trs, TRANSACTION_FLUSH_INTERVAL,
                                                   io_loop=self.mloop)

        # Register optional Graphite listener
        gport = self._agentConfig.get("graphite_listen_port", None)
        if gport is not None:
            log.info("Starting graphite listener on port %s" % gport)
            from graphite import GraphiteServer
            gs = GraphiteServer(self, gethostname(self._agentConfig), io_loop=self.mloop)
            if non_local_traffic is True:
                gs.listen(gport)
            else:
                gs.listen(gport, address = "localhost")

        # Start everything
        if self._watchdog:
            self._watchdog.reset()
        tr_sched.start()

        self.mloop.start()
        log.info("Stopped")

    def stop(self):
        self.mloop.stop()
Example #39
0
class Application(tornado.web.Application):
    def __init__(self, port, agentConfig, watchdog=True):
        self._port = int(port)
        self._agentConfig = agentConfig
        self._metrics = {}
        MetricTransaction.set_application(self)
        MetricTransaction.set_endpoints()
        self._tr_manager = TransactionManager(MAX_WAIT_FOR_REPLAY,
                                              MAX_QUEUE_SIZE, THROTTLING_DELAY)
        MetricTransaction.set_tr_manager(self._tr_manager)

        self._watchdog = None
        if watchdog:
            watchdog_timeout = TRANSACTION_FLUSH_INTERVAL * WATCHDOG_INTERVAL_MULTIPLIER
            self._watchdog = Watchdog(watchdog_timeout)

    def appendMetric(self, prefix, name, host, device, ts, value):
        if prefix in self._metrics:
            metrics = self._metrics[prefix]
        else:
            metrics = {}
            self._metrics[prefix] = metrics

        if name in metrics:
            metrics[name].append([host, device, ts, value])
        else:
            metrics[name] = [[host, device, ts, value]]

    def _postMetrics(self):

        if len(self._metrics) > 0:
            self._metrics['uuid'] = get_uuid()
            self._metrics['internalHostname'] = gethostname(self._agentConfig)
            self._metrics['apiKey'] = self._agentConfig['api_key']
            MetricTransaction(self._metrics, {})
            self._metrics = {}

    def run(self):

        handlers = [
            (r"/intake/?", AgentInputHandler),
            (r"/api/v1/series/?", ApiInputHandler),
            (r"/status/?", StatusHandler),
        ]

        settings = dict(
            cookie_secret="12oETzKXQAGaYdkL5gEmGeJJFuYh7EQnp2XdTP1o/Vo=",
            xsrf_cookies=False,
            debug=True,
        )

        tornado.web.Application.__init__(self, handlers, **settings)
        http_server = tornado.httpserver.HTTPServer(self)
        http_server.listen(self._port)
        logging.info("Listening on port %d" % self._port)

        # Register callbacks
        self.mloop = tornado.ioloop.IOLoop.instance()

        def flush_trs():
            if self._watchdog:
                self._watchdog.reset()
            self._postMetrics()
            self._tr_manager.flush()

        tr_sched = tornado.ioloop.PeriodicCallback(flush_trs,
                                                   TRANSACTION_FLUSH_INTERVAL,
                                                   io_loop=self.mloop)

        # Register optional Graphite listener
        gport = self._agentConfig.get("graphite_listen_port", None)
        if gport is not None:
            logging.info("Starting graphite listener on port %s" % gport)
            from graphite import GraphiteServer
            gs = GraphiteServer(self,
                                gethostname(self._agentConfig),
                                io_loop=self.mloop)
            gs.listen(gport)

        # Start everything
        if self._watchdog:
            self._watchdog.reset()
        tr_sched.start()

        self.mloop.start()
        logging.info("Stopped")

    def stop(self):
        self.mloop.stop()
Example #40
0
class Reporter(threading.Thread):
    """
    The reporter periodically sends the aggregated metrics to the
    server.
    """
    def __init__(self,
                 interval,
                 metrics_aggregator,
                 api_host,
                 api_key=None,
                 use_watchdog=False,
                 event_chunk_size=None):
        threading.Thread.__init__(self)
        self.interval = int(interval)
        self.finished = threading.Event()
        self.metrics_aggregator = metrics_aggregator
        self.flush_count = 0
        self.log_count = 0

        self.watchdog = None
        if use_watchdog:
            from util import Watchdog
            self.watchdog = Watchdog(WATCHDOG_TIMEOUT)

        self.api_key = api_key
        self.api_host = api_host
        self.event_chunk_size = event_chunk_size or EVENT_CHUNK_SIZE

        self.http_conn_cls = http_client.HTTPSConnection

        match = re.match('^(https?)://(.*)', api_host)

        if match:
            self.api_host = match.group(2)
            if match.group(1) == 'http':
                self.http_conn_cls = http_client.HTTPConnection

    def stop(self):
        log.info("Stopping reporter")
        self.finished.set()

    def run(self):

        log.info("Reporting to %s every %ss" % (self.api_host, self.interval))
        log.debug("Watchdog enabled: %s" % bool(self.watchdog))

        # Persist a start-up message.
        DogstatsdStatus().persist()

        while not self.finished.isSet():  # Use camel case isSet for 2.4 support.
            self.finished.wait(self.interval)
            self.metrics_aggregator.send_packet_count(
                'datadog.dogstatsd.packet.count')
            self.flush()
            if self.watchdog:
                self.watchdog.reset()

        # Clean up the status messages.
        log.debug("Stopped reporter")
        DogstatsdStatus.remove_latest_status()

    def flush(self):
        try:
            self.flush_count += 1
            self.log_count += 1
            packets_per_second = self.metrics_aggregator.packets_per_second(
                self.interval)
            packet_count = self.metrics_aggregator.total_count

            metrics = self.metrics_aggregator.flush()
            count = len(metrics)
            if self.flush_count % FLUSH_LOGGING_PERIOD == 0:
                self.log_count = 0
            if count:
                self.submit(metrics)

            events = self.metrics_aggregator.flush_events()
            event_count = len(events)
            if event_count:
                self.submit_events(events)

            should_log = self.flush_count <= FLUSH_LOGGING_INITIAL or self.log_count <= FLUSH_LOGGING_COUNT
            log_func = log.info
            if not should_log:
                log_func = log.debug
            log_func("Flush #%s: flushed %s metric%s and %s event%s" %
                     (self.flush_count, count, plural(count), event_count,
                      plural(event_count)))
            if self.flush_count == FLUSH_LOGGING_INITIAL:
                log.info(
                    "First flushes done, %s flushes will be logged every %s flushes."
                    % (FLUSH_LOGGING_COUNT, FLUSH_LOGGING_PERIOD))

            # Persist a status message.
            packet_count = self.metrics_aggregator.total_count
            DogstatsdStatus(
                flush_count=self.flush_count,
                packet_count=packet_count,
                packets_per_second=packets_per_second,
                metric_count=count,
                event_count=event_count,
            ).persist()

        except Exception:
            log.exception("Error flushing metrics")
Example #41
0
class Reporter(threading.Thread):
    """
    The reporter periodically sends the aggregated metrics to the
    server.
    """

    def __init__(self, interval, metrics_aggregator, api_host, api_key=None, use_watchdog=False, event_chunk_size=None):
        threading.Thread.__init__(self)
        self.interval = int(interval)
        self.finished = threading.Event()
        self.metrics_aggregator = metrics_aggregator
        self.flush_count = 0
        self.log_count = 0

        self.watchdog = None
        if use_watchdog:
            from util import Watchdog
            self.watchdog = Watchdog(WATCHDOG_TIMEOUT)

        self.api_key = api_key
        self.api_host = api_host
        self.event_chunk_size = event_chunk_size or EVENT_CHUNK_SIZE

        self.http_conn_cls = http_client.HTTPSConnection

        match = re.match('^(https?)://(.*)', api_host)

        if match:
            self.api_host = match.group(2)
            if match.group(1) == 'http':
                self.http_conn_cls = http_client.HTTPConnection

    def stop(self):
        log.info("Stopping reporter")
        self.finished.set()

    def run(self):

        log.info("Reporting to %s every %ss" % (self.api_host, self.interval))
        log.debug("Watchdog enabled: %s" % bool(self.watchdog))

        # Persist a start-up message.
        DogstatsdStatus().persist()

        while not self.finished.isSet(): # Use camel case isSet for 2.4 support.
            self.finished.wait(self.interval)
            self.metrics_aggregator.send_packet_count('datadog.dogstatsd.packet.count')
            self.flush()
            if self.watchdog:
                self.watchdog.reset()

        # Clean up the status messages.
        log.debug("Stopped reporter")
        DogstatsdStatus.remove_latest_status()

    def flush(self):
        try:
            self.flush_count += 1
            self.log_count += 1
            packets_per_second = self.metrics_aggregator.packets_per_second(self.interval)
            packet_count = self.metrics_aggregator.total_count

            metrics = self.metrics_aggregator.flush()
            count = len(metrics)
            if self.flush_count % FLUSH_LOGGING_PERIOD == 0:
                self.log_count = 0
            if count:
                self.submit(metrics)

            events = self.metrics_aggregator.flush_events()
            event_count = len(events)
            if event_count:
                self.submit_events(events)

            should_log = self.flush_count <= FLUSH_LOGGING_INITIAL or self.log_count <= FLUSH_LOGGING_COUNT
            log_func = log.info
            if not should_log:
                log_func = log.debug
            log_func("Flush #%s: flushed %s metric%s and %s event%s" % (self.flush_count, count, plural(count), event_count, plural(event_count)))
            if self.flush_count == FLUSH_LOGGING_INITIAL:
                log.info("First flushes done, %s flushes will be logged every %s flushes." % (FLUSH_LOGGING_COUNT, FLUSH_LOGGING_PERIOD))

            # Persist a status message.
            packet_count = self.metrics_aggregator.total_count
            DogstatsdStatus(
                flush_count=self.flush_count,
                packet_count=packet_count,
                packets_per_second=packets_per_second,
                metric_count=count,
                event_count=event_count,
            ).persist()

        except Exception:
            log.exception("Error flushing metrics")

    def submit(self, metrics):
        # Copied and pasted from dogapi, because it's a bit of a pain to distribute python
        # dependencies with the agent.
        body, headers = serialize_metrics(metrics)
        method = 'POST'

        params = {}
        if self.api_key:
            params['api_key'] = self.api_key
        url = '/api/v1/series?%s' % urlencode(params)

        start_time = time()
        status = None
        conn = self.http_conn_cls(self.api_host)
        try:
            conn.request(method, url, body, headers)

            #FIXME: add timeout handling code here

            response = conn.getresponse()
            status = response.status
            response.close()
        finally:
            conn.close()
        duration = round((time() - start_time) * 1000.0, 4)
        log.debug("%s %s %s%s (%sms)" % (
                        status, method, self.api_host, url, duration))
        return duration

    def submit_events(self, events):
        headers = {'Content-Type': 'application/json'}
        method = 'POST'

        event_chunk_size = self.event_chunk_size

        for chunk in chunks(events, event_chunk_size):
            payload = {
                'apiKey': self.api_key,
                'events': {
                    'api': chunk
                },
                'uuid': get_uuid(),
                'internalHostname': get_hostname()
            }
            params = {}
            if self.api_key:
                params['api_key'] = self.api_key
            url = '/intake?%s' % urlencode(params)

            status = None
            conn = self.http_conn_cls(self.api_host)
            try:
                start_time = time()
                conn.request(method, url, json.dumps(payload), headers)

                response = conn.getresponse()
                status = response.status
                response.close()
                duration = round((time() - start_time) * 1000.0, 4)
                log.debug("%s %s %s%s (%sms)" % (
                                status, method, self.api_host, url, duration))

            finally:
                conn.close()
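
submit_events posts events in bounded batches via chunks(events, event_chunk_size), which the excerpt does not define. A minimal generator consistent with that call (Python 2, hence xrange) is:

def chunks(seq, size):
    # Yield successive slices of at most size items from seq.
    for i in xrange(0, len(seq), size):
        yield seq[i:i + size]
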
Example #42
0
 def normal_run(self):
     w = Watchdog(2)
     w.reset()
     for i in range(5):
         time.sleep(1)
         w.reset()
Example #43
0
class Reporter(threading.Thread):
    """
    The reporter periodically sends the aggregated metrics to the
    server.
    """

    def __init__(self, interval, metrics_aggregator, api_host, api_key=None, use_watchdog=False, event_chunk_size=None):
        threading.Thread.__init__(self)
        self.interval = int(interval)
        self.finished = threading.Event()
        self.metrics_aggregator = metrics_aggregator
        self.flush_count = 0
        self.log_count = 0

        self.watchdog = None
        if use_watchdog:
            from util import Watchdog
            self.watchdog = Watchdog(WATCHDOG_TIMEOUT)

        self.api_key = api_key
        self.api_host = api_host
        self.event_chunk_size = event_chunk_size or EVENT_CHUNK_SIZE

        self.http_conn_cls = http_client.HTTPSConnection

        match = re.match('^(https?)://(.*)', api_host)

        if match:
            self.api_host = match.group(2)
            if match.group(1) == 'http':
                self.http_conn_cls = http_client.HTTPConnection

    def stop(self):
        log.info("Stopping reporter")
        self.finished.set()

    def run(self):

        log.info("Reporting to %s every %ss" % (self.api_host, self.interval))
        log.debug("Watchdog enabled: %s" % bool(self.watchdog))

        # Persist a start-up message.
        DogstatsdStatus().persist()

        while not self.finished.isSet(): # Use camel case isSet for 2.4 support.
            self.finished.wait(self.interval)
            self.metrics_aggregator.send_packet_count('datadog.dogstatsd.packet.count')
            self.flush()
            if self.watchdog:
                self.watchdog.reset()

        # Clean up the status messages.
        log.debug("Stopped reporter")
        DogstatsdStatus.remove_latest_status()

    def flush(self):
        try:
            self.flush_count += 1
            self.log_count += 1
            packets_per_second = self.metrics_aggregator.packets_per_second(self.interval)
            packet_count = self.metrics_aggregator.total_count

            metrics = self.metrics_aggregator.flush()
            count = len(metrics)
            if self.flush_count % FLUSH_LOGGING_PERIOD == 0:
                self.log_count = 0
            if count:
                self.submit(metrics)

            events = self.metrics_aggregator.flush_events()
            event_count = len(events)
            if event_count:
                self.submit_events(events)

            should_log = self.flush_count <= FLUSH_LOGGING_INITIAL or self.log_count <= FLUSH_LOGGING_COUNT
            log_func = log.info
            if not should_log:
                log_func = log.debug
            log_func("Flush #%s: flushed %s metric%s and %s event%s" % (self.flush_count, count, plural(count), event_count, plural(event_count)))
            if self.flush_count == FLUSH_LOGGING_INITIAL:
                log.info("First flushes done, %s flushes will be logged every %s flushes." % (FLUSH_LOGGING_COUNT, FLUSH_LOGGING_PERIOD))

            # Persist a status message.
            packet_count = self.metrics_aggregator.total_count
            DogstatsdStatus(
                flush_count=self.flush_count,
                packet_count=packet_count,
                packets_per_second=packets_per_second,
                metric_count=count,
                event_count=event_count,
            ).persist()

        except Exception:
            log.exception("Error flushing metrics")
Example #44
0
class Application(tornado.web.Application):

    def __init__(self, port, agentConfig, watchdog=True, skip_ssl_validation=False):
        self._port = int(port)
        self._agentConfig = agentConfig
        self._metrics = {}
        MetricTransaction.set_application(self)
        MetricTransaction.set_endpoints()
        self._tr_manager = TransactionManager(MAX_WAIT_FOR_REPLAY,
            MAX_QUEUE_SIZE, THROTTLING_DELAY)
        MetricTransaction.set_tr_manager(self._tr_manager)

        self._watchdog = None
        self.skip_ssl_validation = skip_ssl_validation or agentConfig.get('skip_ssl_validation', False)
        if self.skip_ssl_validation:
            log.info("Skipping SSL hostname validation, useful when using a transparent proxy")

        if watchdog:
            watchdog_timeout = TRANSACTION_FLUSH_INTERVAL * WATCHDOG_INTERVAL_MULTIPLIER
            self._watchdog = Watchdog(watchdog_timeout,
                max_mem_mb=agentConfig.get('limit_memory_consumption', None))

    def log_request(self, handler):
        """ Override the tornado logging method.
        If everything goes well, log level is DEBUG.
        Otherwise it's WARNING or ERROR depending on the response code. """
        if handler.get_status() < 400:
            log_method = log.debug
        elif handler.get_status() < 500:
            log_method = log.warning
        else:
            log_method = log.error
        request_time = 1000.0 * handler.request.request_time()
        log_method("%d %s %.2fms", handler.get_status(),
                   handler._request_summary(), request_time)

    def appendMetric(self, prefix, name, host, device, ts, value):
        if prefix in self._metrics:
            metrics = self._metrics[prefix]
        else:
            metrics = {}
            self._metrics[prefix] = metrics

        if name in metrics:
            metrics[name].append([host, device, ts, value])
        else:
            metrics[name] = [[host, device, ts, value]]

    def _postMetrics(self):

        if len(self._metrics) > 0:
            self._metrics['uuid'] = get_uuid()
            self._metrics['internalHostname'] = get_hostname(self._agentConfig)
            self._metrics['apiKey'] = self._agentConfig['api_key']
            MetricTransaction(json.dumps(self._metrics),
                headers={'Content-Type': 'application/json'})
            self._metrics = {}

    def run(self):
        handlers = [
            (r"/intake/?", AgentInputHandler),
            (r"/api/v1/series/?", ApiInputHandler),
            (r"/status/?", StatusHandler),
        ]

        settings = dict(
            cookie_secret="12oETzKXQAGaYdkL5gEmGeJJFuYh7EQnp2XdTP1o/Vo=",
            xsrf_cookies=False,
            debug=False,
            log_function=self.log_request
        )

        non_local_traffic = self._agentConfig.get("non_local_traffic", False)

        tornado.web.Application.__init__(self, handlers, **settings)
        http_server = tornado.httpserver.HTTPServer(self)

        # non_local_traffic must be == True to match, not just some non-false value
        if non_local_traffic is True:
            http_server.listen(self._port)
        else:
            # localhost in lieu of 127.0.0.1 to support IPv6
            try:
                http_server.listen(self._port, address = "localhost")
            except gaierror:
                log.warning("Warning localhost seems undefined in your host file, using 127.0.0.1 instead")
                http_server.listen(self._port, address = "127.0.0.1")

        log.info("Listening on port %d" % self._port)

        # Register callbacks
        self.mloop = get_tornado_ioloop()

        logging.getLogger().setLevel(get_logging_config()['log_level'] or logging.INFO)

        def flush_trs():
            if self._watchdog:
                self._watchdog.reset()
            self._postMetrics()
            self._tr_manager.flush()

        tr_sched = tornado.ioloop.PeriodicCallback(flush_trs, TRANSACTION_FLUSH_INTERVAL,
                                                   io_loop=self.mloop)

        # Register optional Graphite listener
        gport = self._agentConfig.get("graphite_listen_port", None)
        if gport is not None:
            log.info("Starting graphite listener on port %s" % gport)
            from graphite import GraphiteServer
            gs = GraphiteServer(self, get_hostname(self._agentConfig), io_loop=self.mloop)
            if non_local_traffic is True:
                gs.listen(gport)
            else:
                gs.listen(gport, address = "localhost")

        # Start everything
        if self._watchdog:
            self._watchdog.reset()
        tr_sched.start()

        self.mloop.start()
        log.info("Stopped")

    def stop(self):
        self.mloop.stop()
Example #45
0
class Reporter(threading.Thread):
    """
    The reporter periodically sends the aggregated metrics to the
    server.
    """

    def __init__(self, interval, metrics_aggregator, api_host, api_key=None, use_watchdog=False):
        threading.Thread.__init__(self)
        self.daemon = True
        self.interval = int(interval)
        self.finished = threading.Event()
        self.metrics_aggregator = metrics_aggregator
        self.flush_count = 0

        self.watchdog = None
        if use_watchdog:
            from util import Watchdog
            self.watchdog = Watchdog(WATCHDOG_TIMEOUT)

        self.api_key = api_key
        self.api_host = api_host

        self.http_conn_cls = http_client.HTTPSConnection

        match = re.match('^(https?)://(.*)', api_host)

        if match:
            self.api_host = match.group(2)
            if match.group(1) == 'http':
                self.http_conn_cls = http_client.HTTPConnection

    def stop(self):
        self.finished.set()

    def run(self):
        logger.info("Reporting to %s every %ss" % (self.api_host, self.interval))
        logger.debug("Watchdog enabled: %s" % bool(self.watchdog))
        while True:
            if self.finished.isSet(): # Use camel case version for 2.4 support.
                break
            self.finished.wait(self.interval)
            self.metrics_aggregator.send_packet_count('datadog.dogstatsd.packet.count')
            self.flush()
            if self.watchdog:
                self.watchdog.reset()

    def flush(self):
        try:
            self.flush_count += 1
            metrics = self.metrics_aggregator.flush()
            count = len(metrics)
            if not count:
                logger.info("Flush #%s: No metrics to flush." % self.flush_count)
                return
            logger.info("Flush #%s: flushing %s metrics" % (self.flush_count, count))
            self.submit(metrics)
        except Exception:
            logger.exception("Error flushing metrics")

    def submit(self, metrics):

        # HACK - Copied and pasted from dogapi, because it's a bit of a pain to distribute python
        # dependencies with the agent.
        conn = self.http_conn_cls(self.api_host)
        body = json.dumps({"series" : metrics})
        headers = {'Content-Type':'application/json'}
        method = 'POST'

        params = {}
        if self.api_key:
            params['api_key'] = self.api_key
        url = '/api/v1/series?%s' % urlencode(params)

        start_time = time()
        conn.request(method, url, body, headers)

        #FIXME: add timeout handling code here

        response = conn.getresponse()
        duration = round((time() - start_time) * 1000.0, 4)
        logger.info("%s %s %s%s (%sms)" % (
                        response.status, method, self.api_host, url, duration))
Example #46
0
 def _get_watchdog(self, check_freq, agentConfig):
     watchdog = None
     if agentConfig.get("watchdog", True):
         watchdog = Watchdog(check_freq * WATCHDOG_MULTIPLIER)
         watchdog.reset()
     return watchdog
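
For completeness, a sketch of how a collector main loop would consume the watchdog returned by _get_watchdog; everything here except the watchdog calls is an assumption, not taken from the source:

# Hypothetical main loop: the watchdog stays quiet only as long as each
# cycle completes within check_freq * WATCHDOG_MULTIPLIER seconds.
watchdog = self._get_watchdog(check_freq, agentConfig)
while run_forever:
    collector.run(checksd=checksd)
    if watchdog:
        watchdog.reset()  # prove liveness once per cycle
    time.sleep(check_freq)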