Exemple #1
0
    def test_apptags(self):
        '''
        Tests that the app tags are sent if specified so
        '''
        agentConfig = {
            'api_key': 'test_apikey',
            'collect_ec2_tags': False,
            'collect_instance_metadata': False,
            'create_dd_check_tags': True,
            'version': 'test',
            'tags': '',
        }

        # Run a single checks.d check as part of the collector.
        redis_config = {
            "init_config": {},
            "instances": [{"host": "localhost", "port": 6379}]
        }
        checks = [load_check('redisdb', redis_config, agentConfig)]

        c = Collector(agentConfig, [], {}, get_hostname(agentConfig))
        payload = c.run({
            'initialized_checks': checks,
            'init_failed_checks': {}
        })

        # We check that the redis DD_CHECK_TAG is sent in the payload
        self.assertTrue('dd_check:redisdb' in payload['host-tags']['system'])
Exemple #2
0
    def __init__(self, args):
        win32serviceutil.ServiceFramework.__init__(self, args)
        self.hWaitStop = win32event.CreateEvent(None, 0, 0, None)
        config = get_config(parse_args=False)

        # Setup the correct options so the agent will use the forwarder
        opts, args = Values({
            'autorestart': False,
            'dd_url': None,
            'use_forwarder': True,
            'disabled_dd': False,
            'profile': False
        }), []
        agentConfig = get_config(parse_args=False, options=opts)
        self.hostname = get_hostname(agentConfig)

        # Watchdog for Windows
        self._collector_heartbeat, self._collector_send_heartbeat = multiprocessing.Pipe(False)
        self._collector_failed_heartbeats = 0
        self._max_failed_heartbeats = \
            MAX_FAILED_HEARTBEATS * agentConfig['check_freq'] / SERVICE_SLEEP_INTERVAL

        # Watch JMXFetch restarts
        self._MAX_JMXFETCH_RESTARTS = 3
        self._count_jmxfetch_restarts = 0

        # Keep a list of running processes so we can start/end as needed.
        # Processes will start started in order and stopped in reverse order.
        self.procs = {
            'forwarder': ProcessWatchDog("forwarder", DDForwarder(config, self.hostname)),
            'collector': ProcessWatchDog("collector", DDAgent(agentConfig, self.hostname,
                                         heartbeat=self._collector_send_heartbeat)),
            'dogstatsd': ProcessWatchDog("dogstatsd", DogstatsdProcess(config, self.hostname)),
            'jmxfetch': ProcessWatchDog("jmxfetch", JMXFetchProcess(config, self.hostname), 3),
        }
Exemple #3
0
    def reload_configs(self, checks_to_reload=set()):
        """Reload the agent configuration and checksd configurations.
           Can also reload only an explicit set of checks."""
        log.info("Attempting a configuration reload...")
        hostname = get_hostname(self._agentConfig)

        # if no check was given, reload them all
        if not checks_to_reload:
            log.debug("No check list was passed, reloading every check")
            # stop checks
            for check in self._checksd.get('initialized_checks', []):
                check.stop()

            self._checksd = load_check_directory(self._agentConfig, hostname)
        else:
            new_checksd = copy(self._checksd)

            self.refresh_specific_checks(hostname, new_checksd, checks_to_reload)
            # once the reload is done, replace existing checks with the new ones
            self._checksd = new_checksd

        # Logging
        num_checks = len(self._checksd['initialized_checks'])
        if num_checks > 0:
            opt_msg = " (refreshed %s checks)" % len(checks_to_reload) if checks_to_reload else ''

            msg = "Check reload was successful. Running {num_checks} checks{opt_msg}.".format(
                num_checks=num_checks, opt_msg=opt_msg)
            log.info(msg)
        else:
            log.info("No checksd configs found")
    def test_apptags(self):
        '''
        Tests that the app tags are sent if specified so
        '''
        agentConfig = {
            'agent_key': 'test_agentkey',
            'collect_ec2_tags': False,
            'collect_orchestrator_tags': False,
            'collect_instance_metadata': False,
            'create_sd_check_tags': True,
            'version': 'test',
            'tags': '',
        }

        # Run a single checks.d check as part of the collector.
        disk_config = {
            "init_config": {},
            "instances": [{}]
        }

        checks = [load_check('disk', disk_config, agentConfig)]

        c = Collector(agentConfig, [], {}, get_hostname(agentConfig))
        payload = c.run({
            'initialized_checks': checks,
            'init_failed_checks': {}
        })

        # We check that the redis SD_CHECK_TAG is sent in the payload
        self.assertTrue('sd_check:disk' in payload['host-tags']['system'])
Exemple #5
0
def init(config_path=None, use_watchdog=False, use_forwarder=False, args=None):
    """Configure the server and the reporting thread.
    """
    c = get_config(parse_args=False, cfg_path=config_path)

    if (not c['use_dogstatsd'] and
            (args and args[0] in ['start', 'restart'] or not args)):
        log.info("Dogstatsd is disabled. Exiting")
        # We're exiting purposefully, so exit with zero (supervisor's expected
        # code). HACK: Sleep a little bit so supervisor thinks we've started cleanly
        # and thus can exit cleanly.
        sleep(4)
        sys.exit(0)

    port = c['dogstatsd_port']
    interval = DOGSTATSD_FLUSH_INTERVAL
    api_key = c['api_key']
    aggregator_interval = DOGSTATSD_AGGREGATOR_BUCKET_SIZE
    non_local_traffic = c['non_local_traffic']
    forward_to_host = c.get('statsd_forward_host')
    forward_to_port = c.get('statsd_forward_port')
    event_chunk_size = c.get('event_chunk_size')
    recent_point_threshold = c.get('recent_point_threshold', None)
    server_host = c['bind_host']

    target = c['dd_url']
    if use_forwarder:
        target = c['dogstatsd_target']

    hostname = get_hostname(c)

    # Create the aggregator (which is the point of communication between the
    # server and reporting threads.
    assert 0 < interval

    aggregator = MetricsBucketAggregator(
        hostname,
        aggregator_interval,
        recent_point_threshold=recent_point_threshold,
        formatter=get_formatter(c),
        histogram_aggregates=c.get('histogram_aggregates'),
        histogram_percentiles=c.get('histogram_percentiles'),
        utf8_decoding=c['utf8_decoding']
    )

    # Start the reporting thread.
    reporter = Reporter(interval, aggregator, target, api_key, use_watchdog, event_chunk_size)

    # NOTICE: when `non_local_traffic` is passed we need to bind to any interface on the box. The forwarder uses
    # Tornado which takes care of sockets creation (more than one socket can be used at once depending on the
    # network settings), so it's enough to just pass an empty string '' to the library.
    # In Dogstatsd we use a single, fullstack socket, so passing '' as the address doesn't work and we default to
    # '0.0.0.0'. If someone needs to bind Dogstatsd to the IPv6 '::', they need to turn off `non_local_traffic` and
    # use the '::' meta address as `bind_host`.
    if non_local_traffic:
        server_host = '0.0.0.0'

    server = Server(aggregator, server_host, port, forward_to_host=forward_to_host, forward_to_port=forward_to_port)

    return reporter, server, c
Exemple #6
0
    def check(self, agentConfig):
        process_exclude_args = agentConfig.get('exclude_process_args', False)
        if process_exclude_args:
            ps_arg = 'aux'
        else:
            ps_arg = 'auxww'
        # Get output from ps
        try:
            output, _, _ = get_subprocess_output(['ps', ps_arg], self.logger)
            processLines = output.splitlines()  # Also removes a trailing empty line

            del processLines[0]  # Removes the headers
        except Exception:
            self.logger.exception('getProcesses')
            return False

        processes = []

        for line in processLines:
            line = line.split(None, 10)
            processes.append(map(lambda s: s.strip(), line))

        return {'processes':   processes,
                'apiKey':      agentConfig['api_key'],
                'host':        get_hostname(agentConfig)}
Exemple #7
0
    def test_collector(self):
        agentConfig = {
            'api_key': 'test_apikey',
            'check_timings': True,
            'collect_ec2_tags': True,
            'collect_instance_metadata': False,
            'create_dd_check_tags': False,
            'version': 'test',
            'tags': '',
        }

        # Run a single checks.d check as part of the collector.
        redis_config = {
            "init_config": {},
            "instances": [{"host": "localhost", "port": 6379}]
        }
        checks = [load_check('redisdb', redis_config, agentConfig)]

        c = Collector(agentConfig, [], {}, get_hostname(agentConfig))
        payload = c.run({
            'initialized_checks': checks,
            'init_failed_checks': {}
        })
        metrics = payload['metrics']

        # Check that we got a timing metric for all checks.
        timing_metrics = [m for m in metrics
            if m[0] == 'datadog.agent.check_run_time']
        all_tags = []
        for metric in timing_metrics:
            all_tags.extend(metric[3]['tags'])
        for check in checks:
            tag = "check:%s" % check.name
            assert tag in all_tags, all_tags
Exemple #8
0
def load_check(name, config, agentConfig):
    if not _is_sdk():
        checksd_path = agentConfig.get('additional_checksd', get_checksd_path(get_os()))

        # find (in checksd_path) and load the check module
        fd, filename, desc = imp.find_module(name, [checksd_path])
        check_module = imp.load_module(name, fd, filename, desc)
    else:
        check_module = _load_sdk_module(name) # parent module

    check_class = None
    classes = inspect.getmembers(check_module, inspect.isclass)
    for _, clsmember in classes:
        if clsmember == AgentCheck:
            continue
        if issubclass(clsmember, AgentCheck):
            check_class = clsmember
            if AgentCheck in clsmember.__bases__:
                continue
            else:
                break
    if check_class is None:
        raise Exception("Unable to import check %s. Missing a class that inherits AgentCheck" % name)

    init_config = config.get('init_config', {})
    instances = config.get('instances')
    agentConfig['checksd_hostname'] = get_hostname(agentConfig)

    # init the check class
    try:
        return check_class(name, init_config, agentConfig, instances=instances)
    except TypeError as e:
        raise Exception("Check is using old API, {0}".format(e))
    except Exception:
        raise
Exemple #9
0
    def __init__(self, name, init_config, agentConfig, instances=None):
        """
        Initialize a new check.

        :param name: The name of the check
        :param init_config: The config for initializing the check
        :param agentConfig: The global configuration for the agent
        :param instances: A list of configuration objects for each instance.
        """
        from aggregator import MetricsAggregator

        self._enabled_checks.append(name)
        self._enabled_checks = list(set(self._enabled_checks))

        self.name = name
        self.init_config = init_config or {}
        self.agentConfig = agentConfig
        self.in_developer_mode = agentConfig.get("developer_mode") and psutil
        self._internal_profiling_stats = None
        self.default_integration_http_timeout = float(agentConfig.get("default_integration_http_timeout", 9))

        self.hostname = agentConfig.get("checksd_hostname") or get_hostname(agentConfig)
        self.log = logging.getLogger("%s.%s" % (__name__, name))

        self.min_collection_interval = self.init_config.get(
            "min_collection_interval", self.DEFAULT_MIN_COLLECTION_INTERVAL
        )

        self.aggregator = MetricsAggregator(
            self.hostname,
            expiry_seconds=self.min_collection_interval + self.DEFAULT_EXPIRY_SECONDS,
            formatter=agent_formatter,
            recent_point_threshold=agentConfig.get("recent_point_threshold", None),
            histogram_aggregates=agentConfig.get("histogram_aggregates"),
            histogram_percentiles=agentConfig.get("histogram_percentiles"),
        )

        self.events = []
        self.service_checks = []
        self.instances = instances or []
        self.warnings = []
        self.library_versions = None
        self.last_collection_time = defaultdict(int)
        self._instance_metadata = []
        self.svc_metadata = []
        self.historate_dict = {}

        # Set proxy settings
        self.proxy_settings = get_proxy(self.agentConfig)
        self._use_proxy = False if init_config is None else init_config.get("use_agent_proxy", True)
        self.proxies = {"http": None, "https": None}
        if self.proxy_settings and self._use_proxy:
            uri = "{host}:{port}".format(host=self.proxy_settings["host"], port=self.proxy_settings["port"])
            if self.proxy_settings["user"] and self.proxy_settings["password"]:
                uri = "{user}:{password}@{uri}".format(
                    user=self.proxy_settings["user"], password=self.proxy_settings["password"], uri=uri
                )
            self.proxies["http"] = "http://{uri}".format(uri=uri)
            self.proxies["https"] = "https://{uri}".format(uri=uri)
Exemple #10
0
    def _postMetrics(self):

        if len(self._metrics) > 0:
            self._metrics['uuid'] = get_uuid()
            self._metrics['internalHostname'] = get_hostname(self._agentConfig)
            self._metrics['apiKey'] = self._agentConfig['api_key']
            MetricTransaction(json.dumps(self._metrics),
                              headers={'Content-Type': 'application/json'})
            self._metrics = {}
Exemple #11
0
def sd_configcheck(agentConfig):
    if agentConfig.get('service_discovery', False):
        # set the TRACE_CONFIG flag to True to make load_check_directory return
        # the source of config objects.
        # Then call load_check_directory here and pass the result to get_sd_configcheck
        # to avoid circular imports
        agentConfig[TRACE_CONFIG] = True
        configs = {
            # check_name: (config_source, config)
        }
        print("\nLoading check configurations...\n\n")
        configs = load_check_directory(agentConfig, get_hostname(agentConfig))
        get_sd_configcheck(agentConfig, configs)
Exemple #12
0
 def __init__(self, cmdline=False, case_id=None):
     self._case_id = case_id
     self._cmdline = cmdline
     self._init_tarfile()
     self._init_permissions_file()
     self._save_logs_path()
     self._config = get_config()
     self._api_key = self._config.get('api_key')
     self._url = "{0}{1}".format(
         get_url_endpoint(self._config.get('dd_url'), endpoint_type='flare'),
         self.DATADOG_SUPPORT_URL
     )
     self._hostname = get_hostname(self._config)
     self._prefix = "datadog-{0}".format(self._hostname)
Exemple #13
0
    def __init__(self, interval, metrics_aggregator, api_host, api_key=None,
                 use_watchdog=False, event_chunk_size=None):
        threading.Thread.__init__(self)
        self.interval = int(interval)
        self.finished = threading.Event()
        self.metrics_aggregator = metrics_aggregator
        self.flush_count = 0
        self.log_count = 0
        self.hostname = get_hostname()

        self.watchdog = None
        if use_watchdog:
            self.watchdog = Watchdog.create(WATCHDOG_TIMEOUT)

        self.api_key = api_key
        self.api_host = api_host
        self.event_chunk_size = event_chunk_size or EVENT_CHUNK_SIZE
Exemple #14
0
    def test_check(self):
        self.run_check_twice(self.config)

        shared_tag = ['instance_url:http://localhost:3835/stats']

        self._test_frontend_metrics(shared_tag)
        self._test_backend_metrics(shared_tag)

        # check was run 2 times
        #       - FRONTEND is reporting OPEN that we ignore
        #       - only the BACKEND aggregate is reporting UP -> OK
        #       - The 3 individual servers are returning no check -> UNKNOWN
        self._test_service_checks()

        # Make sure the service checks aren't tagged with an empty hostname.
        self.assertEquals(self.service_checks[0]['host_name'], get_hostname())

        self.coverage_report()
Exemple #15
0
    def submit_events(self, events):
        headers = {'Content-Type':'application/json'}
        event_chunk_size = self.event_chunk_size

        for chunk in chunks(events, event_chunk_size):
            payload = {
                'apiKey': self.api_key,
                'events': {
                    'api': chunk
                },
                'uuid': get_uuid(),
                'internalHostname': get_hostname()
            }
            params = {}
            if self.api_key:
                params['api_key'] = self.api_key
            url = '%s/intake?%s' % (self.api_host, urlencode(params))

            self.submit_http(url, json.dumps(payload), headers)
Exemple #16
0
    def test_check(self):
        self.run_check_twice(self.config)

        shared_tag = ['instance_url:http://localhost:3835/stats']

        self._test_frontend_metrics(shared_tag)
        self._test_backend_metrics(shared_tag)

        # check was run 2 times
        #       - FRONTEND is reporting OPEN that we ignore
        #       - only the BACKEND aggregate is reporting UP -> OK
        #       - The 3 individual servers are returning no check -> UNKNOWN
        self._test_service_checks()

        # Make sure the service checks aren't tagged with an empty hostname.
        self.assertEquals(self.service_checks[0]['host_name'],
                          get_hostname(config=self.config))

        self.coverage_report()
Exemple #17
0
    def submit_events(self, events):
        headers = {'Content-Type': 'application/json'}
        event_chunk_size = self.event_chunk_size

        for chunk in chunks(events, event_chunk_size):
            payload = {
                'apiKey': self.api_key,
                'events': {
                    'api': chunk
                },
                'uuid': get_uuid(),
                'internalHostname': get_hostname()
            }
            params = {}
            if self.api_key:
                params['api_key'] = self.api_key
            url = '%s/intake?%s' % (self.api_host, urlencode(params))

            self.submit_http(url, json.dumps(payload), headers)
Exemple #18
0
    def reload_configs(self, checks_to_reload=set()):
        """Reload the agent configuration and checksd configurations.
           Can also reload only an explicit set of checks."""
        log.info("Attempting a configuration reload...")
        hostname = get_hostname(self._agentConfig)
        jmx_sd_configs = None

        # if no check was given, reload them all
        if not checks_to_reload:
            log.debug("No check list was passed, reloading every check")
            # stop checks
            for check in self._checksd.get('initialized_checks', []):
                check.stop()

            self._checksd = load_check_directory(self._agentConfig, hostname)
            if self._jmx_service_discovery_enabled:
                jmx_sd_configs = generate_jmx_configs(self._agentConfig, hostname)
        else:
            new_checksd = copy(self._checksd)

            jmx_checks = [check for check in checks_to_reload if check in JMX_CHECKS]
            py_checks = set(checks_to_reload) - set(jmx_checks)
            self.refresh_specific_checks(hostname, new_checksd, py_checks)
            if self._jmx_service_discovery_enabled:
                jmx_sd_configs = generate_jmx_configs(self._agentConfig, hostname, jmx_checks)

            # once the reload is done, replace existing checks with the new ones
            self._checksd = new_checksd

        if jmx_sd_configs:
            self._submit_jmx_service_discovery(jmx_sd_configs)

        # Logging
        num_checks = len(self._checksd['initialized_checks'])
        if num_checks > 0:
            opt_msg = " (refreshed %s checks)" % len(checks_to_reload) if checks_to_reload else ''

            msg = "Check reload was successful. Running {num_checks} checks{opt_msg}.".format(
                num_checks=num_checks, opt_msg=opt_msg)
            log.info(msg)
        else:
            log.info("No checksd configs found")
Exemple #19
0
    def collection(self):
        while not self._event.is_set():
            try:
                current_ts = time.monotonic()

                if self._meta_ts is None or (
                        current_ts - self._meta_ts
                ) >= self._config.get('host_metadata_interval'):
                    metadata = get_metadata(
                        get_hostname(),
                        AGENT_VERSION,
                        start_event=(self._meta_ts is None))
                    self._serializer.submit_metadata(metadata)
                    self._meta_ts = current_ts

                self._collector.run_checks()
                self._serializer.serialize_and_push()
            except Exception:
                log.exception("Unexpected error in last collection run")

            time.sleep(self._config.get('min_collection_interval'))
Exemple #20
0
def load_check(name, config, agentConfig):
    if not _is_sdk():
        checksd_path = get_checksd_path(get_os())

        # find (in checksd_path) and load the check module
        fd, filename, desc = imp.find_module(name, [checksd_path])
        check_module = imp.load_module(name, fd, filename, desc)
    else:
        check_module = __import__("check")

    check_class = None
    classes = inspect.getmembers(check_module, inspect.isclass)
    for _, clsmember in classes:
        if clsmember == AgentCheck:
            continue
        if issubclass(clsmember, AgentCheck):
            check_class = clsmember
            if AgentCheck in clsmember.__bases__:
                continue
            else:
                break
    if check_class is None:
        raise Exception(
            "Unable to import check %s. Missing a class that inherits AgentCheck"
            % name)

    init_config = config.get('init_config', {})
    instances = config.get('instances')
    agentConfig['checksd_hostname'] = get_hostname(agentConfig)

    # init the check class
    try:
        return check_class(name,
                           init_config=init_config,
                           agentConfig=agentConfig,
                           instances=instances)
    except TypeError as e:
        raise Exception("Check is using old API, {0}".format(e))
    except Exception:
        raise
Exemple #21
0
    def __init__(self,
                 interval,
                 metrics_aggregator,
                 api_host,
                 api_key=None,
                 use_watchdog=False,
                 event_chunk_size=None):
        threading.Thread.__init__(self)
        self.interval = int(interval)
        self.finished = threading.Event()
        self.metrics_aggregator = metrics_aggregator
        self.flush_count = 0
        self.log_count = 0
        self.hostname = get_hostname()

        self.watchdog = None
        if use_watchdog:
            self.watchdog = Watchdog.create(WATCHDOG_TIMEOUT)

        self.api_key = api_key
        self.api_host = api_host
        self.event_chunk_size = event_chunk_size or EVENT_CHUNK_SIZE
Exemple #22
0
    def test_collector(self):
        agentConfig = {
            'api_key': 'test_apikey',
            'check_timings': True,
            'collect_ec2_tags': True,
            'collect_instance_metadata': False,
            'create_dd_check_tags': False,
            'version': 'test',
            'tags': '',
        }

        # Run a single checks.d check as part of the collector.
        redis_config = {
            "init_config": {},
            "instances": [{
                "host": "localhost",
                "port": 6379
            }]
        }

        checks = [load_check('redisdb', redis_config, agentConfig)]

        c = Collector(agentConfig, [], {}, get_hostname(agentConfig))
        payload = c.run({
            'initialized_checks': checks,
            'init_failed_checks': {}
        })
        metrics = payload['metrics']

        # Check that we got a timing metric for all checks.
        timing_metrics = [
            m for m in metrics if m[0] == 'datadog.agent.check_run_time'
        ]
        all_tags = []
        for metric in timing_metrics:
            all_tags.extend(metric[3]['tags'])
        for check in checks:
            tag = "check:%s" % check.name
            assert tag in all_tags, all_tags
def start_graphite_listener(port):
    echo_server = GraphiteServer(None, get_hostname(None))
    echo_server.listen(port)
    IOLoop.instance().start()
Exemple #24
0
    def __init__(self, name, init_config, agentConfig, instances=None):
        """
        Initialize a new check.

        :param name: The name of the check
        :param init_config: The config for initializing the check
        :param agentConfig: The global configuration for the agent
        :param instances: A list of configuration objects for each instance.
        """
        from aggregator import MetricsAggregator

        self._enabled_checks.append(name)
        self._enabled_checks = list(set(self._enabled_checks))

        self.name = name
        self.init_config = init_config or {}
        self.agentConfig = agentConfig
        self.in_developer_mode = agentConfig.get('developer_mode') and psutil
        self._internal_profiling_stats = None
        self.default_integration_http_timeout = float(
            agentConfig.get('default_integration_http_timeout', 9))

        self.hostname = agentConfig.get('checksd_hostname') or get_hostname(
            agentConfig)
        self.log = logging.getLogger('%s.%s' % (__name__, name))

        self.min_collection_interval = self.init_config.get(
            'min_collection_interval', self.DEFAULT_MIN_COLLECTION_INTERVAL)

        self.aggregator = MetricsAggregator(
            self.hostname,
            expiry_seconds=self.min_collection_interval +
            self.DEFAULT_EXPIRY_SECONDS,
            formatter=agent_formatter,
            recent_point_threshold=agentConfig.get('recent_point_threshold',
                                                   None),
            histogram_aggregates=agentConfig.get('histogram_aggregates'),
            histogram_percentiles=agentConfig.get('histogram_percentiles'))

        self.events = []
        self.service_checks = []
        self.instances = instances or []
        self.warnings = []
        self.library_versions = None
        self.last_collection_time = defaultdict(int)
        self._instance_metadata = []
        self.svc_metadata = []
        self.historate_dict = {}

        # Set proxy settings
        self.proxy_settings = get_proxy(self.agentConfig)
        self._use_proxy = False if init_config is None else init_config.get(
            "use_agent_proxy", True)
        self.proxies = {
            "http": None,
            "https": None,
        }
        if self.proxy_settings and self._use_proxy:
            uri = "{host}:{port}".format(host=self.proxy_settings['host'],
                                         port=self.proxy_settings['port'])
            if self.proxy_settings['user'] and self.proxy_settings['password']:
                uri = "{user}:{password}@{uri}".format(
                    user=self.proxy_settings['user'],
                    password=self.proxy_settings['password'],
                    uri=uri)
            self.proxies['http'] = "http://{uri}".format(uri=uri)
            self.proxies['https'] = "https://{uri}".format(uri=uri)
Exemple #25
0
def main():
    options, args = get_parsed_args()
    agentConfig = get_config(options=options)
    autorestart = agentConfig.get('autorestart', False)
    hostname = get_hostname(agentConfig)
    in_developer_mode = agentConfig.get('developer_mode')

    COMMANDS_AGENT = [
        'start',
        'stop',
        'restart',
        'status',
        'foreground',
    ]

    COMMANDS_NO_AGENT = [
        'info',
        'check',
        'configcheck',
        'jmx',
        'flare',
    ]

    COMMANDS = COMMANDS_AGENT + COMMANDS_NO_AGENT

    if len(args) < 1:
        sys.stderr.write("Usage: %s %s\n" % (sys.argv[0], "|".join(COMMANDS)))
        return 2

    command = args[0]
    if command not in COMMANDS:
        sys.stderr.write("Unknown command: %s\n" % command)
        return 3

    # TODO: actually kill the start/stop/restart/status command for 5.11
    if command in ['start', 'stop', 'restart', 'status'] and not in_developer_mode:
        logging.error('Please use supervisor to manage the agent')
        return 1

    if command in COMMANDS_AGENT:
        agent = Agent(PidFile(PID_NAME, PID_DIR).get_path(), autorestart, in_developer_mode=in_developer_mode)

    if 'start' == command:
        log.info('Start daemon')
        agent.start()

    elif 'stop' == command:
        log.info('Stop daemon')
        agent.stop()

    elif 'restart' == command:
        log.info('Restart daemon')
        agent.restart()

    elif 'status' == command:
        agent.status()

    elif 'info' == command:
        return Agent.info(verbose=options.verbose)

    elif 'foreground' == command:
        log.info('Agent version %s' % get_version())
        if autorestart:
            # Set-up the supervisor callbacks and fork it.
            logging.info('Running Agent with auto-restart ON')

            def child_func():
                agent.start(foreground=True)

            def parent_func():
                agent.start_event = False

            AgentSupervisor.start(parent_func, child_func)
        else:
            # Run in the standard foreground.
            agent.start(foreground=True)

    elif 'check' == command:
        if len(args) < 2:
            sys.stderr.write(
                "Usage: %s check <check_name> [check_rate]\n"
                "Add check_rate as last argument to compute rates\n"
                % sys.argv[0]
            )
            return 1

        check_name = args[1]
        try:
            import checks.collector
            # Try the old-style check first
            print getattr(checks.collector, check_name)(log).check(agentConfig)
        except Exception:
            # If not an old-style check, try checks.d
            checks = load_check_directory(agentConfig, hostname)
            for check in checks['initialized_checks']:
                if check.name == check_name:
                    if in_developer_mode:
                        check.run = AgentProfiler.wrap_profiling(check.run)

                    cs = Collector.run_single_check(check, verbose=True)
                    print CollectorStatus.render_check_status(cs)

                    if len(args) == 3 and args[2] == 'check_rate':
                        print "Running 2nd iteration to capture rate metrics"
                        time.sleep(1)
                        cs = Collector.run_single_check(check, verbose=True)
                        print CollectorStatus.render_check_status(cs)

                    check.stop()

    elif 'configcheck' == command or 'configtest' == command:
        configcheck()
        sd_configcheck(agentConfig)

    elif 'jmx' == command:
        jmx_command(args[1:], agentConfig)

    elif 'flare' == command:
        Flare.check_user_rights()
        case_id = int(args[1]) if len(args) > 1 else None
        f = Flare(True, case_id)
        f.collect()
        try:
            f.upload()
        except Exception as e:
            print 'The upload failed:\n{0}'.format(str(e))

    return 0
Exemple #26
0
    def run(self, config=None):
        """Main loop of the collector"""

        # Gracefully exit on sigterm.
        signal.signal(signal.SIGTERM, self._handle_sigterm)

        # A SIGUSR1 signals an exit with an autorestart
        signal.signal(signal.SIGUSR1, self._handle_sigusr1)

        # Handle Keyboard Interrupt
        signal.signal(signal.SIGINT, self._handle_sigterm)

        # A SIGHUP signals a configuration reload
        signal.signal(signal.SIGHUP, self._handle_sighup)

        # Save the agent start-up stats.
        CollectorStatus().persist()

        # Intialize the collector.
        if not config:
            config = get_config(parse_args=True)

        self._agentConfig = self._set_agent_config_hostname(config)
        hostname = get_hostname(self._agentConfig)
        systemStats = get_system_stats(
            proc_path=self._agentConfig.get('procfs_path', '/proc').rstrip('/')
        )
        emitters = self._get_emitters()

        # Initialize service discovery
        if self._agentConfig.get('service_discovery'):
            self.sd_backend = get_sd_backend(self._agentConfig)

        if _is_affirmative(self._agentConfig.get('sd_jmx_enable')):
            pipe_path = get_jmx_pipe_path()
            if Platform.is_windows():
                pipe_name = pipe_path.format(pipename=SD_PIPE_NAME)
            else:
                pipe_name = os.path.join(pipe_path, SD_PIPE_NAME)

            if os.access(pipe_path, os.W_OK):
                if not os.path.exists(pipe_name):
                    os.mkfifo(pipe_name)
                self.sd_pipe = os.open(pipe_name, os.O_RDWR) # RW to avoid blocking (will only W)

                # Initialize Supervisor proxy
                self.supervisor_proxy = self._get_supervisor_socket(self._agentConfig)
            else:
                log.debug('Unable to create pipe in temporary directory. JMX service discovery disabled.')

        # Load the checks.d checks
        self._checksd = load_check_directory(self._agentConfig, hostname)

        # Load JMX configs if available
        if self._jmx_service_discovery_enabled:
            jmx_sd_configs = generate_jmx_configs(self._agentConfig, hostname)
            if jmx_sd_configs:
                self._submit_jmx_service_discovery(jmx_sd_configs)

        # Initialize the Collector
        self.collector = Collector(self._agentConfig, emitters, systemStats, hostname)

        # In developer mode, the number of runs to be included in a single collector profile
        try:
            self.collector_profile_interval = int(
                self._agentConfig.get('collector_profile_interval', DEFAULT_COLLECTOR_PROFILE_INTERVAL))
        except ValueError:
            log.warn('collector_profile_interval is invalid. '
                     'Using default value instead (%s).' % DEFAULT_COLLECTOR_PROFILE_INTERVAL)
            self.collector_profile_interval = DEFAULT_COLLECTOR_PROFILE_INTERVAL

        # Configure the watchdog.
        self.check_frequency = int(self._agentConfig['check_freq'])
        watchdog = self._get_watchdog(self.check_frequency)

        # Initialize the auto-restarter
        self.restart_interval = int(self._agentConfig.get('restart_interval', RESTART_INTERVAL))
        self.agent_start = time.time()

        self.allow_profiling = self._agentConfig.get('allow_profiling', True)

        profiled = False
        collector_profiled_runs = 0

        # Run the main loop.
        while self.run_forever:
            # Setup profiling if necessary
            if self.allow_profiling and self.in_developer_mode and not profiled:
                try:
                    profiler = AgentProfiler()
                    profiler.enable_profiling()
                    profiled = True
                except Exception as e:
                    log.warn("Cannot enable profiler: %s" % str(e))

            if self.reload_configs_flag:
                if isinstance(self.reload_configs_flag, set):
                    self.reload_configs(checks_to_reload=self.reload_configs_flag)
                else:
                    self.reload_configs()

            # Do the work. Pass `configs_reloaded` to let the collector know if it needs to
            # look for the AgentMetrics check and pop it out.
            self.collector.run(checksd=self._checksd,
                               start_event=self.start_event,
                               configs_reloaded=True if self.reload_configs_flag else False)

            self.reload_configs_flag = False

            # Look for change in the config template store.
            # The self.sd_backend.reload_check_configs flag is set
            # to True if a config reload is needed.
            if self._agentConfig.get('service_discovery') and self.sd_backend and \
               not self.sd_backend.reload_check_configs:
                try:
                    self.sd_backend.reload_check_configs = get_config_store(
                        self._agentConfig).crawl_config_template()
                except Exception as e:
                    log.warn('Something went wrong while looking for config template changes: %s' % str(e))

            # Check if we should run service discovery
            # The `reload_check_configs` flag can be set through the docker_daemon check or
            # using ConfigStore.crawl_config_template
            if self._agentConfig.get('service_discovery') and self.sd_backend and \
               self.sd_backend.reload_check_configs:
                self.reload_configs_flag = self.sd_backend.reload_check_configs
                self.sd_backend.reload_check_configs = False

            if profiled:
                if collector_profiled_runs >= self.collector_profile_interval:
                    try:
                        profiler.disable_profiling()
                        profiled = False
                        collector_profiled_runs = 0
                    except Exception as e:
                        log.warn("Cannot disable profiler: %s" % str(e))

            # Check if we should restart.
            if self.autorestart and self._should_restart():
                self._do_restart()

            # Only plan for next loop if we will continue, otherwise exit quickly.
            if self.run_forever:
                if watchdog:
                    watchdog.reset()
                if profiled:
                    collector_profiled_runs += 1
                log.debug("Sleeping for {0} seconds".format(self.check_frequency))
                time.sleep(self.check_frequency)

        # Now clean-up.
        try:
            CollectorStatus.remove_latest_status()
        except Exception:
            pass

        # Explicitly kill the process, because it might be running as a daemon.
        log.info("Exiting. Bye bye.")
        sys.exit(0)
Exemple #27
0
    def run(self):
        try:
            hostname = get_hostname()
        except HostnameException as e:
            logging.critical(
                "{} - You can define one in datadog.yaml or in your hosts file"
                .format(e))
            sys.exit(1)

        logging.info("Starting the agent, hostname: %s", hostname)

        # init Forwarder
        logging.info("Starting the Forwarder")
        api_key = config.get('api_key')
        dd_url = config.get('dd_url')
        if not dd_url:
            logging.error('No Datadog URL configured - cannot continue')
            sys.exit(1)
        if not api_key:
            logging.error('No API key configured - cannot continue')
            sys.exit(1)

        # get proxy settings
        proxies = get_proxy()
        logging.debug('Proxy configuration used: %s', proxies)

        # get site url
        forwarder = Forwarder(
            api_key,
            get_site_url(dd_url, site=config.get('site')),
            proxies=proxies,
        )
        forwarder.start()

        # agent aggregator
        aggregator = MetricsAggregator(
            hostname,
            interval=config.get('aggregator_interval'),
            expiry_seconds=(config.get('min_collection_interval') +
                            config.get('aggregator_expiry_seconds')),
            recent_point_threshold=config.get('recent_point_threshold'),
            histogram_aggregates=config.get('histogram_aggregates'),
            histogram_percentiles=config.get('histogram_percentiles'),
        )

        # serializer
        serializer = Serializer(
            aggregator,
            forwarder,
        )

        # instantiate collector
        collector = Collector(config, aggregator)
        collector.load_check_classes()
        collector.instantiate_checks()

        # instantiate AgentRunner
        runner = AgentRunner(collector, serializer, config)

        # instantiate Dogstatsd
        reporter = None
        dsd_server = None
        dsd_enable = config['dogstatsd'].get('enable', False)
        if dsd_enable:
            reporter, dsd_server, _ = init_dogstatsd(config,
                                                     forwarder=forwarder)
            dsd = DogstatsdRunner(dsd_server)

        # instantiate API
        status = {
            'agent': aggregator.stats,
            'forwarder': forwarder.stats,
            'collector': collector.status,
        }
        if dsd_server:
            status['dogstatsd'] = dsd_server.aggregator.stats

        api = APIServer(config, status=status)

        handler = SignalHandler()
        # components
        handler.register('runner', runner)
        handler.register('forwarder', forwarder)
        handler.register('api', api)
        if dsd_enable:
            handler.register('reporter', reporter)
            handler.register('dsd_server', dsd_server)

        # signals
        handler.handle(signal.SIGTERM)
        handler.handle(signal.SIGINT)

        # start signal handler
        handler.start()

        runner.start()
        api.start()

        if dsd_enable:
            reporter.start()
            dsd.start()

            dsd.join()
            logging.info("Dogstatsd server done...")
            try:
                dsd.raise_for_status()
            except Exception as e:
                log.error("There was a problem with the dogstatsd server: %s",
                          e)
                reporter.stop()

        runner.join()
        logging.info("Collector done...")

        api.join()
        logging.info("API done...")

        handler.stop()
        handler.join()
        logging.info("Signal handler done...")

        logging.info("Thank you for shopping at DataDog! Come back soon!")

        sys.exit(0)
Exemple #28
0
def main():
    options, args = get_parsed_args()
    agentConfig = get_config(options=options)
    autorestart = agentConfig.get('autorestart', False)
    hostname = get_hostname(agentConfig)
    in_developer_mode = agentConfig.get('developer_mode')

    COMMANDS_AGENT = [
        'start',
        'stop',
        'restart',
        'status',
        'foreground',
    ]

    COMMANDS_NO_AGENT = [
        'info',
        'check',
        'configcheck',
        'jmx',
        'flare',
    ]

    COMMANDS = COMMANDS_AGENT + COMMANDS_NO_AGENT

    if len(args) < 1:
        sys.stderr.write("Usage: %s %s\n" % (sys.argv[0], "|".join(COMMANDS)))
        return 2

    command = args[0]
    if command not in COMMANDS:
        sys.stderr.write("Unknown command: %s\n" % command)
        return 3

    # TODO: actually kill the start/stop/restart/status command for 5.11
    if command in ['start', 'stop', 'restart', 'status'
                   ] and not in_developer_mode:
        logging.error('Please use supervisor to manage the agent')
        return 1

    if command in COMMANDS_AGENT:
        agent = Agent(PidFile(PID_NAME, PID_DIR).get_path(),
                      autorestart,
                      in_developer_mode=in_developer_mode)

    if 'start' == command:
        log.info('Start daemon')
        agent.start()

    elif 'stop' == command:
        log.info('Stop daemon')
        agent.stop()

    elif 'restart' == command:
        log.info('Restart daemon')
        agent.restart()

    elif 'status' == command:
        agent.status()

    elif 'info' == command:
        return Agent.info(verbose=options.verbose)

    elif 'foreground' == command:
        log.info('Agent version %s' % get_version())
        if autorestart:
            # Set-up the supervisor callbacks and fork it.
            logging.info('Running Agent with auto-restart ON')

            def child_func():
                agent.start(foreground=True)

            def parent_func():
                agent.start_event = False

            AgentSupervisor.start(parent_func, child_func)
        else:
            # Run in the standard foreground.
            agent.start(foreground=True)

    elif 'check' == command:
        if len(args) < 2:
            sys.stderr.write(
                "Usage: %s check <check_name> [check_rate]\n"
                "Add check_rate as last argument to compute rates\n" %
                sys.argv[0])
            return 1

        check_name = args[1]
        try:
            import checks.collector
            # Try the old-style check first
            print getattr(checks.collector, check_name)(log).check(agentConfig)
        except Exception:
            # If not an old-style check, try checks.d
            checks = load_check_directory(agentConfig, hostname)
            for check in checks['initialized_checks']:
                if check.name == check_name:
                    if in_developer_mode:
                        check.run = AgentProfiler.wrap_profiling(check.run)

                    cs = Collector.run_single_check(check, verbose=True)
                    print CollectorStatus.render_check_status(cs)

                    if len(args) == 3 and args[2] == 'check_rate':
                        print "Running 2nd iteration to capture rate metrics"
                        time.sleep(1)
                        cs = Collector.run_single_check(check, verbose=True)
                        print CollectorStatus.render_check_status(cs)

                    check.stop()

    elif 'configcheck' == command or 'configtest' == command:
        configcheck()
        sd_configcheck(agentConfig)

    elif 'jmx' == command:
        jmx_command(args[1:], agentConfig)

    elif 'flare' == command:
        Flare.check_user_rights()
        case_id = int(args[1]) if len(args) > 1 else None
        f = Flare(True, case_id)
        f.collect()
        try:
            f.upload()
        except Exception as e:
            print 'The upload failed:\n{0}'.format(str(e))

    return 0
Exemple #29
0
    def run(self):
        handlers = [
            (r"/intake/?", AgentInputHandler),
            (r"/intake/metrics?", MetricsAgentInputHandler),
            (r"/intake/metadata?", MetadataAgentInputHandler),
            (r"/api/v1/series/?", ApiInputHandler),
            (r"/api/v1/check_run/?", ApiCheckRunHandler),
            (r"/status/?", StatusHandler),
        ]

        settings = dict(
            cookie_secret="12oETzKXQAGaYdkL5gEmGeJJFuYh7EQnp2XdTP1o/Vo=",
            xsrf_cookies=False,
            debug=False,
            log_function=self.log_request
        )

        non_local_traffic = self._agentConfig.get("non_local_traffic", False)

        tornado.web.Application.__init__(self, handlers, **settings)
        http_server = tornado.httpserver.HTTPServer(self)

        try:
            # non_local_traffic must be == True to match, not just some non-false value
            if non_local_traffic is True:
                http_server.listen(self._port)
            else:
                # localhost in lieu of 127.0.0.1 to support IPv6
                try:
                    http_server.listen(self._port, address=self._agentConfig['bind_host'])
                except gaierror:
                    log.warning("localhost seems undefined in your host file, using 127.0.0.1 instead")
                    http_server.listen(self._port, address="127.0.0.1")
                except socket_error as e:
                    if "Errno 99" in str(e):
                        log.warning("IPv6 doesn't seem to be fully supported. Falling back to IPv4")
                        http_server.listen(self._port, address="127.0.0.1")
                    else:
                        raise
        except socket_error as e:
            log.exception("Socket error %s. Is another application listening on the same port ? Exiting", e)
            sys.exit(1)
        except Exception as e:
            log.exception("Uncaught exception. Forwarder is exiting.")
            sys.exit(1)

        log.info("Listening on port %d" % self._port)

        # Register callbacks
        self.mloop = tornado.ioloop.IOLoop.current()

        logging.getLogger().setLevel(get_logging_config()['log_level'] or logging.INFO)

        def flush_trs():
            if self._watchdog:
                self._watchdog.reset()
            self._postMetrics()
            self._tr_manager.flush()

        tr_sched = tornado.ioloop.PeriodicCallback(flush_trs, TRANSACTION_FLUSH_INTERVAL,
                                                   io_loop=self.mloop)

        # Register optional Graphite listener
        gport = self._agentConfig.get("graphite_listen_port", None)
        if gport is not None:
            log.info("Starting graphite listener on port %s" % gport)
            from graphite import GraphiteServer
            gs = GraphiteServer(self, get_hostname(self._agentConfig), io_loop=self.mloop)
            if non_local_traffic is True:
                gs.listen(gport)
            else:
                gs.listen(gport, address="localhost")

        # Start everything
        if self._watchdog:
            self._watchdog.reset()
        tr_sched.start()

        self.mloop.start()
        log.info("Stopped")
def test_get_hostname_bin_nonrfc(subprocess):
    # this would fail validation if specified manually
    # but since it's collected from the OS we let it fly
    assert not is_valid_hostname("subprocess_hostname")
    assert get_hostname() == "subprocess_hostname"
Exemple #31
0
def test_get_hostname_bin(subprocess):
    assert get_hostname() == "subprocess-hostname"
Exemple #32
0
def test_get_hostname_error(subprocess, socket):
    with pytest.raises(Exception) as err:
        get_hostname()
    assert "Unable to reliably determine hostname or hostname not RFC1123 compliant." in str(
        err)
def test_get_hostname_error(subprocess, socket):
    with pytest.raises(Exception) as err:
        get_hostname()
    assert "Unable to reliably determine host name. You can define one in datadog.conf or in your hosts file" in str(
        err)
    def run(self):
        try:
            hostname = get_hostname()
        except HostnameException as e:
            logging.critical(
                "{} - You can define one in datadog.yaml or in your hosts file"
                .format(e))
            sys.exit(1)

        logging.info("Starting the agent, hostname: %s", hostname)

        # init Forwarder
        logging.info("Starting the Forwarder")
        api_key = config.get('api_key')
        dd_url = config.get('dd_url')
        if not dd_url:
            logging.error('No Datadog URL configured - cannot continue')
            sys.exit(1)
        if not api_key:
            logging.error('No API key configured - cannot continue')
            sys.exit(1)

        # get proxy settings
        proxies = get_proxy()
        logging.debug('Proxy configuration used: %s', proxies)

        forwarder = Forwarder(
            api_key,
            dd_url,
            proxies=proxies,
        )
        forwarder.start()

        # aggregator
        aggregator = MetricsAggregator(
            hostname,
            interval=config.get('aggregator_interval'),
            expiry_seconds=(config.get('min_collection_interval') +
                            config.get('aggregator_expiry_seconds')),
            recent_point_threshold=config.get('recent_point_threshold'),
            histogram_aggregates=config.get('histogram_aggregates'),
            histogram_percentiles=config.get('histogram_percentiles'),
        )

        # serializer
        serializer = Serializer(
            aggregator,
            forwarder,
        )

        # instantiate collector
        collector = Collector(config, aggregator)
        collector.load_check_classes()
        collector.instantiate_checks()

        # instantiate AgentRunner
        runner = AgentRunner(collector, serializer, config)

        # instantiate API
        api = APIServer(config, aggregator.stats)

        handler = SignalHandler()
        # components
        handler.register('runner', runner)
        handler.register('forwarder', forwarder)
        handler.register('api', api)
        # signals
        handler.handle(signal.SIGTERM)
        handler.handle(signal.SIGINT)

        # start signal handler
        handler.start()

        runner.start()
        api.start()

        runner.join()
        logging.info("Agent done...")

        api.join()
        logging.info("API done...")

        handler.stop()
        handler.join()
        logging.info("Signal handler done...")

        logging.info("Thank you for shopping at DataDog! Come back soon!")

        sys.exit(0)
Exemple #35
0
def start_graphite_listener(port):
    echo_server = GraphiteServer(None, get_hostname(None))
    echo_server.listen(port)
    IOLoop.instance().start()
Exemple #36
0
    def run(self):
        handlers = [
            (r"/intake/?", AgentInputHandler),
            (r"/intake/metrics?", MetricsAgentInputHandler),
            (r"/intake/metadata?", MetadataAgentInputHandler),
            (r"/api/v1/series/?", ApiInputHandler),
            (r"/api/v1/check_run/?", ApiCheckRunHandler),
            (r"/status/?", StatusHandler),
        ]

        settings = dict(
            cookie_secret="12oETzKXQAGaYdkL5gEmGeJJFuYh7EQnp2XdTP1o/Vo=",
            xsrf_cookies=False,
            debug=False,
            log_function=self.log_request)

        non_local_traffic = self._agentConfig.get("non_local_traffic", False)

        tornado.web.Application.__init__(self, handlers, **settings)
        http_server = tornado.httpserver.HTTPServer(self)

        try:
            # non_local_traffic must be == True to match, not just some non-false value
            if non_local_traffic is True:
                http_server.listen(self._port)
            else:
                # localhost in lieu of 127.0.0.1 to support IPv6
                try:
                    http_server.listen(self._port,
                                       address=self._agentConfig['bind_host'])
                except gaierror:
                    log.warning(
                        "localhost seems undefined in your host file, using 127.0.0.1 instead"
                    )
                    http_server.listen(self._port, address="127.0.0.1")
                except socket_error as e:
                    if "Errno 99" in str(e):
                        log.warning(
                            "IPv6 doesn't seem to be fully supported. Falling back to IPv4"
                        )
                        http_server.listen(self._port, address="127.0.0.1")
                    else:
                        raise
        except socket_error as e:
            log.exception(
                "Socket error %s. Is another application listening on the same port ? Exiting",
                e)
            sys.exit(1)
        except Exception as e:
            log.exception("Uncaught exception. Forwarder is exiting.")
            sys.exit(1)

        log.info("Listening on port %d" % self._port)

        # Register callbacks
        self.mloop = tornado.ioloop.IOLoop.current()

        logging.getLogger().setLevel(get_logging_config()['log_level']
                                     or logging.INFO)

        def flush_trs():
            if self._watchdog:
                self._watchdog.reset()
            self._postMetrics()
            self._tr_manager.flush()

        tr_sched = tornado.ioloop.PeriodicCallback(flush_trs,
                                                   TRANSACTION_FLUSH_INTERVAL,
                                                   io_loop=self.mloop)

        # Register optional Graphite listener
        gport = self._agentConfig.get("graphite_listen_port", None)
        if gport is not None:
            log.info("Starting graphite listener on port %s" % gport)
            from graphite import GraphiteServer
            gs = GraphiteServer(self,
                                get_hostname(self._agentConfig),
                                io_loop=self.mloop)
            if non_local_traffic is True:
                gs.listen(gport)
            else:
                gs.listen(gport, address="localhost")

        # Start everything
        if self._watchdog:
            self._watchdog.reset()
        tr_sched.start()

        self.mloop.start()
        log.info("Stopped")
Exemple #37
0
def start():
    """
    Dummy start until we have a collector
    """
    init_agent()

    hostname = get_hostname()

    logging.info("Starting the agent, hostname: %s", hostname)

    # init Forwarder
    logging.info("Starting the Forwarder")
    api_key = config.get('api_key')
    dd_url = config.get('dd_url')
    if not dd_url:
        logging.error('No Datadog URL configured - cannot continue')
        sys.exit(1)
    if not api_key:
        logging.error('No API key configured - cannot continue')
        sys.exit(1)

    forwarder = Forwarder(api_key, dd_url)
    forwarder.start()

    # aggregator
    aggregator = MetricsAggregator(
        hostname,
        interval=config.get('aggregator_interval'),
        expiry_seconds=(config.get('min_collection_interval') +
                        config.get('aggregator_expiry_seconds')),
        recent_point_threshold=config.get('recent_point_threshold'),
        histogram_aggregates=config.get('histogram_aggregates'),
        histogram_percentiles=config.get('histogram_percentiles'),
    )

    # serializer
    serializer = Serializer(
        aggregator,
        forwarder,
    )

    # instantiate collector
    collector = Collector(config, aggregator)
    collector.load_check_classes()
    collector.instantiate_checks()

    def signal_handler(signal, frame):
        logging.info("SIGINT received: stopping the agent")
        logging.info("Stopping the forwarder")
        forwarder.stop()
        logging.info("See you !")
        sys.exit(0)

    signal.signal(signal.SIGINT, signal_handler)

    # update the metadata periodically?
    metadata = get_metadata(hostname)
    serializer.submit_metadata(metadata)
    while True:
        collector.run_checks()
        serializer.serialize_and_push()
        time.sleep(config.get('min_collection_interval'))
Exemple #38
0
    def run(self, config=None):
        """Main loop of the collector"""

        # Gracefully exit on sigterm.
        signal.signal(signal.SIGTERM, self._handle_sigterm)

        if not Platform.is_windows():
            # A SIGUSR1 signals an exit with an autorestart
            signal.signal(signal.SIGUSR1, self._handle_sigusr1)

            # Handle Keyboard Interrupt
            signal.signal(signal.SIGINT, self._handle_sigterm)

            # A SIGHUP signals a configuration reload
            signal.signal(signal.SIGHUP, self._handle_sighup)

        # Save the agent start-up stats.
        CollectorStatus().persist()

        # Intialize the collector.
        if not config:
            config = get_config(parse_args=True)

        self._agentConfig = self._set_agent_config_hostname(config)
        hostname = get_hostname(self._agentConfig)
        systemStats = get_system_stats(proc_path=self._agentConfig.get(
            'procfs_path', '/proc').rstrip('/'))
        emitters = self._get_emitters()

        # Initialize service discovery
        if self._agentConfig.get('service_discovery'):
            self.sd_backend = get_sd_backend(self._agentConfig)

        if _is_affirmative(self._agentConfig.get('sd_jmx_enable', False)):
            pipe_path = get_jmx_pipe_path()
            if Platform.is_windows():
                pipe_name = pipe_path.format(pipename=SD_PIPE_NAME)
            else:
                pipe_name = os.path.join(pipe_path, SD_PIPE_NAME)

            if os.access(pipe_path, os.W_OK):
                if not os.path.exists(pipe_name):
                    os.mkfifo(pipe_name)
                self.sd_pipe = os.open(
                    pipe_name, os.O_RDWR)  # RW to avoid blocking (will only W)

                # Initialize Supervisor proxy
                self.supervisor_proxy = self._get_supervisor_socket(
                    self._agentConfig)
            else:
                log.debug(
                    'Unable to create pipe in temporary directory. JMX service discovery disabled.'
                )

        # Load the checks.d checks
        self._checksd = load_check_directory(self._agentConfig, hostname)

        # Load JMX configs if available
        if self._jmx_service_discovery_enabled:
            jmx_sd_configs = generate_jmx_configs(self._agentConfig, hostname)
            if jmx_sd_configs:
                self._submit_jmx_service_discovery(jmx_sd_configs)

        # Initialize the Collector
        self.collector = Collector(self._agentConfig, emitters, systemStats,
                                   hostname)

        # In developer mode, the number of runs to be included in a single collector profile
        try:
            self.collector_profile_interval = int(
                self._agentConfig.get('collector_profile_interval',
                                      DEFAULT_COLLECTOR_PROFILE_INTERVAL))
        except ValueError:
            log.warn('collector_profile_interval is invalid. '
                     'Using default value instead (%s).' %
                     DEFAULT_COLLECTOR_PROFILE_INTERVAL)
            self.collector_profile_interval = DEFAULT_COLLECTOR_PROFILE_INTERVAL

        # Configure the watchdog.
        self.check_frequency = int(self._agentConfig['check_freq'])
        watchdog = self._get_watchdog(self.check_frequency)

        # Initialize the auto-restarter
        self.restart_interval = int(
            self._agentConfig.get('restart_interval', RESTART_INTERVAL))
        self.agent_start = time.time()

        self.allow_profiling = self._agentConfig.get('allow_profiling', True)

        profiled = False
        collector_profiled_runs = 0

        # Run the main loop.
        while self.run_forever:
            # Setup profiling if necessary
            if self.allow_profiling and self.in_developer_mode and not profiled:
                try:
                    profiler = AgentProfiler()
                    profiler.enable_profiling()
                    profiled = True
                except Exception as e:
                    log.warn("Cannot enable profiler: %s" % str(e))

            if self.reload_configs_flag:
                if isinstance(self.reload_configs_flag, set):
                    self.reload_configs(
                        checks_to_reload=self.reload_configs_flag)
                else:
                    self.reload_configs()

            # Do the work. Pass `configs_reloaded` to let the collector know if it needs to
            # look for the AgentMetrics check and pop it out.
            self.collector.run(
                checksd=self._checksd,
                start_event=self.start_event,
                configs_reloaded=True if self.reload_configs_flag else False)

            self.reload_configs_flag = False

            # Look for change in the config template store.
            # The self.sd_backend.reload_check_configs flag is set
            # to True if a config reload is needed.
            if self._agentConfig.get('service_discovery') and self.sd_backend and \
               not self.sd_backend.reload_check_configs:
                try:
                    self.sd_backend.reload_check_configs = get_config_store(
                        self._agentConfig).crawl_config_template()
                except Exception as e:
                    log.warn(
                        'Something went wrong while looking for config template changes: %s'
                        % str(e))

            # Check if we should run service discovery
            # The `reload_check_configs` flag can be set through the docker_daemon check or
            # using ConfigStore.crawl_config_template
            if self._agentConfig.get('service_discovery') and self.sd_backend and \
               self.sd_backend.reload_check_configs:
                self.reload_configs_flag = self.sd_backend.reload_check_configs
                self.sd_backend.reload_check_configs = False

            if profiled:
                if collector_profiled_runs >= self.collector_profile_interval:
                    try:
                        profiler.disable_profiling()
                        profiled = False
                        collector_profiled_runs = 0
                    except Exception as e:
                        log.warn("Cannot disable profiler: %s" % str(e))

            # Check if we should restart.
            if self.autorestart and self._should_restart():
                self._do_restart()

            # Only plan for next loop if we will continue, otherwise exit quickly.
            if self.run_forever:
                if watchdog:
                    watchdog.reset()
                if profiled:
                    collector_profiled_runs += 1
                log.debug("Sleeping for {0} seconds".format(
                    self.check_frequency))
                time.sleep(self.check_frequency)

        # Now clean-up.
        try:
            CollectorStatus.remove_latest_status()
        except Exception:
            pass

        # Explicitly kill the process, because it might be running as a daemon.
        log.info("Exiting. Bye bye.")
        sys.exit(0)
Exemple #39
0
    def run(self):
        try:
            hostname = get_hostname()
        except HostnameException as e:
            logging.critical(
                "{} - You can define one in datadog.yaml or in your hosts file"
                .format(e))
            sys.exit(1)

        logging.info("Starting the agent, hostname: %s", hostname)

        # init Forwarder
        logging.info("Starting the Forwarder")
        api_key = config.get('api_key')
        dd_url = config.get('dd_url')
        if not dd_url:
            logging.error('No Datadog URL configured - cannot continue')
            sys.exit(1)
        if not api_key:
            logging.error('No API key configured - cannot continue')
            sys.exit(1)

        # get proxy settings
        proxies = get_proxy()
        logging.debug('Proxy configuration used: %s', proxies)

        forwarder = Forwarder(
            api_key,
            dd_url,
            proxies=proxies,
        )
        forwarder.start()

        # aggregator
        aggregator = MetricsAggregator(
            hostname,
            interval=config.get('aggregator_interval'),
            expiry_seconds=(config.get('min_collection_interval') +
                            config.get('aggregator_expiry_seconds')),
            recent_point_threshold=config.get('recent_point_threshold'),
            histogram_aggregates=config.get('histogram_aggregates'),
            histogram_percentiles=config.get('histogram_percentiles'),
        )

        # serializer
        serializer = Serializer(
            aggregator,
            forwarder,
        )

        # instantiate collector
        collector = Collector(config, aggregator)
        collector.load_check_classes()
        collector.instantiate_checks()

        # instantiate AgentRunner
        runner = AgentRunner(collector, serializer, config)

        # instantiate API
        api = APIServer(8888, aggregator.stats)

        def signal_handler(signal, frame):
            log.info("SIGINT received: stopping the agent")
            log.info("Stopping the forwarder")
            runner.stop()
            forwarder.stop()
            api.stop()
            log.info("See you !")
            sys.exit(0)

        signal.signal(signal.SIGINT, signal_handler)

        runner.start()
        api.run()  # blocking tornado in main thread
Exemple #40
0
def test_get_hostname_conf():
    config.set("hostname", "test-hostname")
    assert get_hostname() == "test-hostname"
    config.reset("hostname")
Exemple #41
0
def init(config_path=None, use_watchdog=False, use_forwarder=False, args=None):
    """Configure the server and the reporting thread.
    """
    c = get_config(parse_args=False, cfg_path=config_path)

    if (not c['use_dogstatsd']
            and (args and args[0] in ['start', 'restart'] or not args)):
        log.info("StsStatsd is disabled. Exiting")
        # We're exiting purposefully, so exit with zero (supervisor's expected
        # code). HACK: Sleep a little bit so supervisor thinks we've started cleanly
        # and thus can exit cleanly.
        sleep(4)
        sys.exit(0)

    port = c['dogstatsd_port']
    interval = DOGSTATSD_FLUSH_INTERVAL
    api_key = c['api_key']
    aggregator_interval = DOGSTATSD_AGGREGATOR_BUCKET_SIZE
    non_local_traffic = c['non_local_traffic']
    forward_to_host = c.get('statsd_forward_host')
    forward_to_port = c.get('statsd_forward_port')
    event_chunk_size = c.get('event_chunk_size')
    recent_point_threshold = c.get('recent_point_threshold', None)
    server_host = c['bind_host']

    target = c['dd_url']
    if use_forwarder:
        target = c['dogstatsd_target']

    hostname = get_hostname(c)

    # Create the aggregator (which is the point of communication between the
    # server and reporting threads.
    assert 0 < interval

    aggregator = MetricsBucketAggregator(
        hostname,
        aggregator_interval,
        recent_point_threshold=recent_point_threshold,
        formatter=get_formatter(c),
        histogram_aggregates=c.get('histogram_aggregates'),
        histogram_percentiles=c.get('histogram_percentiles'),
        utf8_decoding=c['utf8_decoding'])

    # Start the reporting thread.
    reporter = Reporter(interval, aggregator, target, api_key, use_watchdog,
                        event_chunk_size)

    # NOTICE: when `non_local_traffic` is passed we need to bind to any interface on the box. The forwarder uses
    # Tornado which takes care of sockets creation (more than one socket can be used at once depending on the
    # network settings), so it's enough to just pass an empty string '' to the library.
    # In Dogstatsd we use a single, fullstack socket, so passing '' as the address doesn't work and we default to
    # '0.0.0.0'. If someone needs to bind Dogstatsd to the IPv6 '::', they need to turn off `non_local_traffic` and
    # use the '::' meta address as `bind_host`.
    if non_local_traffic:
        server_host = '0.0.0.0'

    server = Server(aggregator,
                    server_host,
                    port,
                    forward_to_host=forward_to_host,
                    forward_to_port=forward_to_port)

    return reporter, server, c
Exemple #42
0
def test_get_hostname_socket(subprocess, socket):
    assert get_hostname() == "socket-hostname"
Exemple #43
0
 def __init__(self, aggregator, forwarder):
     self._aggregator = aggregator
     self._forwarder = forwarder
     self._internal_hostname = get_hostname()
Exemple #44
0
def init_dogstatsd(config, forwarder=None):
    api_key = config['api_key']
    recent_point_threshold = config.get('recent_point_threshold', None)
    server_host = config['dogstatsd']['bind_host']
    dd_url = config['dd_url']
    port = config['dogstatsd']['port']
    forward_to_host = config['dogstatsd'].get('forward_host')
    forward_to_port = config['dogstatsd'].get('forward_port')
    non_local_traffic = config['dogstatsd'].get('non_local_traffic')
    so_rcvbuf = config['dogstatsd'].get('so_rcvbuf')
    utf8_decoding = config['dogstatsd'].get('utf8_decoding')

    interval = DOGSTATSD_FLUSH_INTERVAL
    aggregator_interval = DOGSTATSD_AGGREGATOR_BUCKET_SIZE

    hostname = get_hostname()

    # get proxy settings
    proxies = get_proxy()

    if not forwarder:
        forwarder = Forwarder(
            api_key,
            get_site_url(dd_url, site=config.get('site')),
            proxies=proxies,
        )

    aggregator = MetricsBucketAggregator(
        hostname,
        aggregator_interval,
        recent_point_threshold=recent_point_threshold,
        formatter=get_formatter(config),
        histogram_aggregates=config.get('histogram_aggregates'),
        histogram_percentiles=config.get('histogram_percentiles'),
        utf8_decoding=utf8_decoding)
    # serializer
    serializer = Serializer(
        aggregator,
        forwarder,
    )

    reporter = Reporter(interval,
                        aggregator,
                        serializer,
                        api_key,
                        use_watchdog=False,
                        hostname=hostname)

    # NOTICE: when `non_local_traffic` is passed we need to bind to any interface on the box. The forwarder uses
    # Tornado which takes care of sockets creation (more than one socket can be used at once depending on the
    # network settings), so it's enough to just pass an empty string '' to the library.
    # In Dogstatsd we use a single, fullstack socket, so passing '' as the address doesn't work and we default to
    # '0.0.0.0'. If someone needs to bind Dogstatsd to the IPv6 '::', they need to turn off `non_local_traffic` and
    # use the '::' meta address as `bind_host`.
    if non_local_traffic:
        server_host = '0.0.0.0'

    server = Server(aggregator,
                    server_host,
                    port,
                    forward_to_host=forward_to_host,
                    forward_to_port=forward_to_port,
                    so_rcvbuf=so_rcvbuf)

    return reporter, server, forwarder
Exemple #45
0
    def test_topology_collection(self):
        agentConfig = {
            'api_key': 'test_apikey',
            'check_timings': True,
            'collect_ec2_tags': True,
            'collect_orchestrator_tags': False,
            'collect_instance_metadata': False,
            'create_dd_check_tags': False,
            'version': 'test',
            'tags': '',
        }

        # Run a single checks.d check as part of the collector.
        dummy_topology_check_config = {
            "init_config": {},
            "instances": [{
                "dummy_instance": "dummy_instance"
            }]
        }

        # create dummy checks, creating two component and 1 relation
        check1 = DummyTopologyCheck(
            1,
            'dummy_topology_check',
            dummy_topology_check_config.get('init_config'),
            agentConfig,
            instances=[{
                "instance_id": 1,
                "pass": True
            }, {
                "instance_id": 2,
                "pass": True
            }])
        check2 = DummyTopologyCheck(
            2,
            'dummy_topology_check',
            dummy_topology_check_config.get('init_config'),
            agentConfig,
            instances=[{
                "instance_id": 3,
                "pass": True
            }, {
                "instance_id": 4,
                "pass": True
            }],
            snapshot=True)

        emitted_topologies = []

        # mock emitter to pick up data emitted by the collector
        def mock_emitter(message, log, agentConfig, endpoint):
            emitted_topologies.extend(message['topologies'])

        c = Collector(agentConfig, [mock_emitter], {},
                      get_hostname(agentConfig))
        payload, _ = c.run({
            'initialized_checks': [check1, check2],
            'init_failed_checks': {}
        })
        topologies = payload['topologies']

        def assertTopology(topology, check, instance_id):
            self.assertEquals(topology['instance'],
                              check.instance_key(instance_id))
            self.assertEquals(len(topology['components']), 2)
            self.assertEquals(len(topology['relations']), 1)
            self.assertEquals(check.expected_components(instance_id),
                              topology['components'])
            self.assertEquals(check.expected_relations(),
                              topology['relations'])
            if check.snapshot:
                self.assertTrue(topology["start_snapshot"])
                self.assertTrue(topology["stop_snapshot"])
            else:
                self.assertTrue("start_snapshot" not in topology)
                self.assertTrue("stop_snapshot" not in topology)


# Make sure the emissions of the collector are observed

        assertTopology(topologies[0], check1, 1)
        assertTopology(topologies[1], check1, 2)
        assertTopology(topologies[2], check2, 4)
        assertTopology(topologies[3], check2, 3)

        assertTopology(emitted_topologies[0], check1, 1)
        assertTopology(emitted_topologies[1], check1, 2)
        assertTopology(emitted_topologies[2], check2, 4)
        assertTopology(emitted_topologies[3], check2, 3)