def test_apptags(self):
    '''
    Tests that the app tags are sent if specified so
    '''
    agentConfig = {
        'api_key': 'test_apikey',
        'collect_ec2_tags': False,
        'collect_instance_metadata': False,
        'create_dd_check_tags': True,
        'version': 'test',
        'tags': '',
    }

    # Run a single checks.d check as part of the collector.
    redis_config = {
        "init_config": {},
        "instances": [{"host": "localhost", "port": 6379}]
    }
    checks = [load_check('redisdb', redis_config, agentConfig)]

    c = Collector(agentConfig, [], {}, get_hostname(agentConfig))
    payload = c.run({
        'initialized_checks': checks,
        'init_failed_checks': {}
    })

    # We check that the redis DD_CHECK_TAG is sent in the payload
    self.assertTrue('dd_check:redisdb' in payload['host-tags']['system'])
def __init__(self, args):
    win32serviceutil.ServiceFramework.__init__(self, args)
    self.hWaitStop = win32event.CreateEvent(None, 0, 0, None)
    config = get_config(parse_args=False)

    # Setup the correct options so the agent will use the forwarder
    opts, args = Values({
        'autorestart': False,
        'dd_url': None,
        'use_forwarder': True,
        'disabled_dd': False,
        'profile': False
    }), []
    agentConfig = get_config(parse_args=False, options=opts)
    self.hostname = get_hostname(agentConfig)

    # Watchdog for Windows
    self._collector_heartbeat, self._collector_send_heartbeat = multiprocessing.Pipe(False)
    self._collector_failed_heartbeats = 0
    self._max_failed_heartbeats = \
        MAX_FAILED_HEARTBEATS * agentConfig['check_freq'] / SERVICE_SLEEP_INTERVAL

    # Watch JMXFetch restarts
    self._MAX_JMXFETCH_RESTARTS = 3
    self._count_jmxfetch_restarts = 0

    # Keep a list of running processes so we can start/end as needed.
    # Processes will be started in order and stopped in reverse order.
    self.procs = {
        'forwarder': ProcessWatchDog("forwarder", DDForwarder(config, self.hostname)),
        'collector': ProcessWatchDog("collector", DDAgent(agentConfig, self.hostname,
                                                          heartbeat=self._collector_send_heartbeat)),
        'dogstatsd': ProcessWatchDog("dogstatsd", DogstatsdProcess(config, self.hostname)),
        'jmxfetch': ProcessWatchDog("jmxfetch", JMXFetchProcess(config, self.hostname), 3),
    }
def reload_configs(self, checks_to_reload=set()):
    """Reload the agent configuration and checksd configurations.
    Can also reload only an explicit set of checks."""
    log.info("Attempting a configuration reload...")
    hostname = get_hostname(self._agentConfig)

    # if no check was given, reload them all
    if not checks_to_reload:
        log.debug("No check list was passed, reloading every check")
        # stop checks
        for check in self._checksd.get('initialized_checks', []):
            check.stop()
        self._checksd = load_check_directory(self._agentConfig, hostname)
    else:
        new_checksd = copy(self._checksd)
        self.refresh_specific_checks(hostname, new_checksd, checks_to_reload)
        # once the reload is done, replace existing checks with the new ones
        self._checksd = new_checksd

    # Logging
    num_checks = len(self._checksd['initialized_checks'])
    if num_checks > 0:
        opt_msg = " (refreshed %s checks)" % len(checks_to_reload) if checks_to_reload else ''
        msg = "Check reload was successful. Running {num_checks} checks{opt_msg}.".format(
            num_checks=num_checks, opt_msg=opt_msg)
        log.info(msg)
    else:
        log.info("No checksd configs found")
def test_apptags(self):
    '''
    Tests that the app tags are sent if specified so
    '''
    agentConfig = {
        'agent_key': 'test_agentkey',
        'collect_ec2_tags': False,
        'collect_orchestrator_tags': False,
        'collect_instance_metadata': False,
        'create_sd_check_tags': True,
        'version': 'test',
        'tags': '',
    }

    # Run a single checks.d check as part of the collector.
    disk_config = {
        "init_config": {},
        "instances": [{}]
    }
    checks = [load_check('disk', disk_config, agentConfig)]

    c = Collector(agentConfig, [], {}, get_hostname(agentConfig))
    payload = c.run({
        'initialized_checks': checks,
        'init_failed_checks': {}
    })

    # We check that the disk SD_CHECK_TAG is sent in the payload
    self.assertTrue('sd_check:disk' in payload['host-tags']['system'])
def init(config_path=None, use_watchdog=False, use_forwarder=False, args=None):
    """Configure the server and the reporting thread.
    """
    c = get_config(parse_args=False, cfg_path=config_path)

    if (not c['use_dogstatsd'] and
            (args and args[0] in ['start', 'restart'] or not args)):
        log.info("Dogstatsd is disabled. Exiting")
        # We're exiting purposefully, so exit with zero (supervisor's expected
        # code). HACK: Sleep a little bit so supervisor thinks we've started cleanly
        # and thus can exit cleanly.
        sleep(4)
        sys.exit(0)

    port = c['dogstatsd_port']
    interval = DOGSTATSD_FLUSH_INTERVAL
    api_key = c['api_key']
    aggregator_interval = DOGSTATSD_AGGREGATOR_BUCKET_SIZE
    non_local_traffic = c['non_local_traffic']
    forward_to_host = c.get('statsd_forward_host')
    forward_to_port = c.get('statsd_forward_port')
    event_chunk_size = c.get('event_chunk_size')
    recent_point_threshold = c.get('recent_point_threshold', None)
    server_host = c['bind_host']

    target = c['dd_url']
    if use_forwarder:
        target = c['dogstatsd_target']

    hostname = get_hostname(c)

    # Create the aggregator (which is the point of communication between the
    # server and reporting threads).
    assert 0 < interval

    aggregator = MetricsBucketAggregator(
        hostname,
        aggregator_interval,
        recent_point_threshold=recent_point_threshold,
        formatter=get_formatter(c),
        histogram_aggregates=c.get('histogram_aggregates'),
        histogram_percentiles=c.get('histogram_percentiles'),
        utf8_decoding=c['utf8_decoding']
    )

    # Start the reporting thread.
    reporter = Reporter(interval, aggregator, target, api_key, use_watchdog, event_chunk_size)

    # NOTICE: when `non_local_traffic` is passed we need to bind to any interface on the box. The forwarder uses
    # Tornado which takes care of sockets creation (more than one socket can be used at once depending on the
    # network settings), so it's enough to just pass an empty string '' to the library.
    # In Dogstatsd we use a single, fullstack socket, so passing '' as the address doesn't work and we default to
    # '0.0.0.0'. If someone needs to bind Dogstatsd to the IPv6 '::', they need to turn off `non_local_traffic` and
    # use the '::' meta address as `bind_host`.
    if non_local_traffic:
        server_host = '0.0.0.0'

    server = Server(aggregator, server_host, port,
                    forward_to_host=forward_to_host,
                    forward_to_port=forward_to_port)

    return reporter, server, c
def check(self, agentConfig):
    process_exclude_args = agentConfig.get('exclude_process_args', False)
    if process_exclude_args:
        ps_arg = 'aux'
    else:
        ps_arg = 'auxww'

    # Get output from ps
    try:
        output, _, _ = get_subprocess_output(['ps', ps_arg], self.logger)
        processLines = output.splitlines()  # Also removes a trailing empty line
        del processLines[0]  # Removes the headers
    except Exception:
        self.logger.exception('getProcesses')
        return False

    processes = []
    for line in processLines:
        line = line.split(None, 10)
        processes.append(map(lambda s: s.strip(), line))

    return {'processes': processes,
            'apiKey': agentConfig['api_key'],
            'host': get_hostname(agentConfig)}
def test_collector(self):
    agentConfig = {
        'api_key': 'test_apikey',
        'check_timings': True,
        'collect_ec2_tags': True,
        'collect_instance_metadata': False,
        'create_dd_check_tags': False,
        'version': 'test',
        'tags': '',
    }

    # Run a single checks.d check as part of the collector.
    redis_config = {
        "init_config": {},
        "instances": [{"host": "localhost", "port": 6379}]
    }
    checks = [load_check('redisdb', redis_config, agentConfig)]

    c = Collector(agentConfig, [], {}, get_hostname(agentConfig))
    payload = c.run({
        'initialized_checks': checks,
        'init_failed_checks': {}
    })
    metrics = payload['metrics']

    # Check that we got a timing metric for all checks.
    timing_metrics = [m for m in metrics
                      if m[0] == 'datadog.agent.check_run_time']
    all_tags = []
    for metric in timing_metrics:
        all_tags.extend(metric[3]['tags'])
    for check in checks:
        tag = "check:%s" % check.name
        assert tag in all_tags, all_tags
def load_check(name, config, agentConfig):
    if not _is_sdk():
        checksd_path = agentConfig.get('additional_checksd', get_checksd_path(get_os()))

        # find (in checksd_path) and load the check module
        fd, filename, desc = imp.find_module(name, [checksd_path])
        check_module = imp.load_module(name, fd, filename, desc)
    else:
        check_module = _load_sdk_module(name)  # parent module

    check_class = None
    classes = inspect.getmembers(check_module, inspect.isclass)
    for _, clsmember in classes:
        if clsmember == AgentCheck:
            continue
        if issubclass(clsmember, AgentCheck):
            check_class = clsmember
            if AgentCheck in clsmember.__bases__:
                continue
            else:
                break

    if check_class is None:
        raise Exception("Unable to import check %s. Missing a class that inherits AgentCheck" % name)

    init_config = config.get('init_config', {})
    instances = config.get('instances')
    agentConfig['checksd_hostname'] = get_hostname(agentConfig)

    # init the check class
    try:
        return check_class(name, init_config, agentConfig, instances=instances)
    except TypeError as e:
        raise Exception("Check is using old API, {0}".format(e))
    except Exception:
        raise
def __init__(self, name, init_config, agentConfig, instances=None):
    """
    Initialize a new check.

    :param name: The name of the check
    :param init_config: The config for initializing the check
    :param agentConfig: The global configuration for the agent
    :param instances: A list of configuration objects for each instance.
    """
    from aggregator import MetricsAggregator

    self._enabled_checks.append(name)
    self._enabled_checks = list(set(self._enabled_checks))

    self.name = name
    self.init_config = init_config or {}
    self.agentConfig = agentConfig
    self.in_developer_mode = agentConfig.get("developer_mode") and psutil
    self._internal_profiling_stats = None
    self.default_integration_http_timeout = float(agentConfig.get("default_integration_http_timeout", 9))

    self.hostname = agentConfig.get("checksd_hostname") or get_hostname(agentConfig)
    self.log = logging.getLogger("%s.%s" % (__name__, name))

    self.min_collection_interval = self.init_config.get(
        "min_collection_interval", self.DEFAULT_MIN_COLLECTION_INTERVAL
    )

    self.aggregator = MetricsAggregator(
        self.hostname,
        expiry_seconds=self.min_collection_interval + self.DEFAULT_EXPIRY_SECONDS,
        formatter=agent_formatter,
        recent_point_threshold=agentConfig.get("recent_point_threshold", None),
        histogram_aggregates=agentConfig.get("histogram_aggregates"),
        histogram_percentiles=agentConfig.get("histogram_percentiles"),
    )

    self.events = []
    self.service_checks = []
    self.instances = instances or []
    self.warnings = []
    self.library_versions = None
    self.last_collection_time = defaultdict(int)
    self._instance_metadata = []
    self.svc_metadata = []
    self.historate_dict = {}

    # Set proxy settings
    self.proxy_settings = get_proxy(self.agentConfig)
    self._use_proxy = False if init_config is None else init_config.get("use_agent_proxy", True)
    self.proxies = {"http": None, "https": None}
    if self.proxy_settings and self._use_proxy:
        uri = "{host}:{port}".format(host=self.proxy_settings["host"], port=self.proxy_settings["port"])
        if self.proxy_settings["user"] and self.proxy_settings["password"]:
            uri = "{user}:{password}@{uri}".format(
                user=self.proxy_settings["user"], password=self.proxy_settings["password"], uri=uri
            )
        self.proxies["http"] = "http://{uri}".format(uri=uri)
        self.proxies["https"] = "https://{uri}".format(uri=uri)
def _postMetrics(self):
    if len(self._metrics) > 0:
        self._metrics['uuid'] = get_uuid()
        self._metrics['internalHostname'] = get_hostname(self._agentConfig)
        self._metrics['apiKey'] = self._agentConfig['api_key']
        MetricTransaction(json.dumps(self._metrics),
                          headers={'Content-Type': 'application/json'})
        self._metrics = {}
def sd_configcheck(agentConfig):
    if agentConfig.get('service_discovery', False):
        # set the TRACE_CONFIG flag to True to make load_check_directory return
        # the source of config objects.
        # Then call load_check_directory here and pass the result to get_sd_configcheck
        # to avoid circular imports
        agentConfig[TRACE_CONFIG] = True
        configs = {
            # check_name: (config_source, config)
        }
        print("\nLoading check configurations...\n\n")
        configs = load_check_directory(agentConfig, get_hostname(agentConfig))
        get_sd_configcheck(agentConfig, configs)
def __init__(self, cmdline=False, case_id=None):
    self._case_id = case_id
    self._cmdline = cmdline
    self._init_tarfile()
    self._init_permissions_file()
    self._save_logs_path()
    self._config = get_config()
    self._api_key = self._config.get('api_key')
    self._url = "{0}{1}".format(
        get_url_endpoint(self._config.get('dd_url'), endpoint_type='flare'),
        self.DATADOG_SUPPORT_URL
    )
    self._hostname = get_hostname(self._config)
    self._prefix = "datadog-{0}".format(self._hostname)
def __init__(self, interval, metrics_aggregator, api_host, api_key=None,
             use_watchdog=False, event_chunk_size=None):
    threading.Thread.__init__(self)
    self.interval = int(interval)
    self.finished = threading.Event()
    self.metrics_aggregator = metrics_aggregator
    self.flush_count = 0
    self.log_count = 0
    self.hostname = get_hostname()

    self.watchdog = None
    if use_watchdog:
        self.watchdog = Watchdog.create(WATCHDOG_TIMEOUT)

    self.api_key = api_key
    self.api_host = api_host
    self.event_chunk_size = event_chunk_size or EVENT_CHUNK_SIZE
def test_check(self):
    self.run_check_twice(self.config)

    shared_tag = ['instance_url:http://localhost:3835/stats']

    self._test_frontend_metrics(shared_tag)
    self._test_backend_metrics(shared_tag)

    # check was run 2 times
    # - FRONTEND is reporting OPEN that we ignore
    # - only the BACKEND aggregate is reporting UP -> OK
    # - The 3 individual servers are returning no check -> UNKNOWN
    self._test_service_checks()

    # Make sure the service checks aren't tagged with an empty hostname.
    self.assertEquals(self.service_checks[0]['host_name'], get_hostname())

    self.coverage_report()
def submit_events(self, events):
    headers = {'Content-Type': 'application/json'}
    event_chunk_size = self.event_chunk_size

    for chunk in chunks(events, event_chunk_size):
        payload = {
            'apiKey': self.api_key,
            'events': {
                'api': chunk
            },
            'uuid': get_uuid(),
            'internalHostname': get_hostname()
        }
        params = {}
        if self.api_key:
            params['api_key'] = self.api_key
        url = '%s/intake?%s' % (self.api_host, urlencode(params))

        self.submit_http(url, json.dumps(payload), headers)
def test_check(self):
    self.run_check_twice(self.config)

    shared_tag = ['instance_url:http://localhost:3835/stats']

    self._test_frontend_metrics(shared_tag)
    self._test_backend_metrics(shared_tag)

    # check was run 2 times
    # - FRONTEND is reporting OPEN that we ignore
    # - only the BACKEND aggregate is reporting UP -> OK
    # - The 3 individual servers are returning no check -> UNKNOWN
    self._test_service_checks()

    # Make sure the service checks aren't tagged with an empty hostname.
    self.assertEquals(self.service_checks[0]['host_name'], get_hostname(config=self.config))

    self.coverage_report()
def reload_configs(self, checks_to_reload=set()):
    """Reload the agent configuration and checksd configurations.
    Can also reload only an explicit set of checks."""
    log.info("Attempting a configuration reload...")
    hostname = get_hostname(self._agentConfig)
    jmx_sd_configs = None

    # if no check was given, reload them all
    if not checks_to_reload:
        log.debug("No check list was passed, reloading every check")
        # stop checks
        for check in self._checksd.get('initialized_checks', []):
            check.stop()
        self._checksd = load_check_directory(self._agentConfig, hostname)
        if self._jmx_service_discovery_enabled:
            jmx_sd_configs = generate_jmx_configs(self._agentConfig, hostname)
    else:
        new_checksd = copy(self._checksd)
        jmx_checks = [check for check in checks_to_reload if check in JMX_CHECKS]
        py_checks = set(checks_to_reload) - set(jmx_checks)
        self.refresh_specific_checks(hostname, new_checksd, py_checks)
        if self._jmx_service_discovery_enabled:
            jmx_sd_configs = generate_jmx_configs(self._agentConfig, hostname, jmx_checks)

        # once the reload is done, replace existing checks with the new ones
        self._checksd = new_checksd

    if jmx_sd_configs:
        self._submit_jmx_service_discovery(jmx_sd_configs)

    # Logging
    num_checks = len(self._checksd['initialized_checks'])
    if num_checks > 0:
        opt_msg = " (refreshed %s checks)" % len(checks_to_reload) if checks_to_reload else ''
        msg = "Check reload was successful. Running {num_checks} checks{opt_msg}.".format(
            num_checks=num_checks, opt_msg=opt_msg)
        log.info(msg)
    else:
        log.info("No checksd configs found")
def collection(self):
    while not self._event.is_set():
        try:
            current_ts = time.monotonic()
            if self._meta_ts is None or \
                    (current_ts - self._meta_ts) >= self._config.get('host_metadata_interval'):
                metadata = get_metadata(get_hostname(), AGENT_VERSION,
                                        start_event=(self._meta_ts is None))
                self._serializer.submit_metadata(metadata)
                self._meta_ts = current_ts

            self._collector.run_checks()
            self._serializer.serialize_and_push()
        except Exception:
            log.exception("Unexpected error in last collection run")

        time.sleep(self._config.get('min_collection_interval'))
def load_check(name, config, agentConfig):
    if not _is_sdk():
        checksd_path = get_checksd_path(get_os())

        # find (in checksd_path) and load the check module
        fd, filename, desc = imp.find_module(name, [checksd_path])
        check_module = imp.load_module(name, fd, filename, desc)
    else:
        check_module = __import__("check")

    check_class = None
    classes = inspect.getmembers(check_module, inspect.isclass)
    for _, clsmember in classes:
        if clsmember == AgentCheck:
            continue
        if issubclass(clsmember, AgentCheck):
            check_class = clsmember
            if AgentCheck in clsmember.__bases__:
                continue
            else:
                break

    if check_class is None:
        raise Exception(
            "Unable to import check %s. Missing a class that inherits AgentCheck" % name)

    init_config = config.get('init_config', {})
    instances = config.get('instances')
    agentConfig['checksd_hostname'] = get_hostname(agentConfig)

    # init the check class
    try:
        return check_class(name, init_config=init_config,
                           agentConfig=agentConfig, instances=instances)
    except TypeError as e:
        raise Exception("Check is using old API, {0}".format(e))
    except Exception:
        raise
def start_graphite_listener(port):
    echo_server = GraphiteServer(None, get_hostname(None))
    echo_server.listen(port)
    IOLoop.instance().start()
def main():
    options, args = get_parsed_args()
    agentConfig = get_config(options=options)
    autorestart = agentConfig.get('autorestart', False)
    hostname = get_hostname(agentConfig)
    in_developer_mode = agentConfig.get('developer_mode')

    COMMANDS_AGENT = [
        'start',
        'stop',
        'restart',
        'status',
        'foreground',
    ]

    COMMANDS_NO_AGENT = [
        'info',
        'check',
        'configcheck',
        'jmx',
        'flare',
    ]

    COMMANDS = COMMANDS_AGENT + COMMANDS_NO_AGENT

    if len(args) < 1:
        sys.stderr.write("Usage: %s %s\n" % (sys.argv[0], "|".join(COMMANDS)))
        return 2

    command = args[0]
    if command not in COMMANDS:
        sys.stderr.write("Unknown command: %s\n" % command)
        return 3

    # TODO: actually kill the start/stop/restart/status command for 5.11
    if command in ['start', 'stop', 'restart', 'status'] and not in_developer_mode:
        logging.error('Please use supervisor to manage the agent')
        return 1

    if command in COMMANDS_AGENT:
        agent = Agent(PidFile(PID_NAME, PID_DIR).get_path(), autorestart,
                      in_developer_mode=in_developer_mode)

    if 'start' == command:
        log.info('Start daemon')
        agent.start()

    elif 'stop' == command:
        log.info('Stop daemon')
        agent.stop()

    elif 'restart' == command:
        log.info('Restart daemon')
        agent.restart()

    elif 'status' == command:
        agent.status()

    elif 'info' == command:
        return Agent.info(verbose=options.verbose)

    elif 'foreground' == command:
        log.info('Agent version %s' % get_version())
        if autorestart:
            # Set-up the supervisor callbacks and fork it.
            logging.info('Running Agent with auto-restart ON')

            def child_func():
                agent.start(foreground=True)

            def parent_func():
                agent.start_event = False

            AgentSupervisor.start(parent_func, child_func)
        else:
            # Run in the standard foreground.
            agent.start(foreground=True)

    elif 'check' == command:
        if len(args) < 2:
            sys.stderr.write(
                "Usage: %s check <check_name> [check_rate]\n"
                "Add check_rate as last argument to compute rates\n"
                % sys.argv[0]
            )
            return 1

        check_name = args[1]
        try:
            import checks.collector
            # Try the old-style check first
            print getattr(checks.collector, check_name)(log).check(agentConfig)
        except Exception:
            # If not an old-style check, try checks.d
            checks = load_check_directory(agentConfig, hostname)
            for check in checks['initialized_checks']:
                if check.name == check_name:
                    if in_developer_mode:
                        check.run = AgentProfiler.wrap_profiling(check.run)

                    cs = Collector.run_single_check(check, verbose=True)
                    print CollectorStatus.render_check_status(cs)

                    if len(args) == 3 and args[2] == 'check_rate':
                        print "Running 2nd iteration to capture rate metrics"
                        time.sleep(1)
                        cs = Collector.run_single_check(check, verbose=True)
                        print CollectorStatus.render_check_status(cs)

                    check.stop()

    elif 'configcheck' == command or 'configtest' == command:
        configcheck()
        sd_configcheck(agentConfig)

    elif 'jmx' == command:
        jmx_command(args[1:], agentConfig)

    elif 'flare' == command:
        Flare.check_user_rights()
        case_id = int(args[1]) if len(args) > 1 else None
        f = Flare(True, case_id)
        f.collect()
        try:
            f.upload()
        except Exception as e:
            print 'The upload failed:\n{0}'.format(str(e))

    return 0
def run(self, config=None):
    """Main loop of the collector"""

    # Gracefully exit on sigterm.
    signal.signal(signal.SIGTERM, self._handle_sigterm)

    # A SIGUSR1 signals an exit with an autorestart
    signal.signal(signal.SIGUSR1, self._handle_sigusr1)

    # Handle Keyboard Interrupt
    signal.signal(signal.SIGINT, self._handle_sigterm)

    # A SIGHUP signals a configuration reload
    signal.signal(signal.SIGHUP, self._handle_sighup)

    # Save the agent start-up stats.
    CollectorStatus().persist()

    # Initialize the collector.
    if not config:
        config = get_config(parse_args=True)

    self._agentConfig = self._set_agent_config_hostname(config)
    hostname = get_hostname(self._agentConfig)
    systemStats = get_system_stats(
        proc_path=self._agentConfig.get('procfs_path', '/proc').rstrip('/')
    )
    emitters = self._get_emitters()

    # Initialize service discovery
    if self._agentConfig.get('service_discovery'):
        self.sd_backend = get_sd_backend(self._agentConfig)

    if _is_affirmative(self._agentConfig.get('sd_jmx_enable')):
        pipe_path = get_jmx_pipe_path()
        if Platform.is_windows():
            pipe_name = pipe_path.format(pipename=SD_PIPE_NAME)
        else:
            pipe_name = os.path.join(pipe_path, SD_PIPE_NAME)

        if os.access(pipe_path, os.W_OK):
            if not os.path.exists(pipe_name):
                os.mkfifo(pipe_name)
            self.sd_pipe = os.open(pipe_name, os.O_RDWR)  # RW to avoid blocking (will only W)

            # Initialize Supervisor proxy
            self.supervisor_proxy = self._get_supervisor_socket(self._agentConfig)
        else:
            log.debug('Unable to create pipe in temporary directory. JMX service discovery disabled.')

    # Load the checks.d checks
    self._checksd = load_check_directory(self._agentConfig, hostname)

    # Load JMX configs if available
    if self._jmx_service_discovery_enabled:
        jmx_sd_configs = generate_jmx_configs(self._agentConfig, hostname)
        if jmx_sd_configs:
            self._submit_jmx_service_discovery(jmx_sd_configs)

    # Initialize the Collector
    self.collector = Collector(self._agentConfig, emitters, systemStats, hostname)

    # In developer mode, the number of runs to be included in a single collector profile
    try:
        self.collector_profile_interval = int(
            self._agentConfig.get('collector_profile_interval', DEFAULT_COLLECTOR_PROFILE_INTERVAL))
    except ValueError:
        log.warn('collector_profile_interval is invalid. '
                 'Using default value instead (%s).' % DEFAULT_COLLECTOR_PROFILE_INTERVAL)
        self.collector_profile_interval = DEFAULT_COLLECTOR_PROFILE_INTERVAL

    # Configure the watchdog.
    self.check_frequency = int(self._agentConfig['check_freq'])
    watchdog = self._get_watchdog(self.check_frequency)

    # Initialize the auto-restarter
    self.restart_interval = int(self._agentConfig.get('restart_interval', RESTART_INTERVAL))
    self.agent_start = time.time()

    self.allow_profiling = self._agentConfig.get('allow_profiling', True)

    profiled = False
    collector_profiled_runs = 0

    # Run the main loop.
    while self.run_forever:
        # Setup profiling if necessary
        if self.allow_profiling and self.in_developer_mode and not profiled:
            try:
                profiler = AgentProfiler()
                profiler.enable_profiling()
                profiled = True
            except Exception as e:
                log.warn("Cannot enable profiler: %s" % str(e))

        if self.reload_configs_flag:
            if isinstance(self.reload_configs_flag, set):
                self.reload_configs(checks_to_reload=self.reload_configs_flag)
            else:
                self.reload_configs()

        # Do the work. Pass `configs_reloaded` to let the collector know if it needs to
        # look for the AgentMetrics check and pop it out.
        self.collector.run(checksd=self._checksd,
                           start_event=self.start_event,
                           configs_reloaded=True if self.reload_configs_flag else False)

        self.reload_configs_flag = False

        # Look for change in the config template store.
        # The self.sd_backend.reload_check_configs flag is set
        # to True if a config reload is needed.
        if self._agentConfig.get('service_discovery') and self.sd_backend and \
                not self.sd_backend.reload_check_configs:
            try:
                self.sd_backend.reload_check_configs = get_config_store(
                    self._agentConfig).crawl_config_template()
            except Exception as e:
                log.warn('Something went wrong while looking for config template changes: %s' % str(e))

        # Check if we should run service discovery
        # The `reload_check_configs` flag can be set through the docker_daemon check or
        # using ConfigStore.crawl_config_template
        if self._agentConfig.get('service_discovery') and self.sd_backend and \
                self.sd_backend.reload_check_configs:
            self.reload_configs_flag = self.sd_backend.reload_check_configs
            self.sd_backend.reload_check_configs = False

        if profiled:
            if collector_profiled_runs >= self.collector_profile_interval:
                try:
                    profiler.disable_profiling()
                    profiled = False
                    collector_profiled_runs = 0
                except Exception as e:
                    log.warn("Cannot disable profiler: %s" % str(e))

        # Check if we should restart.
        if self.autorestart and self._should_restart():
            self._do_restart()

        # Only plan for next loop if we will continue, otherwise exit quickly.
        if self.run_forever:
            if watchdog:
                watchdog.reset()
            if profiled:
                collector_profiled_runs += 1
            log.debug("Sleeping for {0} seconds".format(self.check_frequency))
            time.sleep(self.check_frequency)

    # Now clean-up.
    try:
        CollectorStatus.remove_latest_status()
    except Exception:
        pass

    # Explicitly kill the process, because it might be running as a daemon.
    log.info("Exiting. Bye bye.")
    sys.exit(0)
def run(self):
    try:
        hostname = get_hostname()
    except HostnameException as e:
        logging.critical(
            "{} - You can define one in datadog.yaml or in your hosts file".format(e))
        sys.exit(1)

    logging.info("Starting the agent, hostname: %s", hostname)

    # init Forwarder
    logging.info("Starting the Forwarder")
    api_key = config.get('api_key')
    dd_url = config.get('dd_url')
    if not dd_url:
        logging.error('No Datadog URL configured - cannot continue')
        sys.exit(1)
    if not api_key:
        logging.error('No API key configured - cannot continue')
        sys.exit(1)

    # get proxy settings
    proxies = get_proxy()
    logging.debug('Proxy configuration used: %s', proxies)

    # get site url
    forwarder = Forwarder(
        api_key,
        get_site_url(dd_url, site=config.get('site')),
        proxies=proxies,
    )
    forwarder.start()

    # agent aggregator
    aggregator = MetricsAggregator(
        hostname,
        interval=config.get('aggregator_interval'),
        expiry_seconds=(config.get('min_collection_interval') +
                        config.get('aggregator_expiry_seconds')),
        recent_point_threshold=config.get('recent_point_threshold'),
        histogram_aggregates=config.get('histogram_aggregates'),
        histogram_percentiles=config.get('histogram_percentiles'),
    )

    # serializer
    serializer = Serializer(
        aggregator,
        forwarder,
    )

    # instantiate collector
    collector = Collector(config, aggregator)
    collector.load_check_classes()
    collector.instantiate_checks()

    # instantiate AgentRunner
    runner = AgentRunner(collector, serializer, config)

    # instantiate Dogstatsd
    reporter = None
    dsd_server = None
    dsd_enable = config['dogstatsd'].get('enable', False)
    if dsd_enable:
        reporter, dsd_server, _ = init_dogstatsd(config, forwarder=forwarder)
        dsd = DogstatsdRunner(dsd_server)

    # instantiate API
    status = {
        'agent': aggregator.stats,
        'forwarder': forwarder.stats,
        'collector': collector.status,
    }
    if dsd_server:
        status['dogstatsd'] = dsd_server.aggregator.stats
    api = APIServer(config, status=status)

    handler = SignalHandler()
    # components
    handler.register('runner', runner)
    handler.register('forwarder', forwarder)
    handler.register('api', api)
    if dsd_enable:
        handler.register('reporter', reporter)
        handler.register('dsd_server', dsd_server)
    # signals
    handler.handle(signal.SIGTERM)
    handler.handle(signal.SIGINT)
    # start signal handler
    handler.start()

    runner.start()
    api.start()

    if dsd_enable:
        reporter.start()
        dsd.start()

        dsd.join()
        logging.info("Dogstatsd server done...")
        try:
            dsd.raise_for_status()
        except Exception as e:
            log.error("There was a problem with the dogstatsd server: %s", e)
        reporter.stop()

    runner.join()
    logging.info("Collector done...")

    api.join()
    logging.info("API done...")

    handler.stop()
    handler.join()
    logging.info("Signal handler done...")

    logging.info("Thank you for shopping at DataDog! Come back soon!")

    sys.exit(0)
def run(self):
    handlers = [
        (r"/intake/?", AgentInputHandler),
        (r"/intake/metrics?", MetricsAgentInputHandler),
        (r"/intake/metadata?", MetadataAgentInputHandler),
        (r"/api/v1/series/?", ApiInputHandler),
        (r"/api/v1/check_run/?", ApiCheckRunHandler),
        (r"/status/?", StatusHandler),
    ]

    settings = dict(
        cookie_secret="12oETzKXQAGaYdkL5gEmGeJJFuYh7EQnp2XdTP1o/Vo=",
        xsrf_cookies=False,
        debug=False,
        log_function=self.log_request
    )

    non_local_traffic = self._agentConfig.get("non_local_traffic", False)

    tornado.web.Application.__init__(self, handlers, **settings)
    http_server = tornado.httpserver.HTTPServer(self)

    try:
        # non_local_traffic must be == True to match, not just some non-false value
        if non_local_traffic is True:
            http_server.listen(self._port)
        else:
            # localhost in lieu of 127.0.0.1 to support IPv6
            try:
                http_server.listen(self._port, address=self._agentConfig['bind_host'])
            except gaierror:
                log.warning("localhost seems undefined in your host file, using 127.0.0.1 instead")
                http_server.listen(self._port, address="127.0.0.1")
            except socket_error as e:
                if "Errno 99" in str(e):
                    log.warning("IPv6 doesn't seem to be fully supported. Falling back to IPv4")
                    http_server.listen(self._port, address="127.0.0.1")
                else:
                    raise
    except socket_error as e:
        log.exception("Socket error %s. Is another application listening on the same port ? Exiting", e)
        sys.exit(1)
    except Exception as e:
        log.exception("Uncaught exception. Forwarder is exiting.")
        sys.exit(1)

    log.info("Listening on port %d" % self._port)

    # Register callbacks
    self.mloop = tornado.ioloop.IOLoop.current()

    logging.getLogger().setLevel(get_logging_config()['log_level'] or logging.INFO)

    def flush_trs():
        if self._watchdog:
            self._watchdog.reset()
        self._postMetrics()
        self._tr_manager.flush()

    tr_sched = tornado.ioloop.PeriodicCallback(flush_trs,
                                               TRANSACTION_FLUSH_INTERVAL,
                                               io_loop=self.mloop)

    # Register optional Graphite listener
    gport = self._agentConfig.get("graphite_listen_port", None)
    if gport is not None:
        log.info("Starting graphite listener on port %s" % gport)
        from graphite import GraphiteServer
        gs = GraphiteServer(self, get_hostname(self._agentConfig), io_loop=self.mloop)
        if non_local_traffic is True:
            gs.listen(gport)
        else:
            gs.listen(gport, address="localhost")

    # Start everything
    if self._watchdog:
        self._watchdog.reset()
    tr_sched.start()

    self.mloop.start()
    log.info("Stopped")
def test_get_hostname_bin_nonrfc(subprocess):
    # this would fail validation if specified manually
    # but since it's collected from the OS we let it fly
    assert not is_valid_hostname("subprocess_hostname")
    assert get_hostname() == "subprocess_hostname"
def test_get_hostname_bin(subprocess):
    assert get_hostname() == "subprocess-hostname"
def test_get_hostname_error(subprocess, socket):
    with pytest.raises(Exception) as err:
        get_hostname()
    assert "Unable to reliably determine hostname or hostname not RFC1123 compliant." in str(err)
def test_get_hostname_error(subprocess, socket):
    with pytest.raises(Exception) as err:
        get_hostname()
    assert "Unable to reliably determine host name. You can define one in datadog.conf or in your hosts file" in str(err)
def run(self):
    try:
        hostname = get_hostname()
    except HostnameException as e:
        logging.critical(
            "{} - You can define one in datadog.yaml or in your hosts file".format(e))
        sys.exit(1)

    logging.info("Starting the agent, hostname: %s", hostname)

    # init Forwarder
    logging.info("Starting the Forwarder")
    api_key = config.get('api_key')
    dd_url = config.get('dd_url')
    if not dd_url:
        logging.error('No Datadog URL configured - cannot continue')
        sys.exit(1)
    if not api_key:
        logging.error('No API key configured - cannot continue')
        sys.exit(1)

    # get proxy settings
    proxies = get_proxy()
    logging.debug('Proxy configuration used: %s', proxies)

    forwarder = Forwarder(
        api_key,
        dd_url,
        proxies=proxies,
    )
    forwarder.start()

    # aggregator
    aggregator = MetricsAggregator(
        hostname,
        interval=config.get('aggregator_interval'),
        expiry_seconds=(config.get('min_collection_interval') +
                        config.get('aggregator_expiry_seconds')),
        recent_point_threshold=config.get('recent_point_threshold'),
        histogram_aggregates=config.get('histogram_aggregates'),
        histogram_percentiles=config.get('histogram_percentiles'),
    )

    # serializer
    serializer = Serializer(
        aggregator,
        forwarder,
    )

    # instantiate collector
    collector = Collector(config, aggregator)
    collector.load_check_classes()
    collector.instantiate_checks()

    # instantiate AgentRunner
    runner = AgentRunner(collector, serializer, config)

    # instantiate API
    api = APIServer(config, aggregator.stats)

    handler = SignalHandler()
    # components
    handler.register('runner', runner)
    handler.register('forwarder', forwarder)
    handler.register('api', api)
    # signals
    handler.handle(signal.SIGTERM)
    handler.handle(signal.SIGINT)
    # start signal handler
    handler.start()

    runner.start()
    api.start()

    runner.join()
    logging.info("Agent done...")

    api.join()
    logging.info("API done...")

    handler.stop()
    handler.join()
    logging.info("Signal handler done...")

    logging.info("Thank you for shopping at DataDog! Come back soon!")

    sys.exit(0)
def start():
    """
    Dummy start until we have a collector
    """
    init_agent()

    hostname = get_hostname()

    logging.info("Starting the agent, hostname: %s", hostname)

    # init Forwarder
    logging.info("Starting the Forwarder")
    api_key = config.get('api_key')
    dd_url = config.get('dd_url')
    if not dd_url:
        logging.error('No Datadog URL configured - cannot continue')
        sys.exit(1)
    if not api_key:
        logging.error('No API key configured - cannot continue')
        sys.exit(1)

    forwarder = Forwarder(api_key, dd_url)
    forwarder.start()

    # aggregator
    aggregator = MetricsAggregator(
        hostname,
        interval=config.get('aggregator_interval'),
        expiry_seconds=(config.get('min_collection_interval') +
                        config.get('aggregator_expiry_seconds')),
        recent_point_threshold=config.get('recent_point_threshold'),
        histogram_aggregates=config.get('histogram_aggregates'),
        histogram_percentiles=config.get('histogram_percentiles'),
    )

    # serializer
    serializer = Serializer(
        aggregator,
        forwarder,
    )

    # instantiate collector
    collector = Collector(config, aggregator)
    collector.load_check_classes()
    collector.instantiate_checks()

    def signal_handler(signal, frame):
        logging.info("SIGINT received: stopping the agent")
        logging.info("Stopping the forwarder")
        forwarder.stop()
        logging.info("See you !")
        sys.exit(0)

    signal.signal(signal.SIGINT, signal_handler)

    # update the metadata periodically?
    metadata = get_metadata(hostname)
    serializer.submit_metadata(metadata)

    while True:
        collector.run_checks()
        serializer.serialize_and_push()
        time.sleep(config.get('min_collection_interval'))
def run(self, config=None):
    """Main loop of the collector"""

    # Gracefully exit on sigterm.
    signal.signal(signal.SIGTERM, self._handle_sigterm)

    if not Platform.is_windows():
        # A SIGUSR1 signals an exit with an autorestart
        signal.signal(signal.SIGUSR1, self._handle_sigusr1)

        # Handle Keyboard Interrupt
        signal.signal(signal.SIGINT, self._handle_sigterm)

        # A SIGHUP signals a configuration reload
        signal.signal(signal.SIGHUP, self._handle_sighup)

    # Save the agent start-up stats.
    CollectorStatus().persist()

    # Initialize the collector.
    if not config:
        config = get_config(parse_args=True)

    self._agentConfig = self._set_agent_config_hostname(config)
    hostname = get_hostname(self._agentConfig)
    systemStats = get_system_stats(
        proc_path=self._agentConfig.get('procfs_path', '/proc').rstrip('/'))
    emitters = self._get_emitters()

    # Initialize service discovery
    if self._agentConfig.get('service_discovery'):
        self.sd_backend = get_sd_backend(self._agentConfig)

    if _is_affirmative(self._agentConfig.get('sd_jmx_enable', False)):
        pipe_path = get_jmx_pipe_path()
        if Platform.is_windows():
            pipe_name = pipe_path.format(pipename=SD_PIPE_NAME)
        else:
            pipe_name = os.path.join(pipe_path, SD_PIPE_NAME)

        if os.access(pipe_path, os.W_OK):
            if not os.path.exists(pipe_name):
                os.mkfifo(pipe_name)
            self.sd_pipe = os.open(pipe_name, os.O_RDWR)  # RW to avoid blocking (will only W)

            # Initialize Supervisor proxy
            self.supervisor_proxy = self._get_supervisor_socket(self._agentConfig)
        else:
            log.debug('Unable to create pipe in temporary directory. JMX service discovery disabled.')

    # Load the checks.d checks
    self._checksd = load_check_directory(self._agentConfig, hostname)

    # Load JMX configs if available
    if self._jmx_service_discovery_enabled:
        jmx_sd_configs = generate_jmx_configs(self._agentConfig, hostname)
        if jmx_sd_configs:
            self._submit_jmx_service_discovery(jmx_sd_configs)

    # Initialize the Collector
    self.collector = Collector(self._agentConfig, emitters, systemStats, hostname)

    # In developer mode, the number of runs to be included in a single collector profile
    try:
        self.collector_profile_interval = int(
            self._agentConfig.get('collector_profile_interval', DEFAULT_COLLECTOR_PROFILE_INTERVAL))
    except ValueError:
        log.warn('collector_profile_interval is invalid. '
                 'Using default value instead (%s).' % DEFAULT_COLLECTOR_PROFILE_INTERVAL)
        self.collector_profile_interval = DEFAULT_COLLECTOR_PROFILE_INTERVAL

    # Configure the watchdog.
    self.check_frequency = int(self._agentConfig['check_freq'])
    watchdog = self._get_watchdog(self.check_frequency)

    # Initialize the auto-restarter
    self.restart_interval = int(self._agentConfig.get('restart_interval', RESTART_INTERVAL))
    self.agent_start = time.time()

    self.allow_profiling = self._agentConfig.get('allow_profiling', True)

    profiled = False
    collector_profiled_runs = 0

    # Run the main loop.
    while self.run_forever:
        # Setup profiling if necessary
        if self.allow_profiling and self.in_developer_mode and not profiled:
            try:
                profiler = AgentProfiler()
                profiler.enable_profiling()
                profiled = True
            except Exception as e:
                log.warn("Cannot enable profiler: %s" % str(e))

        if self.reload_configs_flag:
            if isinstance(self.reload_configs_flag, set):
                self.reload_configs(checks_to_reload=self.reload_configs_flag)
            else:
                self.reload_configs()

        # Do the work. Pass `configs_reloaded` to let the collector know if it needs to
        # look for the AgentMetrics check and pop it out.
        self.collector.run(
            checksd=self._checksd,
            start_event=self.start_event,
            configs_reloaded=True if self.reload_configs_flag else False)

        self.reload_configs_flag = False

        # Look for change in the config template store.
        # The self.sd_backend.reload_check_configs flag is set
        # to True if a config reload is needed.
        if self._agentConfig.get('service_discovery') and self.sd_backend and \
                not self.sd_backend.reload_check_configs:
            try:
                self.sd_backend.reload_check_configs = get_config_store(
                    self._agentConfig).crawl_config_template()
            except Exception as e:
                log.warn('Something went wrong while looking for config template changes: %s' % str(e))

        # Check if we should run service discovery
        # The `reload_check_configs` flag can be set through the docker_daemon check or
        # using ConfigStore.crawl_config_template
        if self._agentConfig.get('service_discovery') and self.sd_backend and \
                self.sd_backend.reload_check_configs:
            self.reload_configs_flag = self.sd_backend.reload_check_configs
            self.sd_backend.reload_check_configs = False

        if profiled:
            if collector_profiled_runs >= self.collector_profile_interval:
                try:
                    profiler.disable_profiling()
                    profiled = False
                    collector_profiled_runs = 0
                except Exception as e:
                    log.warn("Cannot disable profiler: %s" % str(e))

        # Check if we should restart.
        if self.autorestart and self._should_restart():
            self._do_restart()

        # Only plan for next loop if we will continue, otherwise exit quickly.
        if self.run_forever:
            if watchdog:
                watchdog.reset()
            if profiled:
                collector_profiled_runs += 1
            log.debug("Sleeping for {0} seconds".format(self.check_frequency))
            time.sleep(self.check_frequency)

    # Now clean-up.
    try:
        CollectorStatus.remove_latest_status()
    except Exception:
        pass

    # Explicitly kill the process, because it might be running as a daemon.
    log.info("Exiting. Bye bye.")
    sys.exit(0)
def run(self):
    try:
        hostname = get_hostname()
    except HostnameException as e:
        logging.critical(
            "{} - You can define one in datadog.yaml or in your hosts file".format(e))
        sys.exit(1)

    logging.info("Starting the agent, hostname: %s", hostname)

    # init Forwarder
    logging.info("Starting the Forwarder")
    api_key = config.get('api_key')
    dd_url = config.get('dd_url')
    if not dd_url:
        logging.error('No Datadog URL configured - cannot continue')
        sys.exit(1)
    if not api_key:
        logging.error('No API key configured - cannot continue')
        sys.exit(1)

    # get proxy settings
    proxies = get_proxy()
    logging.debug('Proxy configuration used: %s', proxies)

    forwarder = Forwarder(
        api_key,
        dd_url,
        proxies=proxies,
    )
    forwarder.start()

    # aggregator
    aggregator = MetricsAggregator(
        hostname,
        interval=config.get('aggregator_interval'),
        expiry_seconds=(config.get('min_collection_interval') +
                        config.get('aggregator_expiry_seconds')),
        recent_point_threshold=config.get('recent_point_threshold'),
        histogram_aggregates=config.get('histogram_aggregates'),
        histogram_percentiles=config.get('histogram_percentiles'),
    )

    # serializer
    serializer = Serializer(
        aggregator,
        forwarder,
    )

    # instantiate collector
    collector = Collector(config, aggregator)
    collector.load_check_classes()
    collector.instantiate_checks()

    # instantiate AgentRunner
    runner = AgentRunner(collector, serializer, config)

    # instantiate API
    api = APIServer(8888, aggregator.stats)

    def signal_handler(signal, frame):
        log.info("SIGINT received: stopping the agent")
        log.info("Stopping the forwarder")
        runner.stop()
        forwarder.stop()
        api.stop()
        log.info("See you !")
        sys.exit(0)

    signal.signal(signal.SIGINT, signal_handler)

    runner.start()
    api.run()  # blocking tornado in main thread
def test_get_hostname_conf():
    config.set("hostname", "test-hostname")
    assert get_hostname() == "test-hostname"
    config.reset("hostname")
def init(config_path=None, use_watchdog=False, use_forwarder=False, args=None):
    """Configure the server and the reporting thread.
    """
    c = get_config(parse_args=False, cfg_path=config_path)

    if (not c['use_dogstatsd'] and
            (args and args[0] in ['start', 'restart'] or not args)):
        log.info("StsStatsd is disabled. Exiting")
        # We're exiting purposefully, so exit with zero (supervisor's expected
        # code). HACK: Sleep a little bit so supervisor thinks we've started cleanly
        # and thus can exit cleanly.
        sleep(4)
        sys.exit(0)

    port = c['dogstatsd_port']
    interval = DOGSTATSD_FLUSH_INTERVAL
    api_key = c['api_key']
    aggregator_interval = DOGSTATSD_AGGREGATOR_BUCKET_SIZE
    non_local_traffic = c['non_local_traffic']
    forward_to_host = c.get('statsd_forward_host')
    forward_to_port = c.get('statsd_forward_port')
    event_chunk_size = c.get('event_chunk_size')
    recent_point_threshold = c.get('recent_point_threshold', None)
    server_host = c['bind_host']

    target = c['dd_url']
    if use_forwarder:
        target = c['dogstatsd_target']

    hostname = get_hostname(c)

    # Create the aggregator (which is the point of communication between the
    # server and reporting threads).
    assert 0 < interval

    aggregator = MetricsBucketAggregator(
        hostname,
        aggregator_interval,
        recent_point_threshold=recent_point_threshold,
        formatter=get_formatter(c),
        histogram_aggregates=c.get('histogram_aggregates'),
        histogram_percentiles=c.get('histogram_percentiles'),
        utf8_decoding=c['utf8_decoding'])

    # Start the reporting thread.
    reporter = Reporter(interval, aggregator, target, api_key, use_watchdog, event_chunk_size)

    # NOTICE: when `non_local_traffic` is passed we need to bind to any interface on the box. The forwarder uses
    # Tornado which takes care of sockets creation (more than one socket can be used at once depending on the
    # network settings), so it's enough to just pass an empty string '' to the library.
    # In Dogstatsd we use a single, fullstack socket, so passing '' as the address doesn't work and we default to
    # '0.0.0.0'. If someone needs to bind Dogstatsd to the IPv6 '::', they need to turn off `non_local_traffic` and
    # use the '::' meta address as `bind_host`.
    if non_local_traffic:
        server_host = '0.0.0.0'

    server = Server(aggregator, server_host, port,
                    forward_to_host=forward_to_host,
                    forward_to_port=forward_to_port)

    return reporter, server, c
def test_get_hostname_socket(subprocess, socket):
    assert get_hostname() == "socket-hostname"
def __init__(self, aggregator, forwarder):
    self._aggregator = aggregator
    self._forwarder = forwarder
    self._internal_hostname = get_hostname()
def init_dogstatsd(config, forwarder=None):
    api_key = config['api_key']
    recent_point_threshold = config.get('recent_point_threshold', None)
    server_host = config['dogstatsd']['bind_host']
    dd_url = config['dd_url']
    port = config['dogstatsd']['port']
    forward_to_host = config['dogstatsd'].get('forward_host')
    forward_to_port = config['dogstatsd'].get('forward_port')
    non_local_traffic = config['dogstatsd'].get('non_local_traffic')
    so_rcvbuf = config['dogstatsd'].get('so_rcvbuf')
    utf8_decoding = config['dogstatsd'].get('utf8_decoding')

    interval = DOGSTATSD_FLUSH_INTERVAL
    aggregator_interval = DOGSTATSD_AGGREGATOR_BUCKET_SIZE

    hostname = get_hostname()

    # get proxy settings
    proxies = get_proxy()

    if not forwarder:
        forwarder = Forwarder(
            api_key,
            get_site_url(dd_url, site=config.get('site')),
            proxies=proxies,
        )

    aggregator = MetricsBucketAggregator(
        hostname,
        aggregator_interval,
        recent_point_threshold=recent_point_threshold,
        formatter=get_formatter(config),
        histogram_aggregates=config.get('histogram_aggregates'),
        histogram_percentiles=config.get('histogram_percentiles'),
        utf8_decoding=utf8_decoding)

    # serializer
    serializer = Serializer(
        aggregator,
        forwarder,
    )

    reporter = Reporter(interval, aggregator, serializer, api_key,
                        use_watchdog=False, hostname=hostname)

    # NOTICE: when `non_local_traffic` is passed we need to bind to any interface on the box. The forwarder uses
    # Tornado which takes care of sockets creation (more than one socket can be used at once depending on the
    # network settings), so it's enough to just pass an empty string '' to the library.
    # In Dogstatsd we use a single, fullstack socket, so passing '' as the address doesn't work and we default to
    # '0.0.0.0'. If someone needs to bind Dogstatsd to the IPv6 '::', they need to turn off `non_local_traffic` and
    # use the '::' meta address as `bind_host`.
    if non_local_traffic:
        server_host = '0.0.0.0'

    server = Server(aggregator, server_host, port,
                    forward_to_host=forward_to_host,
                    forward_to_port=forward_to_port,
                    so_rcvbuf=so_rcvbuf)

    return reporter, server, forwarder
def test_topology_collection(self):
    agentConfig = {
        'api_key': 'test_apikey',
        'check_timings': True,
        'collect_ec2_tags': True,
        'collect_orchestrator_tags': False,
        'collect_instance_metadata': False,
        'create_dd_check_tags': False,
        'version': 'test',
        'tags': '',
    }

    # Run a single checks.d check as part of the collector.
    dummy_topology_check_config = {
        "init_config": {},
        "instances": [{"dummy_instance": "dummy_instance"}]
    }

    # create dummy checks, each creating two components and 1 relation
    check1 = DummyTopologyCheck(
        1,
        'dummy_topology_check',
        dummy_topology_check_config.get('init_config'),
        agentConfig,
        instances=[{"instance_id": 1, "pass": True},
                   {"instance_id": 2, "pass": True}])
    check2 = DummyTopologyCheck(
        2,
        'dummy_topology_check',
        dummy_topology_check_config.get('init_config'),
        agentConfig,
        instances=[{"instance_id": 3, "pass": True},
                   {"instance_id": 4, "pass": True}],
        snapshot=True)

    emitted_topologies = []

    # mock emitter to pick up data emitted by the collector
    def mock_emitter(message, log, agentConfig, endpoint):
        emitted_topologies.extend(message['topologies'])

    c = Collector(agentConfig, [mock_emitter], {}, get_hostname(agentConfig))
    payload, _ = c.run({
        'initialized_checks': [check1, check2],
        'init_failed_checks': {}
    })
    topologies = payload['topologies']

    def assertTopology(topology, check, instance_id):
        self.assertEquals(topology['instance'], check.instance_key(instance_id))
        self.assertEquals(len(topology['components']), 2)
        self.assertEquals(len(topology['relations']), 1)
        self.assertEquals(check.expected_components(instance_id), topology['components'])
        self.assertEquals(check.expected_relations(), topology['relations'])
        if check.snapshot:
            self.assertTrue(topology["start_snapshot"])
            self.assertTrue(topology["stop_snapshot"])
        else:
            self.assertTrue("start_snapshot" not in topology)
            self.assertTrue("stop_snapshot" not in topology)

    # Make sure the emissions of the collector are observed
    assertTopology(topologies[0], check1, 1)
    assertTopology(topologies[1], check1, 2)
    assertTopology(topologies[2], check2, 4)
    assertTopology(topologies[3], check2, 3)

    assertTopology(emitted_topologies[0], check1, 1)
    assertTopology(emitted_topologies[1], check1, 2)
    assertTopology(emitted_topologies[2], check2, 4)
    assertTopology(emitted_topologies[3], check2, 3)