def _fetch_data(self, instance): if 'kong_status_url' not in instance: raise Exception('missing "kong_status_url" value') tags = instance.get('tags', []) url = instance.get('kong_status_url') ssl_validation = instance.get('ssl_validation', True) parsed_url = urlparse(url) host = parsed_url.hostname port = parsed_url.port or 80 service_check_name = 'kong.can_connect' service_check_tags = ['kong_host:%s' % host, 'kong_port:%s' % port] + tags try: self.log.debug(u"Querying URL: {0}".format(url)) response = requests.get(url, headers=headers(self.agentConfig), verify=ssl_validation) self.log.debug(u"Kong status `response`: {0}".format(response)) response.raise_for_status() except Exception: self.service_check(service_check_name, Kong.CRITICAL, tags=service_check_tags) raise else: if response.status_code == 200: self.service_check(service_check_name, Kong.OK, tags=service_check_tags) else: self.service_check(service_check_name, Kong.CRITICAL, tags=service_check_tags) return self._parse_json(response.content, tags)
def _fetch_url_data(self, url, username, password, verify, custom_headers):
    """Hit the HAProxy stats URL and return the stats lines."""
    # Try to fetch data from the stats URL
    stats_url = "%s%s" % (url, STATS_URL)
    custom_headers.update(headers(self.agentConfig))
    self.log.debug("Fetching haproxy stats from url: %s" % stats_url)

    response = requests.get(
        stats_url,
        auth=(username, password),
        headers=custom_headers,
        verify=verify,
        timeout=self.default_integration_http_timeout,
    )
    response.raise_for_status()

    if PY2:
        # Python 2: content is already a str, no additional decoding needed.
        return response.content.splitlines()

    payload = response.content
    # If payload is a str it can't be decoded again; only bytes exposes a
    # callable decode(), so probe for it before decoding.
    if callable(getattr(payload, "decode", None)):
        payload = payload.decode('utf-8')
    return payload.splitlines()
def get(self, url, instance, service_check_tags, run_check=False):
    """Hit a given URL and return the parsed json.

    Submits the service check as CRITICAL (and re-raises) on timeout, HTTP
    error, or any other failure; submits OK when `run_check` is True and the
    request succeeds.
    """
    self.log.debug('Fetching CouchDB stats at url: %s' % url)
    auth = None
    if 'user' in instance and 'password' in instance:
        auth = (instance['user'], instance['password'])
    # Override Accept request header so that failures are not redirected to the Futon web-ui
    request_headers = headers(self.agentConfig)
    request_headers['Accept'] = 'text/json'
    try:
        r = requests.get(url, auth=auth, headers=request_headers,
                         timeout=int(instance.get('timeout', self.TIMEOUT)))
        r.raise_for_status()
        if run_check:
            self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.OK,
                               tags=service_check_tags,
                               message='Connection to %s was successful' % url)
    except requests.exceptions.Timeout as e:
        self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.CRITICAL,
                           tags=service_check_tags,
                           message="Request timeout: {0}, {1}".format(url, e))
        raise
    except requests.exceptions.HTTPError as e:
        # BUG FIX: exceptions have no `.message` attribute on Python 3, so
        # `str(e.message)` raised AttributeError inside this handler and
        # masked the real HTTP error. `str(e)` is the portable equivalent.
        self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.CRITICAL,
                           tags=service_check_tags, message=str(e))
        raise
    except Exception as e:
        self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.CRITICAL,
                           tags=service_check_tags, message=str(e))
        raise
    return r.json()
def _process_status(self, status_url, auth, tags, http_host, timeout,
                    disable_ssl_validation, use_fastcgi):
    """Fetch the PHP-FPM status page (over FastCGI or HTTP) and submit its
    gauges and monotonic counters.

    Retries with exponential backoff on 503 (FPM returns 503 while busy).
    Returns the pool name so the caller can tag its service check with it.
    Raises if the status data cannot be retrieved at all.
    """
    data = {}
    try:
        if use_fastcgi:
            data = json.loads(self.request_fastcgi(status_url, query='json'))
        else:
            # TODO: adding the 'full' parameter gets you per-process detailed
            # informations, which could be nice to parse and output as metrics
            max_attempts = 3
            for i in range(max_attempts):
                resp = requests.get(status_url, auth=auth, timeout=timeout,
                                    headers=headers(self.agentConfig, http_host=http_host),
                                    verify=not disable_ssl_validation,
                                    params={'json': True})
                # Exponential backoff, wait at most (max_attempts - 1) times in case we get a 503.
                # Delay in seconds is (2^i + random amount of seconds between 0 and 1)
                # 503s originated here: https://github.com/php/php-src/blob/d84ef96/sapi/fpm/fpm/fpm_status.c#L96
                if resp.status_code == 503 and i < max_attempts - 1:
                    # retry
                    time.sleep(2**i + random.random())
                    continue
                resp.raise_for_status()
                data = resp.json()
                # successfully got a response, exit the backoff system
                break
    except Exception as e:
        self.log.error("Failed to get metrics from {}: {}".format(status_url, e))
        raise

    pool_name = data.get('pool', 'default')
    metric_tags = tags + ["pool:{0}".format(pool_name)]
    if http_host is not None:
        metric_tags += ["http_host:{0}".format(http_host)]

    for key, mname in iteritems(self.GAUGES):
        if key not in data:
            # FIX: Logger.warn() is a deprecated alias; use warning().
            self.log.warning("Gauge metric {0} is missing from FPM status".format(key))
            continue
        self.gauge(mname, int(data[key]), tags=metric_tags)

    for key, mname in iteritems(self.MONOTONIC_COUNTS):
        if key not in data:
            # FIX: Logger.warn() is a deprecated alias; use warning().
            self.log.warning("Counter metric {0} is missing from FPM status".format(key))
            continue
        self.monotonic_count(mname, int(data[key]), tags=metric_tags)

    # return pool, to tag the service check with it if we have one
    return pool_name
def _process_ping(self, ping_url, ping_reply, auth, tags, pool_name,
                  http_host, timeout, disable_ssl_validation):
    """Ping the FPM pool and submit OK/CRITICAL depending on the reply."""
    expected_reply = 'pong' if ping_reply is None else ping_reply

    sc_tags = ["ping_url:{0}".format(ping_url)] + tags
    if http_host is not None:
        sc_tags += ["http_host:{0}".format(http_host)]

    try:
        # TODO: adding the 'full' parameter gets you per-process detailed
        # informations, which could be nice to parse and output as metrics
        resp = requests.get(ping_url, auth=auth, timeout=timeout,
                            headers=headers(self.agentConfig, http_host=http_host),
                            verify=not disable_ssl_validation)
        resp.raise_for_status()
        if expected_reply not in resp.text:
            raise Exception("Received unexpected reply to ping: {}".format(resp.text))
    except Exception as e:
        self.log.error("Failed to ping FPM pool {} on URL {}: {}".format(
            pool_name, ping_url, e))
        self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.CRITICAL,
                           tags=sc_tags, message=str(e))
    else:
        self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.OK, tags=sc_tags)
def _perform_request(self, instance, url, ssl_validation, auth):
    """GET `url` with agent headers/proxy settings; raise on HTTP errors and
    return the response object."""
    response = requests.get(
        url,
        auth=auth,
        headers=headers(self.agentConfig),
        verify=ssl_validation,
        timeout=self.default_integration_http_timeout,
        proxies=self.get_instance_proxy(instance, url),
    )
    response.raise_for_status()
    return response
def _get_data(self, url, config, send_sc=True):
    """Hit a given URL and return the parsed json.

    Raises AuthenticationError on a 400 response (broken credentials);
    otherwise submits the connect service check (when `send_sc`) and
    re-raises the underlying request error.
    """
    # Load basic authentication configuration, if available.
    if config.username and config.password:
        auth = (config.username, config.password)
    else:
        auth = None

    # Load SSL configuration, if available.
    # ssl_verify can be a bool or a string
    # (http://docs.python-requests.org/en/latest/user/advanced/#ssl-cert-verification)
    if isinstance(config.ssl_verify, (bool, str)):
        verify = config.ssl_verify
    else:
        verify = None

    if config.ssl_cert and config.ssl_key:
        cert = (config.ssl_cert, config.ssl_key)
    elif config.ssl_cert:
        cert = config.ssl_cert
    else:
        cert = None

    resp = None
    try:
        resp = requests.get(
            url,
            timeout=config.timeout,
            headers=headers(self.agentConfig),
            auth=auth,
            verify=verify,
            cert=cert
        )
        resp.raise_for_status()
    except Exception as e:
        # this means we've hit a particular kind of auth error that means the config is broken
        # BUG FIX: `if resp and ...` used Response.__bool__, which is False for
        # any 4xx/5xx status, so the 400 branch could never fire. Compare
        # against None explicitly instead.
        if resp is not None and resp.status_code == 400:
            raise AuthenticationError("The ElasticSearch credentials are incorrect")
        if send_sc:
            self.service_check(
                self.SERVICE_CHECK_CONNECT_NAME,
                AgentCheck.CRITICAL,
                message="Error {0} when hitting {1}".format(e, url),
                tags=config.service_check_tags
            )
        raise

    self.log.debug("request to url {0} returned: {1}".format(url, resp))
    return resp.json()
def _get_stats(self, url, instance):
    """Hit a given Couchbase URL and return the parsed json."""
    self.log.debug('Fetching Couchbase stats at url: %s' % url)

    credentials = None
    if 'user' in instance and 'password' in instance:
        credentials = (instance['user'], instance['password'])

    response = requests.get(
        url,
        auth=credentials,
        headers=headers(self.agentConfig),
        timeout=float(instance.get('timeout', DEFAULT_TIMEOUT)),
    )
    response.raise_for_status()
    return response.json()
def check(self, instance):
    """Scrape the Fluentd monitor agent, submit plugin gauges, and emit a
    service check reflecting connectivity."""
    if 'monitor_agent_url' not in instance:
        raise Exception('Fluentd instance missing "monitor_agent_url" value.')

    try:
        url = instance.get('monitor_agent_url')
        plugin_ids = instance.get('plugin_ids', [])
        custom_tags = instance.get('tags', [])

        # Fallback with `tag_by: plugin_id`
        tag_by = instance.get('tag_by')
        if tag_by not in self._AVAILABLE_TAGS:
            tag_by = 'plugin_id'

        parsed_url = urlparse.urlparse(url)
        service_check_tags = [
            'fluentd_host:%s' % parsed_url.hostname,
            'fluentd_port:%s' % (parsed_url.port or 24220),
        ] + custom_tags

        timeout = float(instance.get('timeout', self.default_timeout))
        r = requests.get(url, headers=headers(self.agentConfig), timeout=timeout)
        r.raise_for_status()
        status = r.json()

        for plugin in status['plugins']:
            tag = "%s:%s" % (tag_by, plugin.get(tag_by))
            for gauge_name in self.GAUGES:
                if plugin.get(gauge_name) is None:
                    continue
                # Filter unspecified plugins to keep backward compatibility.
                if not plugin_ids or plugin.get('plugin_id') in plugin_ids:
                    self.gauge('fluentd.%s' % (gauge_name),
                               plugin.get(gauge_name), [tag] + custom_tags)
    except Exception as e:
        msg = "No stats could be retrieved from %s : %s" % (url, str(e))
        self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.CRITICAL,
                           tags=service_check_tags, message=msg)
        raise
    else:
        self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.OK,
                           tags=service_check_tags)
def _perform_request(self, url, path, ssl_params, timeout):
    """GET `url + path` with the SSL settings from `ssl_params` and return
    the raw response.

    NOTE(review): requires the 'ssl_cert_validation' key to be present in
    ssl_params (direct indexing) — same as the original contract.
    """
    client_cert = None
    if 'ssl_certfile' in ssl_params and 'ssl_keyfile' in ssl_params:
        client_cert = (ssl_params['ssl_certfile'], ssl_params['ssl_keyfile'])

    if ssl_params['ssl_cert_validation']:
        verify = ssl_params.get('ssl_ca_certs', True)
    else:
        verify = False

    return requests.get(url + path, verify=verify, cert=client_cert,
                        timeout=timeout, headers=headers(self.agentConfig))
def _fetch_url_data(self, url, username, password, verify, custom_headers):
    """Hit the HAProxy stats URL and return the raw stats lines."""
    # Try to fetch data from the stats URL
    stats_url = "%s%s" % (url, STATS_URL)
    custom_headers.update(headers(self.agentConfig))
    self.log.debug("Fetching haproxy stats from url: %s" % stats_url)

    response = requests.get(
        stats_url,
        auth=(username, password),
        headers=custom_headers,
        verify=verify,
        timeout=self.default_integration_http_timeout,
    )
    response.raise_for_status()
    return response.content.splitlines()
def get(self, url, service_check_tags, run_check=False):
    """Hit a given URL and return the parsed json."""
    self.log.debug('Fetching CouchDB stats at url: %s' % url)
    # Override Accept request header so that failures are not redirected to the Futon web-ui
    request_headers = headers(self.agentConfig)
    request_headers['Accept'] = 'text/json'

    try:
        response = self.http.get(url, headers=request_headers)
        response.raise_for_status()
        if run_check:
            self.service_check(
                self.SERVICE_CHECK_NAME,
                AgentCheck.OK,
                tags=service_check_tags,
                message='Connection to %s was successful' % url,
            )
    except requests.exceptions.Timeout as e:
        self.service_check(
            self.SERVICE_CHECK_NAME,
            AgentCheck.CRITICAL,
            tags=service_check_tags,
            message="Request timeout: {0}, {1}".format(url, e),
        )
        raise
    except requests.exceptions.HTTPError as e:
        self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.CRITICAL,
                           tags=service_check_tags, message=str(e))
        raise
    except Exception as e:
        self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.CRITICAL,
                           tags=service_check_tags, message=str(e))
        raise

    return response.json()
def _check_connectivity_to_master(self, instance, tags): url = instance.get('gitlab_url') if url is None: # Simply ignore this service check if not configured return parsed_url = urlparse(url) gitlab_host = parsed_url.hostname gitlab_port = 443 if parsed_url.scheme == 'https' else (parsed_url.port or 80) service_check_tags = [ 'gitlab_host:{}'.format(gitlab_host), 'gitlab_port:{}'.format(gitlab_port) ] service_check_tags.extend(tags) # Load the ssl configuration ssl_cert_validation = _is_affirmative( instance.get('ssl_cert_validation', True)) ssl_ca_certs = instance.get('ssl_ca_certs', True) verify_ssl = ssl_ca_certs if ssl_cert_validation else False # Timeout settings timeouts = ( int( instance.get('connect_timeout', GitlabRunnerCheck.DEFAULT_CONNECT_TIMEOUT)), int( instance.get('receive_timeout', GitlabRunnerCheck.DEFAULT_RECEIVE_TIMEOUT)), ) # Auth settings auth = None if 'gitlab_user' in instance and 'gitlab_password' in instance: auth = (instance['gitlab_user'], instance['gitlab_password']) try: self.log.debug("checking connectivity against {}".format(url)) r = requests.get(url, auth=auth, verify=verify_ssl, timeout=timeouts, headers=headers(self.agentConfig)) if r.status_code != 200: self.service_check( self.MASTER_SERVICE_CHECK_NAME, OpenMetricsBaseCheck.CRITICAL, message="Got {} when hitting {}".format( r.status_code, url), tags=service_check_tags, ) raise Exception("Http status code {} on url {}".format( r.status_code, url)) else: r.raise_for_status() except requests.exceptions.Timeout: # If there's a timeout self.service_check( self.MASTER_SERVICE_CHECK_NAME, OpenMetricsBaseCheck.CRITICAL, message="Timeout when hitting {}".format(url), tags=service_check_tags, ) raise except Exception as e: self.service_check( self.MASTER_SERVICE_CHECK_NAME, OpenMetricsBaseCheck.CRITICAL, message="Error hitting {}. 
Error: {}".format(url, e), tags=service_check_tags, ) raise else: self.service_check(self.MASTER_SERVICE_CHECK_NAME, OpenMetricsBaseCheck.OK, tags=service_check_tags) self.log.debug("gitlab check succeeded")
def check(self, instance):
    """Run the Lighttpd check: scrape the status page, submit an availability
    service check, and emit the parsed metrics.

    If nothing parses, retries once by appending a version-specific URL
    suffix (recursing into check()); otherwise raises.
    """
    if 'lighttpd_status_url' not in instance:
        raise Exception("Missing 'lighttpd_status_url' variable in Lighttpd config")

    # A previously "assumed" (suffix-corrected) URL takes precedence over the raw config value.
    url = self.assumed_url.get(instance['lighttpd_status_url'],
                               instance['lighttpd_status_url'])

    tags = instance.get('tags', [])

    # Authentication: 'basic' (default) or 'digest'; anything else is rejected.
    auth = None
    auth_type = instance.get('auth_type', 'basic').lower()
    if auth_type == 'basic':
        if 'user' in instance and 'password' in instance:
            auth = (instance['user'], instance['password'])
    elif auth_type == 'digest':
        if 'user' in instance and 'password' in instance:
            auth = requests.auth.HTTPDigestAuth(instance['user'], instance['password'])
    else:
        msg = "Unsupported value of 'auth_type' variable in Lighttpd config: {}".format(auth_type)
        raise Exception(msg)

    self.log.debug("Connecting to %s" % url)

    # Submit a service check for status page availability.
    parsed_url = urlparse(url)
    lighttpd_url = parsed_url.hostname
    lighttpd_port = parsed_url.port or 80
    service_check_tags = ['host:%s' % lighttpd_url, 'port:%s' % lighttpd_port] + tags
    try:
        r = requests.get(url, auth=auth, headers=headers(self.agentConfig))
        r.raise_for_status()
    except Exception:
        self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.CRITICAL, tags=service_check_tags)
        raise
    else:
        self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.OK, tags=service_check_tags)

    headers_resp = r.headers
    server_version = self._get_server_version(headers_resp)
    response = r.content

    metric_count = 0
    # Loop through and extract the numerical values
    for line in response.split(b'\n'):
        values = line.split(b': ')
        if len(values) == 2:  # match
            metric, value = values
            try:
                value = float(value)
            except ValueError:
                continue

            # Special case: kBytes => bytes
            if metric == b'Total kBytes':
                value = value * 1024

            # Send metric as a gauge, if applicable
            if metric in self.GAUGES:
                metric_count += 1
                metric_name = self.GAUGES[metric]
                self.gauge(metric_name, value, tags=tags)

            # Send metric as a rate, if applicable
            if metric in self.RATES:
                metric_count += 1
                metric_name = self.RATES[metric]
                self.rate(metric_name, value, tags=tags)

            # Send metric as a counter, if applicable
            if metric in self.COUNTERS:
                metric_count += 1
                metric_name = self.COUNTERS[metric]
                self.increment(metric_name, value, tags=tags)

    if metric_count == 0:
        # Nothing parsed: the configured URL may lack the version-specific
        # suffix. Record the corrected URL and recurse once; if the suffix is
        # already present (or was already assumed), give up with an error.
        url_suffix = self.URL_SUFFIX_PER_VERSION[server_version]
        if self.assumed_url.get(
                instance['lighttpd_status_url']
        ) is None and url[-len(url_suffix):] != url_suffix:
            self.assumed_url[instance['lighttpd_status_url']] = '%s%s' % (url, url_suffix)
            self.warning(
                "Assuming url was not correct. Trying to add %s suffix to the url" % url_suffix)
            self.check(instance)
        else:
            raise Exception(
                "No metrics were fetched for this instance. Make sure "
                "that %s is the proper url." % instance['lighttpd_status_url'])
def _check_health_endpoint(self, instance, check_type, tags):
    """Hit the GitLab `/-/<check_type>` health endpoint and submit the
    `gitlab.<check_type>` service check.

    Raises CheckException for a check_type not in ALLOWED_SERVICE_CHECKS;
    silently skips when `gitlab_url` is not configured; re-raises request
    failures after submitting CRITICAL.
    """
    if check_type not in self.ALLOWED_SERVICE_CHECKS:
        raise CheckException("Health endpoint {} is not a valid endpoint".format(check_type))

    url = instance.get('gitlab_url')
    if url is None:
        # Simply ignore this service check if not configured
        self.log.debug("gitlab_url not configured, service check {} skipped".format(check_type))
        return

    service_check_tags = self._service_check_tags(url)
    service_check_tags.extend(tags)
    verify_ssl = self._verify_ssl(instance)

    # Timeout settings
    timeouts = (
        int(instance.get('connect_timeout', GitlabCheck.DEFAULT_CONNECT_TIMEOUT)),
        int(instance.get('receive_timeout', GitlabCheck.DEFAULT_RECEIVE_TIMEOUT)),
    )

    # Auth settings
    auth = None
    if 'gitlab_user' in instance and 'gitlab_password' in instance:
        auth = (instance['gitlab_user'], instance['gitlab_password'])

    # These define which endpoint is hit and which type of check is actually performed
    # TODO: parse errors and report for single sub-service failure?
    service_check_name = 'gitlab.{}'.format(check_type)
    check_url = '{}/-/{}'.format(url, check_type)

    try:
        self.log.debug("checking {} against {}".format(check_type, check_url))
        r = requests.get(check_url, auth=auth, verify=verify_ssl, timeout=timeouts,
                         headers=headers(self.agentConfig))
        if r.status_code != 200:
            self.service_check(
                service_check_name,
                OpenMetricsBaseCheck.CRITICAL,
                message="Got {} when hitting {}".format(r.status_code, check_url),
                tags=service_check_tags,
            )
            raise Exception("Http status code {} on check_url {}".format(
                r.status_code, check_url))
        else:
            r.raise_for_status()
    except requests.exceptions.Timeout:
        # If there's a timeout
        self.service_check(
            service_check_name,
            OpenMetricsBaseCheck.CRITICAL,
            message="Timeout when hitting {}".format(check_url),
            tags=service_check_tags,
        )
        raise
    except Exception as e:
        self.service_check(
            service_check_name,
            OpenMetricsBaseCheck.CRITICAL,
            message="Error hitting {}. Error: {}".format(check_url, e),
            tags=service_check_tags,
        )
        raise
    else:
        self.service_check(service_check_name, OpenMetricsBaseCheck.OK,
                           tags=service_check_tags)

    self.log.debug("gitlab check {} succeeded".format(check_type))
def check(self, instance):
    """Scrape the Fluentd monitor agent via self.http, submit plugin gauges,
    and emit a service check reflecting connectivity."""
    if 'monitor_agent_url' not in instance:
        raise Exception('Fluentd instance missing "monitor_agent_url" value.')

    try:
        url = instance.get('monitor_agent_url')
        plugin_ids = instance.get('plugin_ids', [])
        custom_tags = instance.get('tags', [])

        # Fallback with `tag_by: plugin_id`
        tag_by = instance.get('tag_by')
        if tag_by not in self._AVAILABLE_TAGS:
            tag_by = 'plugin_id'

        parsed_url = urlparse(url)
        service_check_tags = [
            'fluentd_host:%s' % parsed_url.hostname,
            'fluentd_port:%s' % (parsed_url.port or 24220),
        ] + custom_tags

        self.HTTP_CONFIG_REMAPPER = {
            'headers': {'name': 'headers', 'default': headers(self.agentConfig)},
            'timeout': {'name': 'timeout', 'default': self.default_timeout},
        }

        r = self.http.get(url)
        r.raise_for_status()
        status = r.json()

        for plugin in status['plugins']:
            tag = "%s:%s" % (tag_by, plugin.get(tag_by))
            for m in self.GAUGES:
                metric = plugin.get(m)
                if metric is None:
                    continue
                if m == 'retry_count':
                    # Since v1, retry_count counts the total number of errors.
                    # Use retry/steps field for temporal retry count instead.
                    retry_info = plugin.get("retry")
                    if retry_info is not None:
                        steps = retry_info.get("steps")
                        metric = steps if steps is not None else 0
                # Filter unspecified plugins to keep backward compatibility.
                if not plugin_ids or plugin.get('plugin_id') in plugin_ids:
                    self.gauge('fluentd.%s' % (m), metric, [tag] + custom_tags)
    except Exception as e:
        msg = "No stats could be retrieved from %s : %s" % (url, str(e))
        self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.CRITICAL,
                           tags=service_check_tags, message=msg)
        raise
    else:
        self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.OK,
                           tags=service_check_tags)