Example #1
    def get_connection(self, key, host, port, user, password, dbname, ssl, connect_fct, tags, use_cached=True):
        "Get and memoize connections to instances"
        if key in self.dbs and use_cached:
            return self.dbs[key]

        elif host != "" and user != "":
            try:
                if host == 'localhost' and password == '':
                    # Use ident method
                    connection = connect_fct("user=%s dbname=%s" % (user, dbname))
                elif port != '':
                    connection = connect_fct(host=host, port=port, user=user,
                                             password=password, database=dbname, ssl=ssl)
                elif host.startswith('/'):
                    # If the hostname starts with /, it's probably a path
                    # to a UNIX socket. This is similar behaviour to psql
                    connection = connect_fct(unix_sock=host, user=user,
                                             password=password, database=dbname)
                else:
                    connection = connect_fct(host=host, user=user, password=password,
                                             database=dbname, ssl=ssl)
            except Exception as e:
                message = u'Error establishing postgres connection: %s' % (str(e))
                service_check_tags = self._get_service_check_tags(host, port, tags)
                self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.CRITICAL,
                                   tags=service_check_tags, message=message)
                raise
        else:
            if not host:
                raise CheckException("Please specify a Postgres host to connect to.")
            elif not user:
                raise CheckException("Please specify a user to connect to Postgres as.")

        self.dbs[key] = connection
        return connection
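Usage sketch, not part of the example: the unix_sock and ssl keyword arguments suggest a pg8000-style connect function, so a call might look like the following. The cache key format and the `check` instance are hypothetical.

    import pg8000

    key = '{}:{}:{}'.format(host, port, dbname)  # hypothetical cache key
    db = check.get_connection(key, host, port, user, password, dbname,
                              ssl=False, connect_fct=pg8000.connect,
                              tags=[], use_cached=True)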
Example #2
    def check(self, instance):
        service_check_metric_name = 'nifi.instance.http_check'
        timeout = 10

        if 'url' not in instance:
            raise CheckException("No url defined for Nifi instance")
        url = instance.get('url')
        url = "{0}/{1}".format(url, ENDPOINT)

        instance_tags = instance.get('tags', [])
        self.log.info('Connecting to Nifi instance {0}'.format(url))
        try:
            r = requests.get(url, timeout=timeout)
            r.raise_for_status()
        except requests.exceptions.Timeout as e:
            self.service_check(service_check_metric_name,
                               self.WARNING,
                               tags=instance_tags,
                               message=str(e))
            return
        except Exception as e:
            self.service_check(service_check_metric_name,
                               self.CRITICAL,
                               tags=instance_tags)
            raise CheckException(e)
        self.service_check(service_check_metric_name,
                           self.OK,
                           tags=instance_tags)
        # Obtain all the key metrics from Nifi to send to DataDog
        for point in NiFiCheck.get_system_metrics(r.json()):
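            # Submit integer values as rates and everything else as gauges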
            if type(point.metric) is int:
                self.rate(point.type, point.metric, tags=instance_tags)
            else:
                self.gauge(point.type, point.metric, tags=instance_tags)
            time.sleep(1)
Example #3
    def _get_custom_metrics(self, custom_metrics, key):
        # Pre-processed cached custom_metrics
        if key in self.custom_metrics:
            return self.custom_metrics[key]

        # Otherwise pre-process custom metrics and verify definition
        required_parameters = ("descriptors", "metrics", "query", "relation")

        for m in custom_metrics:
            for param in required_parameters:
                if param not in m:
                    raise CheckException(
                        "Missing {0} parameter in custom metric".format(param))

            self.log.debug("Metric: {0}".format(m))

            try:
                for ref, (_, mtype) in m['metrics'].iteritems():
                    cap_mtype = mtype.upper()
                    if cap_mtype not in ('RATE', 'GAUGE', 'MONOTONIC'):
                        raise CheckException(
                            "Collector method {0} is not known. "
                            "Known methods are RATE, GAUGE, MONOTONIC".format(
                                cap_mtype))

                    m['metrics'][ref][1] = getattr(PostgreSql, cap_mtype)
                    self.log.debug("Method: %s" % (str(mtype)))
            except Exception as e:
                raise CheckException(
                    "Error processing custom metric '{}': {}".format(m, e))

        self.custom_metrics[key] = custom_metrics
        return custom_metrics
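For reference, a hedged sketch of one custom metric definition that satisfies the four required parameters validated above; the values are illustrative, and each entry in 'metrics' is a [datadog_name, type] pair, matching how the code assigns m['metrics'][ref][1].

    custom_metrics = [{
        'descriptors': [('datname', 'db')],
        'metrics': {'numbackends': ['postgresql.custom.connections', 'GAUGE']},
        'query': "SELECT datname, %s FROM pg_stat_database",
        'relation': False,
    }]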
Example #4
    def _get_connect_kwargs(self, host, port, user, password, database_url):
        """
        Get the params to pass to psycopg2.connect() based on passed-in vals
        from yaml settings file
        """
        if database_url:
            return {'dsn': database_url}

        if not host:
            raise CheckException(
                "Please specify a PgBouncer host to connect to.")

        if not user:
            raise CheckException(
                "Please specify a user to connect to PgBouncer as.")

        if host in ('localhost', '127.0.0.1') and password == '':
            return {  # Use ident method
                'dsn': "user={} dbname={}".format(user, self.DB_NAME)
            }

        if port:
            return {'host': host, 'user': user, 'password': password,
                    'database': self.DB_NAME, 'port': port}

        return {'host': host, 'user': user, 'password': password,
                'database': self.DB_NAME}
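Since the docstring names psycopg2.connect(), usage reduces to keyword expansion; the connection values below are hypothetical.

    import psycopg2

    kwargs = self._get_connect_kwargs(host='localhost', port='6432',
                                      user='datadog', password='',
                                      database_url=None)
    connection = psycopg2.connect(**kwargs)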
Example #5
    def _send_service_check(self, url, response, status, failure_expected=False, tags=None, message=None):
        if status is AgentCheck.CRITICAL and failure_expected:
            status = AgentCheck.OK
            message = "Got %s when hitting %s" % (response.status_code, url)
            raise CheckException(message)
        elif status is AgentCheck.CRITICAL and not failure_expected:
            raise CheckException('Cannot connect to mesos. Error: {0}'.format(message))
        if self.service_check_needed:
            self.service_check(self.SERVICE_CHECK_NAME, status, tags=tags, message=message)
            self.service_check_needed = False

    def get_metadata(self, type):
        try:
            response = requests.get(
                "http://169.254.169.254/latest/meta-data/{}".format(type))
            response.raise_for_status()
        except requests.exceptions.HTTPError as e:
            raise CheckException("HTTP error caught: {}".format(e))
        except requests.exceptions.RequestException as e:
            raise CheckException("Connection error: {}".format(e))
        return response.text
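A hedged usage note: get_metadata wraps the EC2 instance metadata service, so a call like the one below (meaningful only on an EC2 host) returns this instance's ID.

    instance_id = self.get_metadata('instance-id')  # 'instance-id' is a standard metadata path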

    def check(self, instance):
        try:
            region_name = instance.get('region_name')
            if not region_name:
                region_name = 'us-east-1'

            pricing_client = boto3.client('pricing', region_name=region_name)

            service_codes = get_aws_service_codes(pricing_client)
            rate_codes_dict = get_rate_codes_dict_from_instance(
                service_codes, instance)

            # Python dictionaries evaluate to false when empty
            if not rate_codes_dict:
                message = 'No rate codes for existing AWS services were defined, please fix conf.yaml'
                self.service_check('aws_pricing.status',
                                   self.CRITICAL,
                                   message=message)
                raise CheckException(message)

            missing_rate_codes = defaultdict(list)

            for service_code, rate_codes in iteritems(rate_codes_dict):
                for rate_code in rate_codes:
                    price_dimensions = get_aws_prices(pricing_client,
                                                      service_code, rate_code)

                    if price_dimensions is None:
                        missing_rate_codes[service_code].append(rate_code)
                        continue

                    name = 'aws.pricing.{}'.format(service_code.lower())
                    price = get_price_from_price_dimensions(price_dimensions)
                    tags = get_tags_from_price_dimensions(price_dimensions)

                    self.gauge(name, price, tags)

            # Python dictionaries evaluate to true when not empty
            if not missing_rate_codes:
                self.service_check('aws_pricing.status', self.OK)
            else:
                message = 'Pricing data not found for these service rate codes: {}'.format(
                    dict(missing_rate_codes))
                self.service_check('aws_pricing.status',
                                   self.WARNING,
                                   message=message)

        except ClientError as client_error:
            self.service_check('aws_pricing.status',
                               self.CRITICAL,
                               message=str(client_error))
            raise CheckException('Pricing Service client error: {}'.format(
                str(client_error)))

    def check(self, instance):
        url = instance.get('url', '')
        default_timeout = instance.get('default_timeout', 5)
        timeout = float(instance.get('timeout', default_timeout))
        tags = instance.get('tags', [])

        if not url:
            raise CheckException("Configuration error, please fix conf.yaml")

        try:
            r = requests.get(url, timeout=timeout)
        except requests.exceptions.Timeout:
            raise CheckException('URL: {0} timed out after {1} '
                                 'seconds.'.format(url, timeout))
        except requests.exceptions.ConnectionError as e:
            raise CheckException(e)

        if r.status_code != 200:
            raise CheckException('Invalid Status Code, {0} returned a status '
                                 'of {1}.'.format(url, r.status_code))

        try:
            stats = json.loads(r.text)
        except ValueError:
            raise CheckException('{0} returned an unserializable '
                                 'payload'.format(url))

        for key, val in stats.iteritems():
            if key in self.REPL_STATS:
                self.safe_submit_metric("riak_repl." + key, val, tags=tags)

        if stats['realtime_enabled'] is not None:
            for key, val in stats['realtime_queue_stats'].iteritems():
                if key in self.REALTIME_QUEUE_STATS:
                    self.safe_submit_metric("riak_repl.realtime_queue_stats." +
                                            key,
                                            val,
                                            tags=tags)

        for c in stats['connected_clusters']:
            cluster = c.replace("-", "_")
            if c not in stats['fullsync_coordinator']:
                continue
            for key, val in stats['fullsync_coordinator'][c].iteritems():
                if key in self.FULLSYNC_COORDINATOR:
                    self.safe_submit_metric("riak_repl.fullsync_coordinator." +
                                            cluster + "." + key,
                                            val,
                                            tags=tags)
Example #9
    def _get_tls_object(self, ssl_params):
        """
        Return a TLS object to establish a secure connection to a server
        """
        if ssl_params is None:
            return None

        if not ssl_params["verify"] and ssl_params["ca_certs"]:
            self.warning("Incorrect configuration: trying to disable server certificate validation, "
                         "while also specifying a capath. No validation will be performed. Fix your "
                         "configuration to remove this warning")

        validate = ssl.CERT_REQUIRED if ssl_params["verify"] else ssl.CERT_NONE

        if ssl_params["ca_certs"] is None or os.path.isfile(ssl_params["ca_certs"]):
            tls = ldap3.core.tls.Tls(
                local_private_key_file=ssl_params["key"],
                local_certificate_file=ssl_params["cert"],
                ca_certs_file=ssl_params["ca_certs"],
                version=ssl.PROTOCOL_SSLv23,
                validate=validate,
            )
        elif os.path.isdir(ssl_params["ca_certs"]):
            tls = ldap3.core.tls.Tls(
                local_private_key_file=ssl_params["key"],
                local_certificate_file=ssl_params["cert"],
                ca_certs_path=ssl_params["ca_certs"],
                version=ssl.PROTOCOL_SSLv23,
                validate=validate,
            )
        else:
            raise CheckException("Invalid path {} for ssl_ca_certs: no such file or directory"
                                 .format(ssl_params["ca_certs"]))
        return tls
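A hedged sketch of consuming the returned Tls object with ldap3; the server name is hypothetical, and ldap3.Server accepts the object through its tls keyword.

    import ldap3

    tls = self._get_tls_object(ssl_params)
    server = ldap3.Server('ldap.example.org', port=636, use_ssl=True, tls=tls)
    conn = ldap3.Connection(server, auto_bind=True)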
Example #10
    def _create_gitlab_runner_prometheus_instance(self, instance, init_config):
        """
        Set up the gitlab_runner instance so it can be used in OpenMetricsBaseCheck
        """
        # Mapping from Prometheus metrics names to Datadog ones
        # For now it's a 1:1 mapping
        allowed_metrics = init_config.get('allowed_metrics')
        if allowed_metrics is None:
            raise CheckException(
                "At least one metric must be whitelisted in `allowed_metrics`."
            )

        gitlab_runner_instance = deepcopy(instance)

        # gitlab_runner uses 'prometheus_endpoint' and not 'prometheus_url', so we have to rename the key
        gitlab_runner_instance['prometheus_url'] = instance.get(
            'prometheus_endpoint', None)

        gitlab_runner_instance.update({
            'namespace':
            'gitlab_runner',
            'metrics':
            allowed_metrics,
            # Defaults that were set when gitlab_runner was based on PrometheusCheck
            'send_monotonic_counter':
            instance.get('send_monotonic_counter', False),
            'health_service_check':
            instance.get('health_service_check', False)
        })

        return gitlab_runner_instance
Example #11
    def check(self, instance):
        endpoint = instance.get('kube_state_url')
        if endpoint is None:
            raise CheckException("Unable to find kube_state_url in config file.")

        if 'labels_mapper' in instance:
            if isinstance(instance['labels_mapper'], dict):
                self.labels_mapper = instance['labels_mapper']
            else:
                self.log.warning("labels_mapper should be a dictionnary")

        send_buckets = instance.get('send_histograms_buckets', True)
        # By default we send the buckets.
        if send_buckets is not None and str(send_buckets).lower() == 'false':
            send_buckets = False
        else:
            send_buckets = True

        self.custom_tags = instance.get('tags', [])
        if self.custom_tags is None:
            self.custom_tags = []
        # Job counters are monotonic: they increase at every run of the job
        # We want to send the delta via the `monotonic_count` method
        self.job_succeeded_count = defaultdict(int)
        self.job_failed_count = defaultdict(int)

        self.process(endpoint, send_histograms_buckets=send_buckets, instance=instance)

        for job_tags, job_count in self.job_succeeded_count.iteritems():
            self.monotonic_count(self.NAMESPACE + '.job.succeeded', job_count, list(job_tags))
        for job_tags, job_count in self.job_failed_count.iteritems():
            self.monotonic_count(self.NAMESPACE + '.job.failed', job_count, list(job_tags))

    def check(self, instance):
        host, custom_tags, timeout, response_time = self._load_conf(instance)

        custom_tags.append("target_host:{}".format(host))

        try:
            lines = self._exec_ping(timeout, host)
            regex = re.compile(r"time=((\d|\.)*)")
            result = regex.findall(lines)
            if result:
                length = result[0][0]
            else:
                raise CheckException("No time= found ({})".format(lines))

        except CheckException as e:
            self.log.info("{} is DOWN ({})".format(host, str(e)))
            self.service_check(self.SERVICE_CHECK_NAME,
                               AgentCheck.CRITICAL,
                               custom_tags,
                               message=str(e))
            self.gauge(self.SERVICE_CHECK_NAME, 0, custom_tags)

            raise e

        if response_time:
            self.gauge("network.ping.response_time", length, custom_tags)

        self.log.debug("{} is UP".format(host))
        self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.OK, custom_tags)
        self.gauge(self.SERVICE_CHECK_NAME, 1, custom_tags)

    def _exec_ping(self, timeout, target_host):
        if platform.system() == "Windows":  # pragma: nocover
            countOption = "-n"
            timeoutOption = "-w"
            # The timeout option is in ms on Windows
            # https://docs.microsoft.com/en-us/windows-server/administration/windows-commands/ping
            timeout = timeout * 1000
        elif platform.system() == "Darwin":
            countOption = "-c"
            timeoutOption = "-W"  # Also in ms on Mac
            timeout = timeout * 1000
        else:
            # The timeout option is in seconds on Linux, leaving timeout as is
            # https://linux.die.net/man/8/ping
            countOption = "-c"
            timeoutOption = "-W"

        self.log.debug("Running: ping {} {} {} {} {}".format(
            countOption, "1", timeoutOption, str(timeout), target_host))

        lines, err, retcode = get_subprocess_output(
            ["ping", countOption, "1", timeoutOption, str(timeout), target_host],
            self.log,
            raise_on_empty_output=True)
        self.log.debug("ping returned {} - {} - {}".format(
            retcode, lines, err))
        if retcode != 0:
            raise CheckException("ping returned {}: {}".format(retcode, err))

        return lines

    def check(self, instance):
        #### Metrics collection
        endpoint = instance.get('prometheus_endpoint')
        custom_tags = instance.get('tags', [])
        if endpoint is None:
            raise CheckException(
                "Unable to find prometheus_endpoint in config file.")

        # By default we send the buckets
        send_buckets = _is_affirmative(
            instance.get('send_histograms_buckets', True))

        try:
            self.process(endpoint,
                         send_histograms_buckets=send_buckets,
                         instance=instance)
            self.service_check(self.PROMETHEUS_SERVICE_CHECK_NAME,
                               PrometheusCheck.OK,
                               tags=custom_tags)
        except requests.exceptions.ConnectionError as e:
            # Unable to connect to the metrics endpoint
            self.service_check(
                self.PROMETHEUS_SERVICE_CHECK_NAME,
                PrometheusCheck.CRITICAL,
                message="Unable to retrieve Prometheus metrics from endpoint "
                        "%s: %s" % (endpoint, e.message),
                tags=custom_tags)

        #### Service check to check whether the Runner can talk to the Gitlab master
        self._check_connectivity_to_master(instance, custom_tags)
Example #15
    def check(self, instance):
        self.kubelet_conn_info = get_connection_info()
        endpoint = self.kubelet_conn_info.get('url')
        if endpoint is None:
            raise CheckException(
                "Unable to find metrics_endpoint in config "
                "file or detect the kubelet URL automatically.")

        self.metrics_url = instance.get('metrics_endpoint') or urljoin(
            endpoint, CADVISOR_METRICS_PATH)
        self.kube_health_url = urljoin(endpoint, KUBELET_HEALTH_PATH)
        self.node_spec_url = urljoin(endpoint, NODE_SPEC_PATH)
        self.pod_list_url = urljoin(endpoint, POD_LIST_PATH)

        # By default we send the buckets.
        send_buckets = instance.get('send_histograms_buckets', True)
        if send_buckets is not None and str(send_buckets).lower() == 'false':
            send_buckets = False
        else:
            send_buckets = True

        try:
            self.pod_list = self.retrieve_pod_list()
        except Exception:
            self.pod_list = None

        instance_tags = instance.get('tags', [])
        self._perform_kubelet_check(instance_tags)
        self._report_node_metrics(instance_tags)
        self._report_pods_running(self.pod_list, instance_tags)
        self._report_container_spec_metrics(self.pod_list, instance_tags)
        self.process(self.metrics_url,
                     send_histograms_buckets=send_buckets,
                     instance=instance)
Example #16
    def check(self, instance):
        # Metrics collection
        endpoint = instance.get('prometheus_endpoint')
        if endpoint is None:
            raise CheckException(
                "Unable to find prometheus_endpoint in config file.")

        scraper_config = self.config_map[endpoint]
        custom_tags = instance.get('tags', [])

        try:
            self.process(scraper_config)
            self.service_check(self.PROMETHEUS_SERVICE_CHECK_NAME,
                               OpenMetricsBaseCheck.OK,
                               tags=custom_tags)
        except requests.exceptions.ConnectionError as e:
            # Unable to connect to the metrics endpoint
            self.service_check(
                self.PROMETHEUS_SERVICE_CHECK_NAME,
                OpenMetricsBaseCheck.CRITICAL,
                message="Unable to retrieve Prometheus metrics from endpoint "
                        "{}: {}".format(endpoint, e),
                tags=custom_tags,
            )

        # Service check to check whether the Runner can talk to the Gitlab master
        self._check_connectivity_to_master(instance, custom_tags)
Example #17
    def check(self, instance):
        # Metrics collection
        endpoint = instance.get('prometheus_endpoint')
        if endpoint is None:
            raise CheckException(
                "Unable to find prometheus_endpoint in config file.")

        # By default we send the buckets
        send_buckets = _is_affirmative(
            instance.get('send_histograms_buckets', True))
        custom_tags = instance.get('tags', [])

        try:
            self.process(endpoint,
                         send_histograms_buckets=send_buckets,
                         instance=instance)
            self.service_check(self.PROMETHEUS_SERVICE_CHECK_NAME,
                               PrometheusCheck.OK,
                               tags=custom_tags)
        except requests.exceptions.ConnectionError as e:
            # Unable to connect to the metrics endpoint
            self.service_check(
                self.PROMETHEUS_SERVICE_CHECK_NAME,
                PrometheusCheck.CRITICAL,
                message="Unable to retrieve Prometheus metrics from endpoint "
                        "{}: {}".format(endpoint, e.message),
                tags=custom_tags,
            )

        # Service check to check Gitlab's health endpoints
        for check_type in self.ALLOWED_SERVICE_CHECKS:
            self._check_health_endpoint(instance, check_type, custom_tags)
Example #18
    def check(self, instance):
        # Metrics collection
        endpoint = instance.get('prometheus_endpoint')
        if endpoint is None:
            raise CheckException(
                "Unable to find prometheus_endpoint in config file.")

        scraper_config = self.config_map[endpoint]
        custom_tags = instance.get('tags', [])

        try:
            self.process(scraper_config)
            self.service_check(self.PROMETHEUS_SERVICE_CHECK_NAME,
                               OpenMetricsBaseCheck.OK,
                               tags=custom_tags)
        except requests.exceptions.ConnectionError as e:
            # Unable to connect to the metrics endpoint
            self.service_check(
                self.PROMETHEUS_SERVICE_CHECK_NAME,
                OpenMetricsBaseCheck.CRITICAL,
                message="Unable to retrieve Prometheus metrics from endpoint "
                        "{}: {}".format(endpoint, e),
                tags=custom_tags,
            )

        # Service check to check Gitlab's health endpoints
        for check_type in self.ALLOWED_SERVICE_CHECKS:
            self._check_health_endpoint(instance, check_type, custom_tags)

        self.submit_version(instance)
Example #19
    def _check_health_endpoint(self, instance, check_type, tags):
        if check_type not in self.ALLOWED_SERVICE_CHECKS:
            raise CheckException(
                "Health endpoint {} is not a valid endpoint".format(
                    check_type))

        url = instance.get('gitlab_url')

        if url is None:
            # Simply ignore this service check if not configured
            self.log.debug(
                "gitlab_url not configured, service check %s skipped",
                check_type)
            return

        service_check_tags = self._service_check_tags(url)
        service_check_tags.extend(tags)
        # These define which endpoint is hit and which type of check is actually performed
        # TODO: parse errors and report for single sub-service failure?
        service_check_name = 'gitlab.{}'.format(check_type)
        check_url = '{}/-/{}'.format(url, check_type)

        try:
            self.log.debug("checking %s against %s", check_type, check_url)
            r = self.http.get(check_url)
            if r.status_code != 200:
                self.service_check(
                    service_check_name,
                    OpenMetricsBaseCheck.CRITICAL,
                    message="Got {} when hitting {}".format(
                        r.status_code, check_url),
                    tags=service_check_tags,
                )
                raise Exception("Http status code {} on check_url {}".format(
                    r.status_code, check_url))
            else:
                r.raise_for_status()

        except requests.exceptions.Timeout:
            # If there's a timeout
            self.service_check(
                service_check_name,
                OpenMetricsBaseCheck.CRITICAL,
                message="Timeout when hitting {}".format(check_url),
                tags=service_check_tags,
            )
            raise
        except Exception as e:
            self.service_check(
                service_check_name,
                OpenMetricsBaseCheck.CRITICAL,
                message="Error hitting {}. Error: {}".format(check_url, e),
                tags=service_check_tags,
            )
            raise
        else:
            self.service_check(service_check_name,
                               OpenMetricsBaseCheck.OK,
                               tags=service_check_tags)
        self.log.debug("gitlab check %s succeeded", check_type)

    def _get_json(self, url, timeout, verify, tags=None):
        tags = tags + ["url:%s" % url] if tags else ["url:%s" % url]
        msg = None
        status = None
        try:
            r = requests.get(url, timeout=timeout, verify=verify)
            if r.status_code != 200:
                status = AgentCheck.CRITICAL
                msg = "Got %s when hitting %s" % (r.status_code, url)
            else:
                status = AgentCheck.OK
                msg = "Mesos master instance detected at %s " % url
        except requests.exceptions.Timeout:
            # If there's a timeout
            msg = "%s seconds timeout when hitting %s" % (timeout, url)
            status = AgentCheck.CRITICAL
        except Exception as e:
            msg = str(e)
            status = AgentCheck.CRITICAL
        finally:
            self.log.debug('Request to url : {0}, timeout: {1}, message: {2}'.format(url, timeout, msg))
            if self.service_check_needed:
                self.service_check(self.SERVICE_CHECK_NAME, status, tags=tags, message=msg)
                self.service_check_needed = False
            if status is AgentCheck.CRITICAL:
                raise CheckException('Cannot connect to mesos. Error: {0}'.format(msg))

        if r.encoding is None:
            r.encoding = 'UTF8'

        return r.json()
Example #21
    def check(self, instance):
        """
        Process the istio_mesh, process_mixer, pilot, and galley endpoints
        associated with this instance.
        All the endpoints themselves are optional, but at least one must be passed.
        """
        processed = False
        # Get the config for the istio_mesh instance
        istio_mesh_endpoint = instance.get('istio_mesh_endpoint')
        if istio_mesh_endpoint:
            istio_mesh_config = self.config_map[istio_mesh_endpoint]

            # Process istio_mesh
            self.process(istio_mesh_config)
            processed = True

        # Get the config for the process_mixer instance
        process_mixer_endpoint = instance.get('mixer_endpoint')
        if process_mixer_endpoint:
            process_mixer_config = self.config_map[process_mixer_endpoint]

            # Process process_mixer
            self.process(process_mixer_config)
            processed = True

        # Get the config for the process_pilot instance
        process_pilot_endpoint = instance.get('pilot_endpoint')
        if process_pilot_endpoint:
            process_pilot_config = self.config_map[process_pilot_endpoint]

            # Process process_pilot
            self.process(process_pilot_config)
            processed = True

        # Get the config for the process_galley instance
        process_galley_endpoint = instance.get('galley_endpoint')
        if process_galley_endpoint:
            process_galley_config = self.config_map[process_galley_endpoint]

            # Process process_galley
            self.process(process_galley_config)
            processed = True

        # Get the config for the process_citadel instance
        process_citadel_endpoint = instance.get('citadel_endpoint')
        if process_citadel_endpoint:
            process_citadel_config = self.config_map[process_citadel_endpoint]

            # Process process_citadel
            self.process(process_citadel_config)
            processed = True

        # Check that at least 1 endpoint is configured
        if not processed:
            raise CheckException(
                "At least one of Mixer, Mesh, Pilot, Galley or Citadel endpoints must be configured"
            )
Example #22
    def _create_process_mixer_instance(self, instance):
        """
        Grab the mixer scraper from the dict and return it if it exists,
        otherwise create the scraper and add it to the dict
        """
        endpoint = instance.get('mixer_endpoint')
        if endpoint is None:
            raise CheckException("Unable to find mixer_endpoint in config file.")

        process_mixer_instance = deepcopy(instance)
        process_mixer_instance.update(
            {
                'namespace': self.MIXER_NAMESPACE,
                'prometheus_url': endpoint,
                'metrics': [
                    {
                        # Pre 1.1 metrics
                        'grpc_server_handled_total': 'grpc.server.handled_total',
                        'grpc_server_handling_seconds': 'grpc.server.handling_seconds',
                        'grpc_server_msg_received_total': 'grpc.server.msg_received_total',
                        'grpc_server_msg_sent_total': 'grpc.server.msg_sent_total',
                        'grpc_server_started_total': 'grpc.server.started_total',
                        'mixer_adapter_dispatch_count': 'adapter.dispatch_count',
                        'mixer_adapter_dispatch_duration': 'adapter.dispatch_duration',
                        'mixer_adapter_old_dispatch_count': 'adapter.old_dispatch_count',
                        'mixer_adapter_old_dispatch_duration': 'adapter.old_dispatch_duration',
                        'mixer_config_resolve_actions': 'config.resolve_actions',
                        'mixer_config_resolve_count': 'config.resolve_count',
                        'mixer_config_resolve_duration': 'config.resolve_duration',
                        'mixer_config_resolve_rules': 'config.resolve_rules',
                        # 1.1 metrics
                        'grpc_io_server_completed_rpcs': 'grpc_io_server.completed_rpcs',
                        'grpc_io_server_received_bytes_per_rpc': 'grpc_io_server.received_bytes_per_rpc',
                        'grpc_io_server_sent_bytes_per_rpc': 'grpc_io_server.sent_bytes_per_rpc',
                        'grpc_io_server_server_latency': 'grpc_io_server.server_latency',
                        'mixer_config_attributes_total': 'config.attributes_total',
                        'mixer_config_handler_configs_total': 'config.handler_configs_total',
                        'mixer_config_instance_configs_total': 'config.instance_configs_total',
                        'mixer_config_rule_configs_total': 'config.rule_configs_total',
                        'mixer_dispatcher_destinations_per_request': 'dispatcher.destinations_per_request',
                        'mixer_dispatcher_instances_per_request': 'dispatcher.instances_per_request',
                        'mixer_handler_daemons_total': 'handler.daemons_total',
                        'mixer_handler_new_handlers_total': 'handler.new_handlers_total',
                        'mixer_mcp_sink_reconnections': 'mcp_sink.reconnections',
                        'mixer_mcp_sink_request_acks_total': 'mcp_sink.request_acks_total',
                        'mixer_runtime_dispatches_total': 'runtime.dispatches_total',
                        'mixer_runtime_dispatch_duration_seconds': 'runtime.dispatch_duration_seconds',
                    }
                ],
                # Defaults that were set when istio was based on PrometheusCheck
                'send_monotonic_counter': instance.get('send_monotonic_counter', False),
                'health_service_check': instance.get('health_service_check', False),
            }
        )
        process_mixer_instance['metrics'][0].update(self._get_generic_metrics())

        return process_mixer_instance
Example #23
    def check(self, instance):
        if self.kube_apiserver_config is None:
            kube_apiserver_config = self._create_kube_apiserver_metrics_instance(instance)
            self.kube_apiserver_config = self.get_scraper_config(kube_apiserver_config)

        if not self.kube_apiserver_config['metrics_mapper']:
            url = self.kube_apiserver_config['prometheus_url']
            raise CheckException("You have to collect at least one metric from the endpoint: {}".format(url))
        self.process(self.kube_apiserver_config, metric_transformers=self.metric_transformers)
Example #24
    def check(self, instance):
        self.kubelet_conn_info = get_connection_info()
        endpoint = self.kubelet_conn_info.get('url')
        if endpoint is None:
            raise CheckException(
                "Unable to find metrics_endpoint in config "
                "file or detect the kubelet URL automatically.")

        self.metrics_url = instance.get(
            'metrics_endpoint', urljoin(endpoint, CADVISOR_METRICS_PATH))
        self.kube_health_url = urljoin(endpoint, KUBELET_HEALTH_PATH)
        self.node_spec_url = urljoin(endpoint, NODE_SPEC_PATH)
        self.pod_list_url = urljoin(endpoint, POD_LIST_PATH)

        # Legacy cadvisor support
        try:
            self.cadvisor_legacy_url = self.detect_cadvisor(
                endpoint, self.cadvisor_legacy_port)
        except Exception as e:
            self.log.debug(
                'cAdvisor not found, running in prometheus mode: %s' % str(e))

        # By default we send the buckets.
        send_buckets = instance.get('send_histograms_buckets', True)
        if send_buckets is not None and str(send_buckets).lower() == 'false':
            send_buckets = False
        else:
            send_buckets = True

        try:
            self.pod_list = self.retrieve_pod_list()
            if self.pod_list.get("items") is None:
                # Sanitize input: if no pod are running, 'items' is a NoneObject
                self.pod_list['items'] = []
        except Exception:
            self.pod_list = None

        self.container_filter = ContainerFilter(self.pod_list)

        self.instance_tags = instance.get('tags', [])
        self._perform_kubelet_check(self.instance_tags)
        self._report_node_metrics(self.instance_tags)
        self._report_pods_running(self.pod_list, self.instance_tags)
        self._report_container_spec_metrics(self.pod_list, self.instance_tags)

        if self.cadvisor_legacy_url:  # Legacy cAdvisor
            self.process_cadvisor(instance, self.cadvisor_legacy_url,
                                  self.pod_list, self.container_filter)
        elif self.metrics_url:  # Prometheus
            self.process(self.metrics_url,
                         send_histograms_buckets=send_buckets,
                         instance=instance)

        # Free up memory
        self.pod_list = None
        self.container_filter = None

    def _load_conf(self, instance):
        # Fetches the conf
        timeout = float(instance.get("timeout", 4))
        response_time = instance.get("collect_response_time", False)
        custom_tags = instance.get("tags", [])

        host = instance.get("host", None)
        if host is None:
            raise CheckException("A valid host must be specified")

        return host, custom_tags, timeout, response_time

    def __init__(self, name, init_config, agentConfig, instances=None):
        super(GitlabRunnerCheck, self).__init__(name, init_config, agentConfig, instances)
        # Mapping from Prometheus metrics names to Datadog ones
        # For now it's a 1:1 mapping
        # TODO: mark some metrics as rate
        allowed_metrics = init_config.get('allowed_metrics')

        if not allowed_metrics:
            raise CheckException("At least one metric must be whitelisted in `allowed_metrics`.")

        self.metrics_mapper = dict(zip(allowed_metrics, allowed_metrics))
        self.NAMESPACE = 'gitlab_runner'
Example #27
    def check(self, instance):
        host = instance.get('host')
        port = instance.get('port', '8080')
        path = instance.get('path', '/health')

        if not host:
            self.warning("Configuration error, please fix traefik.yaml")
            raise CheckException(
                "Configuration error, please fix traefik.yaml")

        try:
            url = 'http://{}:{}{}'.format(host, port, path)
            response = requests.get(url)
            response_status_code = response.status_code

            if response_status_code == 200:
                self.service_check('traefik.health', self.OK)

                payload = response.json()

                if 'total_status_code_count' in payload:
                    values = payload['total_status_code_count']

                    for status_code in values:
                        self.gauge('traefik.total_status_code_count',
                                   values[status_code],
                                   ['status_code:' + status_code])

                else:
                    self.log.warn(
                        'Field total_status_code_count not found in response.')

                if 'total_count' in payload:
                    self.gauge('traefik.total_count', payload['total_count'])
                else:
                    self.log.warn('Field total_count not found in response.')

            else:
                self.service_check(
                    'traefik.health',
                    self.CRITICAL,
                    message="Traefik health check return code is not 200")

        except requests.exceptions.ConnectionError:
            self.service_check('traefik.health',
                               self.CRITICAL,
                               message="Traefik endpoint unreachable")

        except Exception as e:
            self.service_check('traefik.health',
                               self.UNKNOWN,
                               message="UNKNOWN exception" + str(e))

    def check(self, instance):
        self.metric_count = 0
        self.services_up = 0

        instance_tags = instance.get('tags', [])
        consumer = instance.get('consumer')
        if not consumer:
            raise CheckException(
                "The consumer must be specified in the configuration.")
        url = self.URL + '?consumer=' + consumer
        try:
            json = self._get_metrics_json(url)
            if 'services' not in json:
                self.service_check(
                    self.METRICS_SERVICE_CHECK,
                    AgentCheck.WARNING,
                    tags=instance_tags,
                    message="No services in response from metrics proxy "
                            "on {}".format(url))
                return

            for service in json['services']:
                service_name = service['name']
                self._report_service_status(instance_tags, service_name,
                                            service)
                for metrics in service['metrics']:
                    self._emit_metrics(service_name, metrics, instance_tags)

            self.log.info("Forwarded {} metrics to hq for {} services".format(
                self.metric_count, self.services_up))
            self.service_check(
                self.METRICS_SERVICE_CHECK,
                AgentCheck.OK,
                tags=instance_tags,
                message="Metrics collected successfully for consumer "
                        "{}".format(consumer))
        except Timeout as e:
            self._report_metrics_error(
                "Timed out connecting to Vespa's node metrics api: {}".format(
                    e), AgentCheck.CRITICAL, instance_tags)
        except (HTTPError, InvalidURL, ConnectionError) as e:
            self._report_metrics_error(
                "Could not connect to Vespa's node metrics api: {}".format(e),
                AgentCheck.CRITICAL, instance_tags)
        except JSONDecodeError as e:
            self._report_metrics_error(
                "Error parsing JSON from Vespa's node metrics api: {}".format(
                    e), AgentCheck.CRITICAL, instance_tags)
        except Exception as e:
            self._report_metrics_error("Unexpected error: {}".format(e),
                                       AgentCheck.WARNING, instance_tags)
Example #29
    def check(self, instance):
        socket = instance.get('socket')
        server = instance.get('url')
        options = instance.get('options', {})
        username = instance.get('username')
        password = instance.get('password')

        if not server and not socket:
            raise InvalidConfigError(
                'Either "url" or "socket" must be configured')

        if socket:
            server = 'unix'
            port = socket
            connection_server = "{}".format(port)
        else:
            port = int(instance.get('port', self.DEFAULT_PORT))
            connection_server = "{}:{}".format(server, port)
        custom_tags = instance.get('tags') or []

        mc = None  # client
        tags = ["url:{0}:{1}".format(server, port)] + custom_tags
        service_check_tags = ["host:%s" % server,
                              "port:%s" % port] + custom_tags

        try:
            self.log.debug("Connecting to %s, tags:%s", connection_server,
                           tags)
            mc = bmemcached.Client(connection_server, username, password)

            self._get_metrics(mc, tags, service_check_tags)
            if options:
                # setting specific handlers
                self.OPTIONAL_STATS["items"][2] = Memcache.get_items_stats
                self.OPTIONAL_STATS["slabs"][2] = Memcache.get_slabs_stats
                self._get_optional_metrics(mc, tags, options)
        except BadResponseError as e:
            self.service_check(self.SERVICE_CHECK,
                               AgentCheck.CRITICAL,
                               tags=service_check_tags,
                               message="Unable to fetch stats from server")
            raise CheckException(
                "Unable to retrieve stats from memcache instance: {}:{}."
                "Please check your configuration. ({})".format(
                    server, port, e))

        if mc is not None:
            mc.disconnect_all()
            self.log.debug("Disconnected from memcached")
        del mc
Example #30
    def check(self, instance):
        endpoint = instance.get('prometheus_endpoint')
        if endpoint is None:
            raise CheckException("Unable to find prometheus_endpoint in config file.")

        self.set_prometheus_timeout(instance)

        send_buckets = instance.get('send_histograms_buckets', True)
        # By default we send the buckets.
        if send_buckets is not None and str(send_buckets).lower() == 'false':
            send_buckets = False
        else:
            send_buckets = True

        self.process(endpoint, send_histograms_buckets=send_buckets, instance=instance)
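Taken together, the examples share one pattern: validate configuration early in check() and raise CheckException so the Agent reports the run as failed. A minimal sketch of that pattern, assuming the datadog-checks-base import paths; the class and metric names are hypothetical.

    from datadog_checks.base import AgentCheck
    from datadog_checks.errors import CheckException

    class MyCheck(AgentCheck):
        def check(self, instance):
            url = instance.get('url')
            if not url:
                # Raising CheckException marks this check run as failed
                raise CheckException("Configuration error: `url` is required")
            self.gauge('mycheck.up', 1, tags=instance.get('tags', []))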