Beispiel #1
0
    def save_event(self, data):
        if self.event_dir is None:
            logger.warn("Cannot save event -- Event reporter is not initialized.")
            return

        try:
            fileutil.mkdir(self.event_dir, mode=0o700)
        except (IOError, OSError) as e:
            msg = "Failed to create events folder {0}. Error: {1}".format(self.event_dir, ustr(e))
            raise EventError(msg)

        try:
            existing_events = os.listdir(self.event_dir)
            if len(existing_events) >= MAX_NUMBER_OF_EVENTS:
                logger.periodic_warn(logger.EVERY_MINUTE, "[PERIODIC] Too many files under: {0}, current count:  {1}, "
                                                          "removing oldest event files".format(self.event_dir,
                                                                                               len(existing_events)))
                existing_events.sort()
                oldest_files = existing_events[:-999]
                for event_file in oldest_files:
                    os.remove(os.path.join(self.event_dir, event_file))
        except (IOError, OSError) as e:
            msg = "Failed to remove old events from events folder {0}. Error: {1}".format(self.event_dir, ustr(e))
            raise EventError(msg)

        filename = os.path.join(self.event_dir,
                                ustr(int(time.time() * 1000000)))
        try:
            with open(filename + ".tmp", 'wb+') as hfile:
                hfile.write(data.encode("utf-8"))
            os.rename(filename + ".tmp", filename + AGENT_EVENT_FILE_EXTENSION)
        except (IOError, OSError) as e:
            msg = "Failed to write events to file: {0}".format(e)
            raise EventError(msg)
Beispiel #2
0
    def image_origin(self):
        """
        An integer value describing the origin of the image.

          0 -> unknown
          1 -> custom - user created image
          2 -> endorsed - See https://docs.microsoft.com/en-us/azure/virtual-machines/linux/endorsed-distros
          3 -> platform - non-endorsed image that is available in the Azure Marketplace.
        """

        try:
            if self.publisher == "":
                return IMDS_IMAGE_ORIGIN_CUSTOM

            if ComputeInfo.__matcher.is_match(self.publisher, self.offer,
                                              self.sku, self.version):
                return IMDS_IMAGE_ORIGIN_ENDORSED
            else:
                return IMDS_IMAGE_ORIGIN_PLATFORM

        except Exception as e:
            logger.periodic_warn(
                logger.EVERY_FIFTEEN_MINUTES,
                "[PERIODIC] Could not determine the image origin from IMDS: {0}"
                .format(ustr(e)))
            return IMDS_IMAGE_ORIGIN_UNKNOWN
Beispiel #3
0
    def _get_metadata_from_endpoint(self, endpoint, resource_path, headers):
        """
        Get metadata from one of the IMDS endpoints.

        :param str endpoint: IMDS endpoint to call
        :param str resource_path: path of IMDS resource
        :param bool headers: headers to send in the request
        :return: Tuple<status:int, response:str>
            status: one of the following response status codes: IMDS_RESPONSE_SUCCESS, IMDS_RESPONSE_ERROR,
                    IMDS_CONNECTION_ERROR, IMDS_INTERNAL_SERVER_ERROR
            response: IMDS response on IMDS_RESPONSE_SUCCESS, failure message otherwise
        """
        try:
            resp = self._http_get(endpoint=endpoint, resource_path=resource_path, headers=headers)
        except ResourceGoneError:
            return IMDS_INTERNAL_SERVER_ERROR, "IMDS error in /metadata/{0}: HTTP Failed with Status Code 410: Gone".format(resource_path)
        except HttpError as e:
            msg = str(e)
            if self._regex_throttled.match(msg):
                return IMDS_RESPONSE_ERROR, "IMDS error in /metadata/{0}: Throttled".format(resource_path)
            if self._regex_ioerror.match(msg):
                logger.periodic_warn(logger.EVERY_FIFTEEN_MINUTES,
                                     "[PERIODIC] [IMDS_CONNECTION_ERROR] Unable to connect to IMDS endpoint {0}".format(endpoint))
                return IMDS_CONNECTION_ERROR, "IMDS error in /metadata/{0}: Unable to connect to endpoint".format(resource_path)
            return IMDS_INTERNAL_SERVER_ERROR, "IMDS error in /metadata/{0}: {1}".format(resource_path, msg)

        if resp.status >= 500:
            return IMDS_INTERNAL_SERVER_ERROR, "IMDS error in /metadata/{0}: {1}".format(
                                               resource_path, restutil.read_response_error(resp))

        if restutil.request_failed(resp):
            return IMDS_RESPONSE_ERROR, "IMDS error in /metadata/{0}: {1}".format(
                                        resource_path, restutil.read_response_error(resp))

        return IMDS_RESPONSE_SUCCESS, resp.read()
Beispiel #4
0
def report_metric(category,
                  counter,
                  instance,
                  value,
                  log_event=False,
                  reporter=__event_logger__):
    """
    Send a telemetry event reporting a single instance of a performance counter.
    :param str category: The category of the metric (cpu, memory, etc)
    :param str counter: The name of the metric ("%idle", etc)
    :param str instance: For instanced metrics, the identifier of the instance. E.g. a disk drive name, a cpu core#
    :param     value: The value of the metric
    :param bool log_event: If True, log the metric in the agent log as well
    :param EventLogger reporter: The EventLogger instance to which metric events should be sent
    """
    if reporter.event_dir is None:
        from azurelinuxagent.common.version import AGENT_NAME
        logger.warn(
            "Cannot report metric event -- Event reporter is not initialized.")
        message = "Metric {0}/{1} [{2}] = {3}".format(category, counter,
                                                      instance, value)
        _log_event(AGENT_NAME, "METRIC", message, 0)
        return
    try:
        reporter.add_metric(category, counter, instance, float(value),
                            log_event)
    except ValueError:
        logger.periodic_warn(
            logger.EVERY_HALF_HOUR,
            "[PERIODIC] Cannot cast the metric value. Details of the Metric - "
            "{0}/{1} [{2}] = {3}".format(category, counter, instance, value))
    def poll_all_tracked():
        metrics = []

        with CGroupsTelemetry._rlock:
            for cgroup in CGroupsTelemetry._tracked[:]:
                try:
                    metrics.extend(
                        cgroup.get_tracked_metrics(
                            track_throttled_time=CGroupsTelemetry.
                            _track_throttled_time))
                except Exception as e:
                    # There can be scenarios when the CGroup has been deleted by the time we are fetching the values
                    # from it. This would raise IOError with file entry not found (ERRNO: 2). We do not want to log
                    # every occurrences of such case as it would be very verbose. We do want to log all the other
                    # exceptions which could occur, which is why we do a periodic log for all the other errors.
                    if not isinstance(
                            e, (IOError, OSError)) or e.errno != errno.ENOENT:  # pylint: disable=E1101
                        logger.periodic_warn(
                            logger.EVERY_HOUR,
                            '[PERIODIC] Could not collect metrics for cgroup '
                            '{0}. Error : {1}'.format(cgroup.name, ustr(e)))
                if not cgroup.is_active():
                    CGroupsTelemetry.stop_tracking(cgroup)

        return metrics
Beispiel #6
0
    def test_periodic_does_not_emit_if_previously_sent(self, mock_info, mock_error, mock_warn, mock_verbose):
        # The count does not increase from 1 - the first time it sends the data.
        logger.periodic_info(logger.EVERY_DAY, _MSG_INFO, *_DATA)
        self.assertIn(hash(_MSG_INFO), logger.DEFAULT_LOGGER.periodic_messages)
        self.assertEqual(1, mock_info.call_count)

        logger.periodic_info(logger.EVERY_DAY, _MSG_INFO, *_DATA)
        self.assertIn(hash(_MSG_INFO), logger.DEFAULT_LOGGER.periodic_messages)
        self.assertEqual(1, mock_info.call_count)

        logger.periodic_warn(logger.EVERY_DAY, _MSG_WARN, *_DATA)
        self.assertIn(hash(_MSG_WARN), logger.DEFAULT_LOGGER.periodic_messages)
        self.assertEqual(1, mock_warn.call_count)

        logger.periodic_warn(logger.EVERY_DAY, _MSG_WARN, *_DATA)
        self.assertIn(hash(_MSG_WARN), logger.DEFAULT_LOGGER.periodic_messages)
        self.assertEqual(1, mock_warn.call_count)

        logger.periodic_error(logger.EVERY_DAY, _MSG_ERROR, *_DATA)
        self.assertIn(hash(_MSG_ERROR), logger.DEFAULT_LOGGER.periodic_messages)
        self.assertEqual(1, mock_error.call_count)

        logger.periodic_error(logger.EVERY_DAY, _MSG_ERROR, *_DATA)
        self.assertIn(hash(_MSG_ERROR), logger.DEFAULT_LOGGER.periodic_messages)
        self.assertEqual(1, mock_error.call_count)

        logger.periodic_verbose(logger.EVERY_DAY, _MSG_VERBOSE, *_DATA)
        self.assertIn(hash(_MSG_VERBOSE), logger.DEFAULT_LOGGER.periodic_messages)
        self.assertEqual(1, mock_verbose.call_count)

        logger.periodic_verbose(logger.EVERY_DAY, _MSG_VERBOSE, *_DATA)
        self.assertIn(hash(_MSG_VERBOSE), logger.DEFAULT_LOGGER.periodic_messages)
        self.assertEqual(1, mock_verbose.call_count)

        self.assertEqual(4, len(logger.DEFAULT_LOGGER.periodic_messages))
    def poll_all_tracked():
        with CGroupsTelemetry._rlock:
            for cgroup in CGroupsTelemetry._tracked[:]:

                if cgroup.name not in CGroupsTelemetry._cgroup_metrics:
                    CGroupsTelemetry._cgroup_metrics[
                        cgroup.name] = CgroupMetrics()

                metric = None
                # noinspection PyBroadException
                try:
                    metric = cgroup.collect()
                except Exception as e:
                    if isinstance(
                            e, (IOError, OSError)) and e.errno == errno.ENOENT:
                        pass
                    else:
                        logger.periodic_warn(
                            logger.EVERY_HALF_HOUR,
                            'Could not collect the cgroup metrics for cgroup path {0}. '
                            'Internal error : {1}'.format(
                                cgroup.path, ustr(e)))
                if metric:
                    CGroupsTelemetry._cgroup_metrics[cgroup.name].add_new_data(
                        cgroup.controller, metric)

                if not cgroup.is_active():
                    CGroupsTelemetry.stop_tracking(cgroup)
                    CGroupsTelemetry._cgroup_metrics[
                        cgroup.name].marked_for_delete = True
Beispiel #8
0
    def test_periodic_emits_if_not_previously_sent(self, mock_info, mock_error, mock_warn, mock_verbose):
        logger.periodic_info(logger.EVERY_DAY, _MSG_INFO, logger.LogLevel.INFO, *_DATA)
        self.assertEqual(1, mock_info.call_count)

        logger.periodic_error(logger.EVERY_DAY, _MSG_ERROR, logger.LogLevel.ERROR, *_DATA)
        self.assertEqual(1, mock_error.call_count)

        logger.periodic_warn(logger.EVERY_DAY, _MSG_WARN, logger.LogLevel.WARNING, *_DATA)
        self.assertEqual(1, mock_warn.call_count)

        logger.periodic_verbose(logger.EVERY_DAY, _MSG_VERBOSE, logger.LogLevel.VERBOSE, *_DATA)
        self.assertEqual(1, mock_verbose.call_count)
Beispiel #9
0
    def test_periodic_forwards_message_and_args(self, mock_info, mock_error, mock_warn, mock_verbose):
        logger.periodic_info(logger.EVERY_DAY, _MSG_INFO, *_DATA)
        mock_info.assert_called_once_with(_MSG_INFO, *_DATA)

        logger.periodic_error(logger.EVERY_DAY, _MSG_ERROR, *_DATA)
        mock_error.assert_called_once_with(_MSG_ERROR, *_DATA)

        logger.periodic_warn(logger.EVERY_DAY, _MSG_WARN, *_DATA)
        mock_warn.assert_called_once_with(_MSG_WARN, *_DATA)

        logger.periodic_verbose(logger.EVERY_DAY, _MSG_VERBOSE, *_DATA)
        mock_verbose.assert_called_once_with(_MSG_VERBOSE, *_DATA)
Beispiel #10
0
 def collect_data(self, cgroup):
     # noinspection PyBroadException
     try:
         if cgroup.controller == "cpu":
             self._cpu_usage.append(cgroup.get_cpu_usage())
         elif cgroup.controller == "memory":
             self._memory_usage.append(cgroup.get_memory_usage())
             self._max_memory_usage.append(cgroup.get_max_memory_usage())
         else:
             raise CGroupsException(
                 'CGroup controller {0} is not supported'.format(
                     controller))
     except Exception as e:
         if not isinstance(e,
                           (IOError, OSError)) or e.errno != errno.ENOENT:
             logger.periodic_warn(
                 logger.EVERY_HALF_HOUR,
                 'Could not collect metrics for cgroup {0}. Error : {1}'.
                 format(cgroup.path, ustr(e)))
Beispiel #11
0
 def is_active(self):
     try:
         tasks = self._get_parameters("tasks")
         if tasks:
             return len(tasks) != 0  # pylint: disable=len-as-condition
     except (IOError, OSError) as e:  # pylint: disable=C0103
         if e.errno == errno.ENOENT:
             # only suppressing file not found exceptions.
             pass
         else:
             logger.periodic_warn(
                 logger.EVERY_HALF_HOUR,
                 'Could not get list of tasks from "tasks" file in the cgroup: {0}.'
                 ' Internal error: {1}'.format(self.path, ustr(e)))
     except CGroupsException as e:  # pylint: disable=C0103
         logger.periodic_warn(
             logger.EVERY_HALF_HOUR,
             'Could not get list of tasks from "tasks" file in the cgroup: {0}.'
             ' Internal error: {1}'.format(self.path, ustr(e)))
     return False
Beispiel #12
0
 def get_tracked_processes(self):
     """
     :return: List of Str (Pids). Will return an empty string if we couldn't fetch any tracked processes.
     """
     procs = []
     try:
         procs = self._get_parameters("cgroup.procs")
     except (IOError, OSError) as e:  # pylint: disable=C0103
         if e.errno == errno.ENOENT:
             # only suppressing file not found exceptions.
             pass
         else:
             logger.periodic_warn(
                 logger.EVERY_HALF_HOUR,
                 'Could not get list of procs from "cgroup.procs" file in the cgroup: {0}.'
                 ' Internal error: {1}'.format(self.path, ustr(e)))
     except CGroupsException as e:  # pylint: disable=C0103
         logger.periodic_warn(
             logger.EVERY_HALF_HOUR,
             'Could not get list of tasks from "cgroup.procs" file in the cgroup: {0}.'
             ' Internal error: {1}'.format(self.path, ustr(e)))
     return procs
Beispiel #13
0
    def test_periodic_emits_after_elapsed_delta(self, mock_info, mock_error,
                                                mock_warn, mock_verbose):
        logger.periodic_info(logger.EVERY_DAY, _MSG_INFO, *_DATA)
        self.assertEqual(1, mock_info.call_count)

        logger.periodic_info(logger.EVERY_DAY, _MSG_INFO, *_DATA)
        self.assertEqual(1, mock_info.call_count)

        logger.DEFAULT_LOGGER.periodic_messages[hash(_MSG_INFO)] = datetime.now() - \
                                                                   logger.EVERY_DAY - logger.EVERY_HOUR
        logger.periodic_info(logger.EVERY_DAY, _MSG_INFO, *_DATA)
        self.assertEqual(2, mock_info.call_count)

        logger.periodic_warn(logger.EVERY_DAY, _MSG_WARN, *_DATA)
        self.assertEqual(1, mock_warn.call_count)
        logger.periodic_warn(logger.EVERY_DAY, _MSG_WARN, *_DATA)
        self.assertEqual(1, mock_warn.call_count)

        logger.DEFAULT_LOGGER.periodic_messages[hash(_MSG_WARN)] = datetime.now() - \
                                                                   logger.EVERY_DAY - logger.EVERY_HOUR
        logger.periodic_warn(logger.EVERY_DAY, _MSG_WARN, *_DATA)
        self.assertEqual(2, mock_info.call_count)

        logger.periodic_error(logger.EVERY_DAY, _MSG_ERROR, *_DATA)
        self.assertEqual(1, mock_error.call_count)
        logger.periodic_error(logger.EVERY_DAY, _MSG_ERROR, *_DATA)
        self.assertEqual(1, mock_error.call_count)

        logger.DEFAULT_LOGGER.periodic_messages[hash(_MSG_ERROR)] = datetime.now() - \
                                                                    logger.EVERY_DAY - logger.EVERY_HOUR
        logger.periodic_error(logger.EVERY_DAY, _MSG_ERROR, *_DATA)
        self.assertEqual(2, mock_info.call_count)

        logger.periodic_verbose(logger.EVERY_DAY, _MSG_VERBOSE, *_DATA)
        self.assertEqual(1, mock_verbose.call_count)
        logger.periodic_verbose(logger.EVERY_DAY, _MSG_VERBOSE, *_DATA)
        self.assertEqual(1, mock_verbose.call_count)

        logger.DEFAULT_LOGGER.periodic_messages[hash(_MSG_VERBOSE)] = datetime.now() - \
                                                                      logger.EVERY_DAY - logger.EVERY_HOUR
        logger.periodic_verbose(logger.EVERY_DAY, _MSG_VERBOSE, *_DATA)
        self.assertEqual(2, mock_info.call_count)
Beispiel #14
0
    def poll_all_tracked():
        metrics = []

        with CGroupsTelemetry._rlock:
            for cgroup in CGroupsTelemetry._tracked[:]:
                if cgroup.name not in CGroupsTelemetry._cgroup_metrics:
                    CGroupsTelemetry._cgroup_metrics[
                        cgroup.name] = CgroupMetrics()
                try:
                    if cgroup.controller == CGroupContollers.CPU:
                        current_cpu_usage = cgroup.get_cpu_usage()
                        CGroupsTelemetry._cgroup_metrics[
                            cgroup.name].add_cpu_usage(current_cpu_usage)
                        metrics.append(
                            MetricValue(MetricsCategory.PROCESS_CATEGORY,
                                        MetricsCounter.PROCESSOR_PERCENT_TIME,
                                        cgroup.name, current_cpu_usage))
                    elif cgroup.controller == CGroupContollers.MEMORY:
                        current_memory_usage = cgroup.get_memory_usage()
                        CGroupsTelemetry._cgroup_metrics[
                            cgroup.name].add_memory_usage(current_memory_usage)
                        metrics.append(
                            MetricValue(MetricsCategory.MEMORY_CATEGORY,
                                        MetricsCounter.TOTAL_MEM_USAGE,
                                        cgroup.name, current_memory_usage))

                        max_memory_usage = cgroup.get_max_memory_usage()
                        CGroupsTelemetry._cgroup_metrics[
                            cgroup.name].add_max_memory_usage(max_memory_usage)
                        metrics.append(
                            MetricValue(MetricsCategory.MEMORY_CATEGORY,
                                        MetricsCounter.MAX_MEM_USAGE,
                                        cgroup.name, max_memory_usage))

                        pids = cgroup.get_tracked_processes()
                        for pid in pids:
                            try:
                                mem_usage_from_procstatm = MemoryResourceUsage.get_memory_usage_from_proc_statm(
                                    pid)
                                metrics.append(
                                    MetricValue(
                                        MetricsCategory.MEMORY_CATEGORY,
                                        MetricsCounter.MEM_USED_BY_PROCESS,
                                        CGroupsTelemetry.
                                        get_process_info_summary(pid),
                                        mem_usage_from_procstatm))
                                CGroupsTelemetry._cgroup_metrics[
                                    cgroup.name].add_proc_statm_memory(
                                        CGroupsTelemetry.
                                        get_process_info_summary(pid),
                                        mem_usage_from_procstatm)
                            except Exception as e:
                                if not isinstance(
                                        e,
                                    (IOError,
                                     OSError)) or e.errno != errno.ENOENT:
                                    logger.periodic_warn(
                                        logger.EVERY_HOUR,
                                        "[PERIODIC] Could not collect proc_statm "
                                        "for pid {0}. Error : {1}", pid,
                                        ustr(e))
                    else:
                        raise CGroupsException(
                            'CGroup controller {0} is not supported for cgroup {1}'
                            .format(cgroup.controller, cgroup.name))
                except Exception as e:
                    # There can be scenarios when the CGroup has been deleted by the time we are fetching the values
                    # from it. This would raise IOError with file entry not found (ERRNO: 2). We do not want to log
                    # every occurrences of such case as it would be very verbose. We do want to log all the other
                    # exceptions which could occur, which is why we do a periodic log for all the other errors.
                    if not isinstance(
                            e, (IOError, OSError)) or e.errno != errno.ENOENT:
                        logger.periodic_warn(
                            logger.EVERY_HOUR,
                            '[PERIODIC] Could not collect metrics for cgroup '
                            '{0}. Error : {1}'.format(cgroup.name, ustr(e)))
                if not cgroup.is_active():
                    CGroupsTelemetry.stop_tracking(cgroup)
                    CGroupsTelemetry._cgroup_metrics[
                        cgroup.name].marked_for_delete = True

        return metrics