def save_event(self, data): if self.event_dir is None: logger.warn("Cannot save event -- Event reporter is not initialized.") return try: fileutil.mkdir(self.event_dir, mode=0o700) except (IOError, OSError) as e: msg = "Failed to create events folder {0}. Error: {1}".format(self.event_dir, ustr(e)) raise EventError(msg) try: existing_events = os.listdir(self.event_dir) if len(existing_events) >= MAX_NUMBER_OF_EVENTS: logger.periodic_warn(logger.EVERY_MINUTE, "[PERIODIC] Too many files under: {0}, current count: {1}, " "removing oldest event files".format(self.event_dir, len(existing_events))) existing_events.sort() oldest_files = existing_events[:-999] for event_file in oldest_files: os.remove(os.path.join(self.event_dir, event_file)) except (IOError, OSError) as e: msg = "Failed to remove old events from events folder {0}. Error: {1}".format(self.event_dir, ustr(e)) raise EventError(msg) filename = os.path.join(self.event_dir, ustr(int(time.time() * 1000000))) try: with open(filename + ".tmp", 'wb+') as hfile: hfile.write(data.encode("utf-8")) os.rename(filename + ".tmp", filename + AGENT_EVENT_FILE_EXTENSION) except (IOError, OSError) as e: msg = "Failed to write events to file: {0}".format(e) raise EventError(msg)
def image_origin(self): """ An integer value describing the origin of the image. 0 -> unknown 1 -> custom - user created image 2 -> endorsed - See https://docs.microsoft.com/en-us/azure/virtual-machines/linux/endorsed-distros 3 -> platform - non-endorsed image that is available in the Azure Marketplace. """ try: if self.publisher == "": return IMDS_IMAGE_ORIGIN_CUSTOM if ComputeInfo.__matcher.is_match(self.publisher, self.offer, self.sku, self.version): return IMDS_IMAGE_ORIGIN_ENDORSED else: return IMDS_IMAGE_ORIGIN_PLATFORM except Exception as e: logger.periodic_warn( logger.EVERY_FIFTEEN_MINUTES, "[PERIODIC] Could not determine the image origin from IMDS: {0}" .format(ustr(e))) return IMDS_IMAGE_ORIGIN_UNKNOWN
def _get_metadata_from_endpoint(self, endpoint, resource_path, headers): """ Get metadata from one of the IMDS endpoints. :param str endpoint: IMDS endpoint to call :param str resource_path: path of IMDS resource :param bool headers: headers to send in the request :return: Tuple<status:int, response:str> status: one of the following response status codes: IMDS_RESPONSE_SUCCESS, IMDS_RESPONSE_ERROR, IMDS_CONNECTION_ERROR, IMDS_INTERNAL_SERVER_ERROR response: IMDS response on IMDS_RESPONSE_SUCCESS, failure message otherwise """ try: resp = self._http_get(endpoint=endpoint, resource_path=resource_path, headers=headers) except ResourceGoneError: return IMDS_INTERNAL_SERVER_ERROR, "IMDS error in /metadata/{0}: HTTP Failed with Status Code 410: Gone".format(resource_path) except HttpError as e: msg = str(e) if self._regex_throttled.match(msg): return IMDS_RESPONSE_ERROR, "IMDS error in /metadata/{0}: Throttled".format(resource_path) if self._regex_ioerror.match(msg): logger.periodic_warn(logger.EVERY_FIFTEEN_MINUTES, "[PERIODIC] [IMDS_CONNECTION_ERROR] Unable to connect to IMDS endpoint {0}".format(endpoint)) return IMDS_CONNECTION_ERROR, "IMDS error in /metadata/{0}: Unable to connect to endpoint".format(resource_path) return IMDS_INTERNAL_SERVER_ERROR, "IMDS error in /metadata/{0}: {1}".format(resource_path, msg) if resp.status >= 500: return IMDS_INTERNAL_SERVER_ERROR, "IMDS error in /metadata/{0}: {1}".format( resource_path, restutil.read_response_error(resp)) if restutil.request_failed(resp): return IMDS_RESPONSE_ERROR, "IMDS error in /metadata/{0}: {1}".format( resource_path, restutil.read_response_error(resp)) return IMDS_RESPONSE_SUCCESS, resp.read()
def report_metric(category, counter, instance, value, log_event=False, reporter=__event_logger__): """ Send a telemetry event reporting a single instance of a performance counter. :param str category: The category of the metric (cpu, memory, etc) :param str counter: The name of the metric ("%idle", etc) :param str instance: For instanced metrics, the identifier of the instance. E.g. a disk drive name, a cpu core# :param value: The value of the metric :param bool log_event: If True, log the metric in the agent log as well :param EventLogger reporter: The EventLogger instance to which metric events should be sent """ if reporter.event_dir is None: from azurelinuxagent.common.version import AGENT_NAME logger.warn( "Cannot report metric event -- Event reporter is not initialized.") message = "Metric {0}/{1} [{2}] = {3}".format(category, counter, instance, value) _log_event(AGENT_NAME, "METRIC", message, 0) return try: reporter.add_metric(category, counter, instance, float(value), log_event) except ValueError: logger.periodic_warn( logger.EVERY_HALF_HOUR, "[PERIODIC] Cannot cast the metric value. Details of the Metric - " "{0}/{1} [{2}] = {3}".format(category, counter, instance, value))
def poll_all_tracked(): metrics = [] with CGroupsTelemetry._rlock: for cgroup in CGroupsTelemetry._tracked[:]: try: metrics.extend( cgroup.get_tracked_metrics( track_throttled_time=CGroupsTelemetry. _track_throttled_time)) except Exception as e: # There can be scenarios when the CGroup has been deleted by the time we are fetching the values # from it. This would raise IOError with file entry not found (ERRNO: 2). We do not want to log # every occurrences of such case as it would be very verbose. We do want to log all the other # exceptions which could occur, which is why we do a periodic log for all the other errors. if not isinstance( e, (IOError, OSError)) or e.errno != errno.ENOENT: # pylint: disable=E1101 logger.periodic_warn( logger.EVERY_HOUR, '[PERIODIC] Could not collect metrics for cgroup ' '{0}. Error : {1}'.format(cgroup.name, ustr(e))) if not cgroup.is_active(): CGroupsTelemetry.stop_tracking(cgroup) return metrics
def test_periodic_does_not_emit_if_previously_sent(self, mock_info, mock_error, mock_warn, mock_verbose): # The count does not increase from 1 - the first time it sends the data. logger.periodic_info(logger.EVERY_DAY, _MSG_INFO, *_DATA) self.assertIn(hash(_MSG_INFO), logger.DEFAULT_LOGGER.periodic_messages) self.assertEqual(1, mock_info.call_count) logger.periodic_info(logger.EVERY_DAY, _MSG_INFO, *_DATA) self.assertIn(hash(_MSG_INFO), logger.DEFAULT_LOGGER.periodic_messages) self.assertEqual(1, mock_info.call_count) logger.periodic_warn(logger.EVERY_DAY, _MSG_WARN, *_DATA) self.assertIn(hash(_MSG_WARN), logger.DEFAULT_LOGGER.periodic_messages) self.assertEqual(1, mock_warn.call_count) logger.periodic_warn(logger.EVERY_DAY, _MSG_WARN, *_DATA) self.assertIn(hash(_MSG_WARN), logger.DEFAULT_LOGGER.periodic_messages) self.assertEqual(1, mock_warn.call_count) logger.periodic_error(logger.EVERY_DAY, _MSG_ERROR, *_DATA) self.assertIn(hash(_MSG_ERROR), logger.DEFAULT_LOGGER.periodic_messages) self.assertEqual(1, mock_error.call_count) logger.periodic_error(logger.EVERY_DAY, _MSG_ERROR, *_DATA) self.assertIn(hash(_MSG_ERROR), logger.DEFAULT_LOGGER.periodic_messages) self.assertEqual(1, mock_error.call_count) logger.periodic_verbose(logger.EVERY_DAY, _MSG_VERBOSE, *_DATA) self.assertIn(hash(_MSG_VERBOSE), logger.DEFAULT_LOGGER.periodic_messages) self.assertEqual(1, mock_verbose.call_count) logger.periodic_verbose(logger.EVERY_DAY, _MSG_VERBOSE, *_DATA) self.assertIn(hash(_MSG_VERBOSE), logger.DEFAULT_LOGGER.periodic_messages) self.assertEqual(1, mock_verbose.call_count) self.assertEqual(4, len(logger.DEFAULT_LOGGER.periodic_messages))
def poll_all_tracked(): with CGroupsTelemetry._rlock: for cgroup in CGroupsTelemetry._tracked[:]: if cgroup.name not in CGroupsTelemetry._cgroup_metrics: CGroupsTelemetry._cgroup_metrics[ cgroup.name] = CgroupMetrics() metric = None # noinspection PyBroadException try: metric = cgroup.collect() except Exception as e: if isinstance( e, (IOError, OSError)) and e.errno == errno.ENOENT: pass else: logger.periodic_warn( logger.EVERY_HALF_HOUR, 'Could not collect the cgroup metrics for cgroup path {0}. ' 'Internal error : {1}'.format( cgroup.path, ustr(e))) if metric: CGroupsTelemetry._cgroup_metrics[cgroup.name].add_new_data( cgroup.controller, metric) if not cgroup.is_active(): CGroupsTelemetry.stop_tracking(cgroup) CGroupsTelemetry._cgroup_metrics[ cgroup.name].marked_for_delete = True
def test_periodic_emits_if_not_previously_sent(self, mock_info, mock_error, mock_warn, mock_verbose): logger.periodic_info(logger.EVERY_DAY, _MSG_INFO, logger.LogLevel.INFO, *_DATA) self.assertEqual(1, mock_info.call_count) logger.periodic_error(logger.EVERY_DAY, _MSG_ERROR, logger.LogLevel.ERROR, *_DATA) self.assertEqual(1, mock_error.call_count) logger.periodic_warn(logger.EVERY_DAY, _MSG_WARN, logger.LogLevel.WARNING, *_DATA) self.assertEqual(1, mock_warn.call_count) logger.periodic_verbose(logger.EVERY_DAY, _MSG_VERBOSE, logger.LogLevel.VERBOSE, *_DATA) self.assertEqual(1, mock_verbose.call_count)
def test_periodic_forwards_message_and_args(self, mock_info, mock_error, mock_warn, mock_verbose): logger.periodic_info(logger.EVERY_DAY, _MSG_INFO, *_DATA) mock_info.assert_called_once_with(_MSG_INFO, *_DATA) logger.periodic_error(logger.EVERY_DAY, _MSG_ERROR, *_DATA) mock_error.assert_called_once_with(_MSG_ERROR, *_DATA) logger.periodic_warn(logger.EVERY_DAY, _MSG_WARN, *_DATA) mock_warn.assert_called_once_with(_MSG_WARN, *_DATA) logger.periodic_verbose(logger.EVERY_DAY, _MSG_VERBOSE, *_DATA) mock_verbose.assert_called_once_with(_MSG_VERBOSE, *_DATA)
def collect_data(self, cgroup): # noinspection PyBroadException try: if cgroup.controller == "cpu": self._cpu_usage.append(cgroup.get_cpu_usage()) elif cgroup.controller == "memory": self._memory_usage.append(cgroup.get_memory_usage()) self._max_memory_usage.append(cgroup.get_max_memory_usage()) else: raise CGroupsException( 'CGroup controller {0} is not supported'.format( controller)) except Exception as e: if not isinstance(e, (IOError, OSError)) or e.errno != errno.ENOENT: logger.periodic_warn( logger.EVERY_HALF_HOUR, 'Could not collect metrics for cgroup {0}. Error : {1}'. format(cgroup.path, ustr(e)))
def is_active(self): try: tasks = self._get_parameters("tasks") if tasks: return len(tasks) != 0 # pylint: disable=len-as-condition except (IOError, OSError) as e: # pylint: disable=C0103 if e.errno == errno.ENOENT: # only suppressing file not found exceptions. pass else: logger.periodic_warn( logger.EVERY_HALF_HOUR, 'Could not get list of tasks from "tasks" file in the cgroup: {0}.' ' Internal error: {1}'.format(self.path, ustr(e))) except CGroupsException as e: # pylint: disable=C0103 logger.periodic_warn( logger.EVERY_HALF_HOUR, 'Could not get list of tasks from "tasks" file in the cgroup: {0}.' ' Internal error: {1}'.format(self.path, ustr(e))) return False
def get_tracked_processes(self): """ :return: List of Str (Pids). Will return an empty string if we couldn't fetch any tracked processes. """ procs = [] try: procs = self._get_parameters("cgroup.procs") except (IOError, OSError) as e: # pylint: disable=C0103 if e.errno == errno.ENOENT: # only suppressing file not found exceptions. pass else: logger.periodic_warn( logger.EVERY_HALF_HOUR, 'Could not get list of procs from "cgroup.procs" file in the cgroup: {0}.' ' Internal error: {1}'.format(self.path, ustr(e))) except CGroupsException as e: # pylint: disable=C0103 logger.periodic_warn( logger.EVERY_HALF_HOUR, 'Could not get list of tasks from "cgroup.procs" file in the cgroup: {0}.' ' Internal error: {1}'.format(self.path, ustr(e))) return procs
def test_periodic_emits_after_elapsed_delta(self, mock_info, mock_error, mock_warn, mock_verbose): logger.periodic_info(logger.EVERY_DAY, _MSG_INFO, *_DATA) self.assertEqual(1, mock_info.call_count) logger.periodic_info(logger.EVERY_DAY, _MSG_INFO, *_DATA) self.assertEqual(1, mock_info.call_count) logger.DEFAULT_LOGGER.periodic_messages[hash(_MSG_INFO)] = datetime.now() - \ logger.EVERY_DAY - logger.EVERY_HOUR logger.periodic_info(logger.EVERY_DAY, _MSG_INFO, *_DATA) self.assertEqual(2, mock_info.call_count) logger.periodic_warn(logger.EVERY_DAY, _MSG_WARN, *_DATA) self.assertEqual(1, mock_warn.call_count) logger.periodic_warn(logger.EVERY_DAY, _MSG_WARN, *_DATA) self.assertEqual(1, mock_warn.call_count) logger.DEFAULT_LOGGER.periodic_messages[hash(_MSG_WARN)] = datetime.now() - \ logger.EVERY_DAY - logger.EVERY_HOUR logger.periodic_warn(logger.EVERY_DAY, _MSG_WARN, *_DATA) self.assertEqual(2, mock_info.call_count) logger.periodic_error(logger.EVERY_DAY, _MSG_ERROR, *_DATA) self.assertEqual(1, mock_error.call_count) logger.periodic_error(logger.EVERY_DAY, _MSG_ERROR, *_DATA) self.assertEqual(1, mock_error.call_count) logger.DEFAULT_LOGGER.periodic_messages[hash(_MSG_ERROR)] = datetime.now() - \ logger.EVERY_DAY - logger.EVERY_HOUR logger.periodic_error(logger.EVERY_DAY, _MSG_ERROR, *_DATA) self.assertEqual(2, mock_info.call_count) logger.periodic_verbose(logger.EVERY_DAY, _MSG_VERBOSE, *_DATA) self.assertEqual(1, mock_verbose.call_count) logger.periodic_verbose(logger.EVERY_DAY, _MSG_VERBOSE, *_DATA) self.assertEqual(1, mock_verbose.call_count) logger.DEFAULT_LOGGER.periodic_messages[hash(_MSG_VERBOSE)] = datetime.now() - \ logger.EVERY_DAY - logger.EVERY_HOUR logger.periodic_verbose(logger.EVERY_DAY, _MSG_VERBOSE, *_DATA) self.assertEqual(2, mock_info.call_count)
def poll_all_tracked(): metrics = [] with CGroupsTelemetry._rlock: for cgroup in CGroupsTelemetry._tracked[:]: if cgroup.name not in CGroupsTelemetry._cgroup_metrics: CGroupsTelemetry._cgroup_metrics[ cgroup.name] = CgroupMetrics() try: if cgroup.controller == CGroupContollers.CPU: current_cpu_usage = cgroup.get_cpu_usage() CGroupsTelemetry._cgroup_metrics[ cgroup.name].add_cpu_usage(current_cpu_usage) metrics.append( MetricValue(MetricsCategory.PROCESS_CATEGORY, MetricsCounter.PROCESSOR_PERCENT_TIME, cgroup.name, current_cpu_usage)) elif cgroup.controller == CGroupContollers.MEMORY: current_memory_usage = cgroup.get_memory_usage() CGroupsTelemetry._cgroup_metrics[ cgroup.name].add_memory_usage(current_memory_usage) metrics.append( MetricValue(MetricsCategory.MEMORY_CATEGORY, MetricsCounter.TOTAL_MEM_USAGE, cgroup.name, current_memory_usage)) max_memory_usage = cgroup.get_max_memory_usage() CGroupsTelemetry._cgroup_metrics[ cgroup.name].add_max_memory_usage(max_memory_usage) metrics.append( MetricValue(MetricsCategory.MEMORY_CATEGORY, MetricsCounter.MAX_MEM_USAGE, cgroup.name, max_memory_usage)) pids = cgroup.get_tracked_processes() for pid in pids: try: mem_usage_from_procstatm = MemoryResourceUsage.get_memory_usage_from_proc_statm( pid) metrics.append( MetricValue( MetricsCategory.MEMORY_CATEGORY, MetricsCounter.MEM_USED_BY_PROCESS, CGroupsTelemetry. get_process_info_summary(pid), mem_usage_from_procstatm)) CGroupsTelemetry._cgroup_metrics[ cgroup.name].add_proc_statm_memory( CGroupsTelemetry. get_process_info_summary(pid), mem_usage_from_procstatm) except Exception as e: if not isinstance( e, (IOError, OSError)) or e.errno != errno.ENOENT: logger.periodic_warn( logger.EVERY_HOUR, "[PERIODIC] Could not collect proc_statm " "for pid {0}. Error : {1}", pid, ustr(e)) else: raise CGroupsException( 'CGroup controller {0} is not supported for cgroup {1}' .format(cgroup.controller, cgroup.name)) except Exception as e: # There can be scenarios when the CGroup has been deleted by the time we are fetching the values # from it. This would raise IOError with file entry not found (ERRNO: 2). We do not want to log # every occurrences of such case as it would be very verbose. We do want to log all the other # exceptions which could occur, which is why we do a periodic log for all the other errors. if not isinstance( e, (IOError, OSError)) or e.errno != errno.ENOENT: logger.periodic_warn( logger.EVERY_HOUR, '[PERIODIC] Could not collect metrics for cgroup ' '{0}. Error : {1}'.format(cgroup.name, ustr(e))) if not cgroup.is_active(): CGroupsTelemetry.stop_tracking(cgroup) CGroupsTelemetry._cgroup_metrics[ cgroup.name].marked_for_delete = True return metrics