Example #1
0
class SendHostPluginHeartbeat(PeriodicOperation):
    """
    Periodic operation for reporting the HostGAPlugin's health. The signal is 'Healthy' when we have been able to communicate with
    plugin at least once in the last _HOST_PLUGIN_HEALTH_PERIOD.
    """
    def __init__(self, protocol, health_service):
        super(SendHostPluginHeartbeat, self).__init__(SendHostPluginHeartbeat._HOST_PLUGIN_HEARTBEAT_PERIOD)
        self.protocol = protocol
        self.health_service = health_service
        self.host_plugin_error_state = ErrorState(min_timedelta=SendHostPluginHeartbeat._HOST_PLUGIN_HEALTH_PERIOD)

    _HOST_PLUGIN_HEARTBEAT_PERIOD = datetime.timedelta(minutes=1)
    _HOST_PLUGIN_HEALTH_PERIOD = datetime.timedelta(minutes=5)

    def _operation(self):
        try:
            host_plugin = self.protocol.client.get_host_plugin()
            host_plugin.ensure_initialized()
            self.protocol.update_host_plugin_from_goal_state()

            is_currently_healthy = host_plugin.get_health()

            if is_currently_healthy:
                self.host_plugin_error_state.reset()
            else:
                self.host_plugin_error_state.incr()

            is_healthy = self.host_plugin_error_state.is_triggered() is False
            logger.verbose("HostGAPlugin health: {0}", is_healthy)

            self.health_service.report_host_plugin_heartbeat(is_healthy)

            if not is_healthy:
                add_event(
                    name=AGENT_NAME,
                    version=CURRENT_VERSION,
                    op=WALAEventOperation.HostPluginHeartbeatExtended,
                    is_success=False,
                    message='{0} since successful heartbeat'.format(self.host_plugin_error_state.fail_time),
                    log_event=False)

        except Exception as e:
            msg = "Exception sending host plugin heartbeat: {0}".format(ustr(e))
            add_event(
                name=AGENT_NAME,
                version=CURRENT_VERSION,
                op=WALAEventOperation.HostPluginHeartbeat,
                is_success=False,
                message=msg,
                log_event=False)
            raise
Example #2
0
    def test_errorstate04(self):
        """
        If ErrorState is reset the timestamp of the last incr() is reset to
        None.
        """

        test_subject = ErrorState(timedelta(minutes=15))
        self.assertTrue(test_subject.timestamp is None)

        test_subject.incr()
        self.assertTrue(test_subject.timestamp is not None)

        test_subject.reset()
        self.assertTrue(test_subject.timestamp is None)
Example #3
0
class SendImdsHeartbeat(PeriodicOperation):
    """
    Periodic operation to report the IDMS's health. The signal is 'Healthy' when we have successfully called and validated
    a response in the last _IMDS_HEALTH_PERIOD.
    """
    def __init__(self, protocol_util, health_service):
        super(SendImdsHeartbeat, self).__init__(SendImdsHeartbeat._IMDS_HEARTBEAT_PERIOD)
        self.health_service = health_service
        self.imds_client = get_imds_client(protocol_util.get_wireserver_endpoint())
        self.imds_error_state = ErrorState(min_timedelta=SendImdsHeartbeat._IMDS_HEALTH_PERIOD)

    _IMDS_HEARTBEAT_PERIOD = datetime.timedelta(minutes=1)
    _IMDS_HEALTH_PERIOD = datetime.timedelta(minutes=3)

    def _operation(self):
        try:
            is_currently_healthy, response = self.imds_client.validate()

            if is_currently_healthy:
                self.imds_error_state.reset()
            else:
                self.imds_error_state.incr()

            is_healthy = self.imds_error_state.is_triggered() is False
            logger.verbose("IMDS health: {0} [{1}]", is_healthy, response)

            self.health_service.report_imds_status(is_healthy, response)

        except Exception as e:
            msg = "Exception sending imds heartbeat: {0}".format(ustr(e))
            add_event(
                name=AGENT_NAME,
                version=CURRENT_VERSION,
                op=WALAEventOperation.ImdsHeartbeat,
                is_success=False,
                message=msg,
                log_event=False)
            raise
Example #4
0
class ExtHandlersHandler(object):
    def __init__(self):
        self.protocol_util = get_protocol_util()
        self.protocol = None
        self.ext_handlers = None
        self.last_etag = None
        self.last_upgrade_guids = {}
        self.log_report = False
        self.log_etag = True
        self.log_process = False

        self.report_status_error_state = ErrorState()

    def run(self):
        self.ext_handlers, etag = None, None
        try:
            self.protocol = self.protocol_util.get_protocol()
            self.ext_handlers, etag = self.protocol.get_ext_handlers()
        except Exception as e:
            msg = u"Exception retrieving extension handlers: {0}".format(
                ustr(e))
            logger.warn(msg)
            add_event(AGENT_NAME,
                      version=CURRENT_VERSION,
                      op=WALAEventOperation.ExtensionProcessing,
                      is_success=False,
                      message=msg)
            return

        try:
            msg = u"Handle extensions updates for incarnation {0}".format(etag)
            logger.verbose(msg)
            # Log status report success on new config
            self.log_report = True
            self.handle_ext_handlers(etag)
            self.last_etag = etag

            self.report_ext_handlers_status()
            self.cleanup_outdated_handlers()
        except RestartError:
            raise
        except Exception as e:
            msg = u"Exception processing extension handlers: {0}".format(
                ustr(e))
            logger.warn(msg)
            add_event(AGENT_NAME,
                      version=CURRENT_VERSION,
                      op=WALAEventOperation.ExtensionProcessing,
                      is_success=False,
                      message=msg)
            return

    def run_status(self):
        self.report_ext_handlers_status()
        return

    def get_upgrade_guid(self, name):
        return self.last_upgrade_guids.get(name, (None, False))[0]

    def get_log_upgrade_guid(self, ext_handler):
        return self.last_upgrade_guids.get(ext_handler.name, (None, False))[1]

    def set_log_upgrade_guid(self, ext_handler, log_val):
        guid = self.get_upgrade_guid(ext_handler.name)
        if guid is not None:
            self.last_upgrade_guids[ext_handler.name] = (guid, log_val)

    def is_new_guid(self, ext_handler):
        last_guid = self.get_upgrade_guid(ext_handler.name)
        if last_guid is None:
            return True
        return last_guid != ext_handler.properties.upgradeGuid

    def cleanup_outdated_handlers(self):
        handlers = []
        pkgs = []

        # Build a collection of uninstalled handlers and orphaned packages
        # Note:
        # -- An orphaned package is one without a corresponding handler
        #    directory
        for item in os.listdir(conf.get_lib_dir()):
            path = os.path.join(conf.get_lib_dir(), item)

            if version.is_agent_package(path) or version.is_agent_path(path):
                continue

            if os.path.isdir(path):
                if re.match(HANDLER_NAME_PATTERN, item) is None:
                    continue
                try:
                    eh = ExtHandler()

                    separator = item.rfind('-')

                    eh.name = item[0:separator]
                    eh.properties.version = str(FlexibleVersion(item[separator+1:]))

                    handler = ExtHandlerInstance(eh, self.protocol)
                except Exception as e:
                    continue
                if handler.get_handler_state() != ExtHandlerState.NotInstalled:
                    continue
                handlers.append(handler)

            elif os.path.isfile(path) and \
                    not os.path.isdir(path[0:-len(HANDLER_PKG_EXT)]):
                if not re.match(HANDLER_PKG_PATTERN, item):
                    continue
                pkgs.append(path)

        # Then, remove the orphaned packages
        for pkg in pkgs:
            try:
                os.remove(pkg)
                logger.verbose("Removed orphaned extension package "
                            "{0}".format(pkg))
            except Exception as e:
                logger.warn("Failed to remove orphaned package: {0}".format(
                                pkg))

        # Finally, remove the directories and packages of the
        # uninstalled handlers
        for handler in handlers:
            handler.rm_ext_handler_dir()
            pkg = os.path.join(conf.get_lib_dir(),
                    handler.get_full_name() + HANDLER_PKG_EXT)
            if os.path.isfile(pkg):
                try:
                    os.remove(pkg)
                    logger.verbose("Removed extension package "
                                "{0}".format(pkg))
                except Exception as e:
                    logger.warn("Failed to remove extension package: "
                                "{0}".format(pkg))
   
    def handle_ext_handlers(self, etag=None):
        if self.ext_handlers.extHandlers is None or \
                len(self.ext_handlers.extHandlers) == 0:
            logger.verbose("No extension handler config found")
            return

        if conf.get_enable_overprovisioning():
            artifacts_profile = self.protocol.get_artifacts_profile()
            if artifacts_profile and artifacts_profile.is_on_hold():
                logger.info("Extension handling is on hold")
                return

        self.ext_handlers.extHandlers.sort(key=operator.methodcaller('sort_key'))
        for ext_handler in self.ext_handlers.extHandlers:
            # TODO: handle install in sequence, enable in parallel
            self.handle_ext_handler(ext_handler, etag)
    
    def handle_ext_handler(self, ext_handler, etag):
        ext_handler_i = ExtHandlerInstance(ext_handler, self.protocol)

        try:
            state = ext_handler.properties.state
            # The extension is to be enabled, there is an upgrade GUID
            # and the GUID is NOT new
            if state == u"enabled" and \
                    ext_handler.properties.upgradeGuid is not None and \
                    not self.is_new_guid(ext_handler):
                ext_handler_i.ext_handler.properties.version = ext_handler_i.get_installed_version()
                ext_handler_i.set_logger()
                if self.last_etag != etag:
                    self.set_log_upgrade_guid(ext_handler, True)

                msg = "New GUID is the same as the old GUID. Exiting without upgrading."
                if self.get_log_upgrade_guid(ext_handler):
                    ext_handler_i.logger.info(msg)
                    self.set_log_upgrade_guid(ext_handler, False)
                ext_handler_i.set_handler_state(ExtHandlerState.Enabled)
                ext_handler_i.set_handler_status(status="Ready", message="No change")
                ext_handler_i.set_operation(WALAEventOperation.SkipUpdate)
                ext_handler_i.report_event(message=ustr(msg), is_success=True)
                return

            self.set_log_upgrade_guid(ext_handler, True)
            ext_handler_i.decide_version(etag=etag, target_state=state)
            if not ext_handler_i.is_upgrade and self.last_etag == etag:
                if self.log_etag:
                    ext_handler_i.logger.verbose("Version {0} is current for etag {1}",
                                                 ext_handler_i.pkg.version,
                                                 etag)
                    self.log_etag = False
                return

            self.log_etag = True

            ext_handler_i.logger.info("Target handler state: {0}", state)
            if state == u"enabled":
                self.handle_enable(ext_handler_i)
                if ext_handler.properties.upgradeGuid is not None:
                    ext_handler_i.logger.info("New Upgrade GUID: {0}", ext_handler.properties.upgradeGuid)
                    self.last_upgrade_guids[ext_handler.name] = (ext_handler.properties.upgradeGuid, True)
            elif state == u"disabled":
                self.handle_disable(ext_handler_i)
                # Remove the GUID from the dictionary so that it is upgraded upon re-enable
                self.last_upgrade_guids.pop(ext_handler.name, None)
            elif state == u"uninstall":
                self.handle_uninstall(ext_handler_i)
                # Remove the GUID from the dictionary so that it is upgraded upon re-install
                self.last_upgrade_guids.pop(ext_handler.name, None)
            else:
                message = u"Unknown ext handler state:{0}".format(state)
                raise ExtensionError(message)
        except RestartError:
            ext_handler_i.logger.info("GoalState became stale during "
                                      "processing. Restarting with new "
                                      "GoalState")
            raise
        except Exception as e:
            ext_handler_i.set_handler_status(message=ustr(e), code=-1)
            ext_handler_i.report_event(message=ustr(e), is_success=False)
    
    def handle_enable(self, ext_handler_i):
        self.log_process = True
        old_ext_handler_i = ext_handler_i.get_installed_ext_handler()
        if old_ext_handler_i is not None and \
           old_ext_handler_i.version_gt(ext_handler_i):
            msg = "Downgrade is not allowed. Skipping install and enable."
            ext_handler_i.logger.error(msg)
            ext_handler_i.set_operation(WALAEventOperation.Downgrade)
            ext_handler_i.report_event(message=ustr(msg), is_success=True)
            return

        handler_state = ext_handler_i.get_handler_state()
        ext_handler_i.logger.info("[Enable] current handler state is: {0}",
                                  handler_state.lower())
        if handler_state == ExtHandlerState.NotInstalled:
            ext_handler_i.set_handler_state(ExtHandlerState.NotInstalled)
            ext_handler_i.download()
            ext_handler_i.update_settings()
            if old_ext_handler_i is None:
                ext_handler_i.install()
            elif ext_handler_i.version_gt(old_ext_handler_i):
                old_ext_handler_i.disable()
                ext_handler_i.copy_status_files(old_ext_handler_i)
                ext_handler_i.update()
                old_ext_handler_i.uninstall()
                old_ext_handler_i.rm_ext_handler_dir()
                ext_handler_i.update_with_install()
        else:
            ext_handler_i.update_settings()

        ext_handler_i.enable() 

    def handle_disable(self, ext_handler_i):
        self.log_process = True
        handler_state = ext_handler_i.get_handler_state()
        ext_handler_i.logger.info("[Disable] current handler state is: {0}",
                                  handler_state.lower())
        if handler_state == ExtHandlerState.Enabled:
            ext_handler_i.disable()

    def handle_uninstall(self, ext_handler_i):
        self.log_process = True
        handler_state = ext_handler_i.get_handler_state()
        ext_handler_i.logger.info("[Uninstall] current handler state is: {0}",
                                  handler_state.lower())
        if handler_state != ExtHandlerState.NotInstalled:
            if handler_state == ExtHandlerState.Enabled:
                ext_handler_i.disable()
            ext_handler_i.uninstall()
        ext_handler_i.rm_ext_handler_dir()

    def report_ext_handlers_status(self):
        """Go through handler_state dir, collect and report status"""
        vm_status = VMStatus(status="Ready", message="Guest Agent is running")
        if self.ext_handlers is not None:
            for ext_handler in self.ext_handlers.extHandlers:
                try:
                    self.report_ext_handler_status(vm_status, ext_handler)
                except ExtensionError as e:
                    add_event(
                        AGENT_NAME,
                        version=CURRENT_VERSION,
                        op=WALAEventOperation.ExtensionProcessing,
                        is_success=False,
                        message=ustr(e))

        logger.verbose("Report vm agent status")
        try:
            self.protocol.report_vm_status(vm_status)
            if self.log_report:
                logger.verbose("Completed vm agent status report")
            self.report_status_error_state.reset()
        except ProtocolError as e:
            self.report_status_error_state.incr()
            message = "Failed to report vm agent status: {0}".format(e)
            add_event(AGENT_NAME,
                version=CURRENT_VERSION,
                op=WALAEventOperation.ExtensionProcessing,
                is_success=False,
                message=message)

        if self.report_status_error_state.is_triggered():
            message = "Failed to report vm agent status for more than {0}"\
                .format(ERROR_STATE_DELTA)

            add_event(AGENT_NAME,
                version=CURRENT_VERSION,
                op=WALAEventOperation.ExtensionProcessing,
                is_success=False,
                message=message)

            self.report_status_error_state.reset()

    def report_ext_handler_status(self, vm_status, ext_handler):
        ext_handler_i = ExtHandlerInstance(ext_handler, self.protocol)
        
        handler_status = ext_handler_i.get_handler_status() 
        if handler_status is None:
            return
        guid = self.get_upgrade_guid(ext_handler.name)
        if guid is not None:
            handler_status.upgradeGuid = guid

        handler_state = ext_handler_i.get_handler_state()
        if handler_state != ExtHandlerState.NotInstalled:
            try:
                active_exts = ext_handler_i.report_ext_status()
                handler_status.extensions.extend(active_exts)
            except ExtensionError as e:
                ext_handler_i.set_handler_status(message=ustr(e), code=-1)

            try:
                heartbeat = ext_handler_i.collect_heartbeat()
                if heartbeat is not None:
                    handler_status.status = heartbeat.get('status')
            except ExtensionError as e:
                ext_handler_i.set_handler_status(message=ustr(e), code=-1)

        vm_status.vmAgent.extensionHandlers.append(handler_status)
Example #5
0
class MonitorHandler(object):

    EVENT_COLLECTION_PERIOD = datetime.timedelta(minutes=1)
    TELEMETRY_HEARTBEAT_PERIOD = datetime.timedelta(minutes=30)
    HOST_PLUGIN_HEARTBEAT_PERIOD = datetime.timedelta(minutes=1)
    HOST_PLUGIN_HEALTH_PERIOD = datetime.timedelta(minutes=5)

    def __init__(self):
        self.osutil = get_osutil()
        self.protocol_util = get_protocol_util()
        self.imds_client = get_imds_client()

        self.event_thread = None
        self.last_event_collection = None
        self.last_telemetry_heartbeat = None
        self.last_host_plugin_heartbeat = None
        self.protocol = None
        self.health_service = None

        self.counter = 0
        self.sysinfo = []
        self.should_run = True
        self.heartbeat_id = str(uuid.uuid4()).upper()
        self.host_plugin_errorstate = ErrorState(min_timedelta=MonitorHandler.HOST_PLUGIN_HEALTH_PERIOD)

    def run(self):
        self.init_protocols()
        self.init_sysinfo()
        self.start()

    def stop(self):
        self.should_run = False
        if self.is_alive():
            self.event_thread.join()

    def init_protocols(self):
        self.protocol = self.protocol_util.get_protocol()
        self.health_service = HealthService(self.protocol.endpoint)

    def is_alive(self):
        return self.event_thread is not None and self.event_thread.is_alive()

    def start(self):
        self.event_thread = threading.Thread(target=self.daemon)
        self.event_thread.setDaemon(True)
        self.event_thread.start()

    def init_sysinfo(self):
        osversion = "{0}:{1}-{2}-{3}:{4}".format(platform.system(),
                                                 DISTRO_NAME,
                                                 DISTRO_VERSION,
                                                 DISTRO_CODE_NAME,
                                                 platform.release())
        self.sysinfo.append(TelemetryEventParam("OSVersion", osversion))
        self.sysinfo.append(
            TelemetryEventParam("GAVersion", CURRENT_AGENT))

        try:
            ram = self.osutil.get_total_mem()
            processors = self.osutil.get_processor_cores()
            self.sysinfo.append(TelemetryEventParam("RAM", ram))
            self.sysinfo.append(TelemetryEventParam("Processors", processors))
        except OSUtilError as e:
            logger.warn("Failed to get system info: {0}", e)

        try:
            vminfo = self.protocol.get_vminfo()
            self.sysinfo.append(TelemetryEventParam("VMName",
                                                    vminfo.vmName))
            self.sysinfo.append(TelemetryEventParam("TenantName",
                                                    vminfo.tenantName))
            self.sysinfo.append(TelemetryEventParam("RoleName",
                                                    vminfo.roleName))
            self.sysinfo.append(TelemetryEventParam("RoleInstanceName",
                                                    vminfo.roleInstanceName))
            self.sysinfo.append(TelemetryEventParam("ContainerId",
                                                    vminfo.containerId))
        except ProtocolError as e:
            logger.warn("Failed to get system info: {0}", e)

        try:
            vminfo = self.imds_client.get_compute()
            self.sysinfo.append(TelemetryEventParam('Location',
                                                    vminfo.location))
            self.sysinfo.append(TelemetryEventParam('SubscriptionId',
                                                    vminfo.subscriptionId))
            self.sysinfo.append(TelemetryEventParam('ResourceGroupName',
                                                    vminfo.resourceGroupName))
            self.sysinfo.append(TelemetryEventParam('VMId',
                                                    vminfo.vmId))
            self.sysinfo.append(TelemetryEventParam('ImageOrigin',
                                                    vminfo.image_origin))
        except (HttpError, ValueError) as e:
            logger.warn("failed to get IMDS info: {0}", e)

    def collect_event(self, evt_file_name):
        try:
            logger.verbose("Found event file: {0}", evt_file_name)
            with open(evt_file_name, "rb") as evt_file:
                # if fail to open or delete the file, throw exception
                data_str = evt_file.read().decode("utf-8", 'ignore')
            logger.verbose("Processed event file: {0}", evt_file_name)
            os.remove(evt_file_name)
            return data_str
        except IOError as e:
            msg = "Failed to process {0}, {1}".format(evt_file_name, e)
            raise EventError(msg)

    def collect_and_send_events(self):
        if self.last_event_collection is None:
            self.last_event_collection = datetime.datetime.utcnow() - MonitorHandler.EVENT_COLLECTION_PERIOD

        if datetime.datetime.utcnow() >= (self.last_event_collection + MonitorHandler.EVENT_COLLECTION_PERIOD):
            try:
                event_list = TelemetryEventList()
                event_dir = os.path.join(conf.get_lib_dir(), "events")
                event_files = os.listdir(event_dir)
                for event_file in event_files:
                    if not event_file.endswith(".tld"):
                        continue
                    event_file_path = os.path.join(event_dir, event_file)
                    try:
                        data_str = self.collect_event(event_file_path)
                    except EventError as e:
                        logger.error("{0}", e)
                        continue

                    try:
                        event = parse_event(data_str)
                        self.add_sysinfo(event)
                        event_list.events.append(event)
                    except (ValueError, ProtocolError) as e:
                        logger.warn("Failed to decode event file: {0}", e)
                        continue

                if len(event_list.events) == 0:
                    return

                try:
                    self.protocol.report_event(event_list)
                except ProtocolError as e:
                    logger.error("{0}", e)
            except Exception as e:
                logger.warn("Failed to send events: {0}", e)

            self.last_event_collection = datetime.datetime.utcnow()

    def daemon(self):
        min_delta = min(MonitorHandler.TELEMETRY_HEARTBEAT_PERIOD,
                        MonitorHandler.EVENT_COLLECTION_PERIOD,
                        MonitorHandler.HOST_PLUGIN_HEARTBEAT_PERIOD).seconds
        while self.should_run:
            self.send_telemetry_heartbeat()
            self.collect_and_send_events()
            self.send_host_plugin_heartbeat()
            time.sleep(min_delta)

    def add_sysinfo(self, event):
        sysinfo_names = [v.name for v in self.sysinfo]
        for param in event.parameters:
            if param.name in sysinfo_names:
                logger.verbose("Remove existing event parameter: [{0}:{1}]",
                               param.name,
                               param.value)
                event.parameters.remove(param)
        event.parameters.extend(self.sysinfo)

    def send_host_plugin_heartbeat(self):
        """
        Send a health signal every HOST_PLUGIN_HEARTBEAT_PERIOD. The signal is 'Healthy' when we have been able to
        communicate with HostGAPlugin at least once in the last HOST_PLUGIN_HEALTH_PERIOD.
        """
        if self.last_host_plugin_heartbeat is None:
            self.last_host_plugin_heartbeat = datetime.datetime.utcnow() - MonitorHandler.HOST_PLUGIN_HEARTBEAT_PERIOD

        if datetime.datetime.utcnow() >= (self.last_host_plugin_heartbeat + MonitorHandler.HOST_PLUGIN_HEARTBEAT_PERIOD):
            try:
                host_plugin = self.protocol.client.get_host_plugin()
                host_plugin.ensure_initialized()
                is_currently_healthy = host_plugin.get_health()

                if is_currently_healthy:
                    self.host_plugin_errorstate.reset()
                else:
                    self.host_plugin_errorstate.incr()

                is_healthy = self.host_plugin_errorstate.is_triggered() is False
                logger.verbose("HostGAPlugin health: {0}", is_healthy)

                self.health_service.report_host_plugin_heartbeat(is_healthy)

            except Exception as e:
                msg = "Exception sending host plugin heartbeat: {0}".format(ustr(e))
                add_event(
                    name=AGENT_NAME,
                    version=CURRENT_VERSION,
                    op=WALAEventOperation.HostPluginHeartbeat,
                    is_success=False,
                    message=msg,
                    log_event=False)

            self.last_host_plugin_heartbeat = datetime.datetime.utcnow()

    def send_telemetry_heartbeat(self):

        if self.last_telemetry_heartbeat is None:
            self.last_telemetry_heartbeat = datetime.datetime.utcnow() - MonitorHandler.TELEMETRY_HEARTBEAT_PERIOD

        if datetime.datetime.utcnow() >= (self.last_telemetry_heartbeat + MonitorHandler.TELEMETRY_HEARTBEAT_PERIOD):
            try:
                incarnation = self.protocol.get_incarnation()
                dropped_packets = self.osutil.get_firewall_dropped_packets(self.protocol.endpoint)
                msg = "{0};{1};{2};{3}".format(incarnation, self.counter, self.heartbeat_id, dropped_packets)

                add_event(
                    name=AGENT_NAME,
                    version=CURRENT_VERSION,
                    op=WALAEventOperation.HeartBeat,
                    is_success=True,
                    message=msg,
                    log_event=False)

                self.counter += 1

                io_errors = IOErrorCounter.get_and_reset()
                hostplugin_errors = io_errors.get("hostplugin")
                protocol_errors = io_errors.get("protocol")
                other_errors = io_errors.get("other")

                if hostplugin_errors > 0 or protocol_errors > 0 or other_errors > 0:
                    msg = "hostplugin:{0};protocol:{1};other:{2}".format(hostplugin_errors,
                                                                         protocol_errors,
                                                                         other_errors)
                    add_event(
                        name=AGENT_NAME,
                        version=CURRENT_VERSION,
                        op=WALAEventOperation.HttpErrors,
                        is_success=True,
                        message=msg,
                        log_event=False)
            except Exception as e:
                logger.warn("Failed to send heartbeat: {0}", e)

            self.last_telemetry_heartbeat = datetime.datetime.utcnow()
Example #6
0
class MonitorHandler(object):
    # telemetry
    EVENT_COLLECTION_PERIOD = datetime.timedelta(minutes=1)
    # host health
    TELEMETRY_HEARTBEAT_PERIOD = datetime.timedelta(minutes=30)
    # cgroup data period
    CGROUP_TELEMETRY_POLLING_PERIOD = datetime.timedelta(minutes=5)
    CGROUP_TELEMETRY_REPORTING_PERIOD = datetime.timedelta(minutes=30)
    # host plugin
    HOST_PLUGIN_HEARTBEAT_PERIOD = datetime.timedelta(minutes=1)
    HOST_PLUGIN_HEALTH_PERIOD = datetime.timedelta(minutes=5)
    # imds
    IMDS_HEARTBEAT_PERIOD = datetime.timedelta(minutes=1)
    IMDS_HEALTH_PERIOD = datetime.timedelta(minutes=3)
    # log network configuration
    LOG_NETWORK_CONFIGURATION_PERIOD = datetime.timedelta(minutes=1)
    # Resetting loggers period
    RESET_LOGGERS_PERIOD = datetime.timedelta(hours=12)

    def __init__(self):
        self.osutil = get_osutil()
        self.imds_client = None

        self.event_thread = None
        self._reset_loggers_op = PeriodicOperation("reset_loggers",
                                                   self.reset_loggers,
                                                   self.RESET_LOGGERS_PERIOD)
        self._collect_and_send_events_op = PeriodicOperation(
            "collect_and_send_events", self.collect_and_send_events,
            self.EVENT_COLLECTION_PERIOD)
        self._send_telemetry_heartbeat_op = PeriodicOperation(
            "send_telemetry_heartbeat", self.send_telemetry_heartbeat,
            self.TELEMETRY_HEARTBEAT_PERIOD)
        self._poll_telemetry_metrics_op = PeriodicOperation(
            "poll_telemetry_metrics usage", self.poll_telemetry_metrics,
            self.CGROUP_TELEMETRY_POLLING_PERIOD)
        self._send_telemetry_metrics_op = PeriodicOperation(
            "send_telemetry_metrics usage", self.send_telemetry_metrics,
            self.CGROUP_TELEMETRY_REPORTING_PERIOD)
        self._send_host_plugin_heartbeat_op = PeriodicOperation(
            "send_host_plugin_heartbeat", self.send_host_plugin_heartbeat,
            self.HOST_PLUGIN_HEARTBEAT_PERIOD)
        self._send_imds_heartbeat_op = PeriodicOperation(
            "send_imds_heartbeat", self.send_imds_heartbeat,
            self.IMDS_HEARTBEAT_PERIOD)
        self._log_altered_network_configuration_op = PeriodicOperation(
            "log_altered_network_configuration",
            self.log_altered_network_configuration,
            self.LOG_NETWORK_CONFIGURATION_PERIOD)
        self.protocol = None
        self.protocol_util = None
        self.health_service = None
        self.last_route_table_hash = b''
        self.last_nic_state = {}

        self.should_run = True
        self.heartbeat_id = str(uuid.uuid4()).upper()
        self.host_plugin_errorstate = ErrorState(
            min_timedelta=MonitorHandler.HOST_PLUGIN_HEALTH_PERIOD)
        self.imds_errorstate = ErrorState(
            min_timedelta=MonitorHandler.IMDS_HEALTH_PERIOD)

    def run(self):
        self.start(init_data=True)

    def stop(self):
        self.should_run = False
        if self.is_alive():
            self.join()

    def join(self):
        self.event_thread.join()

    def stopped(self):
        return not self.should_run

    def init_protocols(self):
        # The initialization of ProtocolUtil for the Monitor thread should be done within the thread itself rather
        # than initializing it in the ExtHandler thread. This is done to avoid any concurrency issues as each
        # thread would now have its own ProtocolUtil object as per the SingletonPerThread model.
        self.protocol_util = get_protocol_util()
        self.protocol = self.protocol_util.get_protocol()
        self.health_service = HealthService(self.protocol.get_endpoint())

    def init_imds_client(self):
        wireserver_endpoint = self.protocol_util.get_wireserver_endpoint()
        self.imds_client = get_imds_client(wireserver_endpoint)

    def is_alive(self):
        return self.event_thread is not None and self.event_thread.is_alive()

    def start(self, init_data=False):
        self.event_thread = threading.Thread(target=self.daemon,
                                             args=(init_data, ))
        self.event_thread.setDaemon(True)
        self.event_thread.setName("MonitorHandler")
        self.event_thread.start()

    def collect_and_send_events(self):
        """
        Periodically send any events located in the events folder
        """
        event_list = collect_events()

        if len(event_list.events) > 0:
            self.protocol.report_event(event_list)

    def daemon(self, init_data=False):

        if init_data:
            self.init_protocols()
            self.init_imds_client()

        min_delta = min(MonitorHandler.TELEMETRY_HEARTBEAT_PERIOD,
                        MonitorHandler.CGROUP_TELEMETRY_POLLING_PERIOD,
                        MonitorHandler.CGROUP_TELEMETRY_REPORTING_PERIOD,
                        MonitorHandler.EVENT_COLLECTION_PERIOD,
                        MonitorHandler.HOST_PLUGIN_HEARTBEAT_PERIOD,
                        MonitorHandler.IMDS_HEARTBEAT_PERIOD).seconds
        while not self.stopped():
            try:
                self.protocol.update_host_plugin_from_goal_state()
                self._send_telemetry_heartbeat_op.run()
                self._poll_telemetry_metrics_op.run()
                # This will be removed in favor of poll_telemetry_metrics() and it'll directly send the perf data for
                # each cgroup.
                self._send_telemetry_metrics_op.run()
                self._collect_and_send_events_op.run()
                self._send_host_plugin_heartbeat_op.run()
                self._send_imds_heartbeat_op.run()
                self._log_altered_network_configuration_op.run()
                self._reset_loggers_op.run()
            except Exception as e:
                logger.warn(
                    "An error occurred in the monitor thread main loop; will skip the current iteration.\n{0}",
                    ustr(e))
            time.sleep(min_delta)

    def reset_loggers(self):
        """
        The loggers maintain hash-tables in memory and they need to be cleaned up from time to time.
        For reference, please check azurelinuxagent.common.logger.Logger and
        azurelinuxagent.common.event.EventLogger classes
        """
        logger.reset_periodic()

    def send_imds_heartbeat(self):
        """
        Send a health signal every IMDS_HEARTBEAT_PERIOD. The signal is 'Healthy' when we have
        successfully called and validated a response in the last IMDS_HEALTH_PERIOD.
        """
        try:
            is_currently_healthy, response = self.imds_client.validate()

            if is_currently_healthy:
                self.imds_errorstate.reset()
            else:
                self.imds_errorstate.incr()

            is_healthy = self.imds_errorstate.is_triggered() is False
            logger.verbose("IMDS health: {0} [{1}]", is_healthy, response)

            self.health_service.report_imds_status(is_healthy, response)

        except Exception as e:
            msg = "Exception sending imds heartbeat: {0}".format(ustr(e))
            add_event(name=AGENT_NAME,
                      version=CURRENT_VERSION,
                      op=WALAEventOperation.ImdsHeartbeat,
                      is_success=False,
                      message=msg,
                      log_event=False)

    def send_host_plugin_heartbeat(self):
        """
        Send a health signal every HOST_PLUGIN_HEARTBEAT_PERIOD. The signal is 'Healthy' when we have been able to
        communicate with HostGAPlugin at least once in the last HOST_PLUGIN_HEALTH_PERIOD.
        """
        try:
            host_plugin = self.protocol.client.get_host_plugin()
            host_plugin.ensure_initialized()
            is_currently_healthy = host_plugin.get_health()

            if is_currently_healthy:
                self.host_plugin_errorstate.reset()
            else:
                self.host_plugin_errorstate.incr()

            is_healthy = self.host_plugin_errorstate.is_triggered() is False
            logger.verbose("HostGAPlugin health: {0}", is_healthy)

            self.health_service.report_host_plugin_heartbeat(is_healthy)

            if not is_healthy:
                add_event(name=AGENT_NAME,
                          version=CURRENT_VERSION,
                          op=WALAEventOperation.HostPluginHeartbeatExtended,
                          is_success=False,
                          message='{0} since successful heartbeat'.format(
                              self.host_plugin_errorstate.fail_time),
                          log_event=False)

        except Exception as e:
            msg = "Exception sending host plugin heartbeat: {0}".format(
                ustr(e))
            add_event(name=AGENT_NAME,
                      version=CURRENT_VERSION,
                      op=WALAEventOperation.HostPluginHeartbeat,
                      is_success=False,
                      message=msg,
                      log_event=False)

    def send_telemetry_heartbeat(self):
        io_errors = IOErrorCounter.get_and_reset()
        hostplugin_errors = io_errors.get("hostplugin")
        protocol_errors = io_errors.get("protocol")
        other_errors = io_errors.get("other")

        if hostplugin_errors > 0 or protocol_errors > 0 or other_errors > 0:
            msg = "hostplugin:{0};protocol:{1};other:{2}".format(
                hostplugin_errors, protocol_errors, other_errors)
            add_event(name=AGENT_NAME,
                      version=CURRENT_VERSION,
                      op=WALAEventOperation.HttpErrors,
                      is_success=True,
                      message=msg,
                      log_event=False)

    def poll_telemetry_metrics(self):
        """
        This method polls the tracked cgroups to get data from the cgroups filesystem and send the data directly.

        :return: List of Metrics (which would be sent to PerfCounterMetrics directly.
        """
        metrics = CGroupsTelemetry.poll_all_tracked()

        if metrics:
            for metric in metrics:
                report_metric(metric.category, metric.counter, metric.instance,
                              metric.value)

    def send_telemetry_metrics(self):
        """
        The send_telemetry_metrics would soon be removed in favor of sending performance metrics directly.
        """
        performance_metrics = CGroupsTelemetry.report_all_tracked()

        if performance_metrics:
            message = generate_extension_metrics_telemetry_dictionary(
                schema_version=1.0, performance_metrics=performance_metrics)
            add_event(name=AGENT_NAME,
                      version=CURRENT_VERSION,
                      op=WALAEventOperation.ExtensionMetricsData,
                      is_success=True,
                      message=ustr(message),
                      log_event=False)

    def log_altered_network_configuration(self):
        """
        Check various pieces of network configuration and, if altered since the last check, log the new state.
        """
        raw_route_list = self.osutil.read_route_table()
        digest = hash_strings(raw_route_list)
        if digest != self.last_route_table_hash:
            self.last_route_table_hash = digest
            route_list = self.osutil.get_list_of_routes(raw_route_list)
            logger.info("Route table: [{0}]".format(",".join(
                map(networkutil.RouteEntry.to_json, route_list))))

        nic_state = self.osutil.get_nic_state()
        if nic_state != self.last_nic_state:
            description = "Initial" if self.last_nic_state == {} else "Updated"
            logger.info("{0} NIC state: [{1}]".format(
                description, ", ".join(map(str, nic_state.values()))))
            self.last_nic_state = nic_state
Example #7
0
class MonitorHandler(object):

    EVENT_COLLECTION_PERIOD = datetime.timedelta(minutes=1)
    TELEMETRY_HEARTBEAT_PERIOD = datetime.timedelta(minutes=30)
    CGROUP_TELEMETRY_PERIOD = datetime.timedelta(minutes=5)
    # host plugin
    HOST_PLUGIN_HEARTBEAT_PERIOD = datetime.timedelta(minutes=1)
    HOST_PLUGIN_HEALTH_PERIOD = datetime.timedelta(minutes=5)
    # imds
    IMDS_HEARTBEAT_PERIOD = datetime.timedelta(minutes=1)
    IMDS_HEALTH_PERIOD = datetime.timedelta(minutes=3)

    def __init__(self):
        self.osutil = get_osutil()
        self.protocol_util = get_protocol_util()
        self.imds_client = get_imds_client()

        self.event_thread = None
        self.last_event_collection = None
        self.last_telemetry_heartbeat = None
        self.last_cgroup_telemetry = None
        self.last_host_plugin_heartbeat = None
        self.last_imds_heartbeat = None
        self.protocol = None
        self.health_service = None
        self.last_route_table_hash = b''
        self.last_nic_state = {}

        self.counter = 0
        self.sysinfo = []
        self.should_run = True
        self.heartbeat_id = str(uuid.uuid4()).upper()
        self.host_plugin_errorstate = ErrorState(
            min_timedelta=MonitorHandler.HOST_PLUGIN_HEALTH_PERIOD)
        self.imds_errorstate = ErrorState(
            min_timedelta=MonitorHandler.IMDS_HEALTH_PERIOD)

    def run(self):
        self.init_protocols()
        self.init_sysinfo()
        self.init_cgroups()
        self.start()

    def stop(self):
        self.should_run = False
        if self.is_alive():
            self.event_thread.join()

    def init_protocols(self):
        self.protocol = self.protocol_util.get_protocol()
        self.health_service = HealthService(self.protocol.endpoint)

    def is_alive(self):
        return self.event_thread is not None and self.event_thread.is_alive()

    def start(self):
        self.event_thread = threading.Thread(target=self.daemon)
        self.event_thread.setDaemon(True)
        self.event_thread.start()

    def init_sysinfo(self):
        osversion = "{0}:{1}-{2}-{3}:{4}".format(platform.system(),
                                                 DISTRO_NAME, DISTRO_VERSION,
                                                 DISTRO_CODE_NAME,
                                                 platform.release())
        self.sysinfo.append(TelemetryEventParam("OSVersion", osversion))
        self.sysinfo.append(TelemetryEventParam("GAVersion", CURRENT_AGENT))

        try:
            ram = self.osutil.get_total_mem()
            processors = self.osutil.get_processor_cores()
            self.sysinfo.append(TelemetryEventParam("RAM", ram))
            self.sysinfo.append(TelemetryEventParam("Processors", processors))
        except OSUtilError as e:
            logger.warn("Failed to get system info: {0}", e)

        try:
            vminfo = self.protocol.get_vminfo()
            self.sysinfo.append(TelemetryEventParam("VMName", vminfo.vmName))
            self.sysinfo.append(
                TelemetryEventParam("TenantName", vminfo.tenantName))
            self.sysinfo.append(
                TelemetryEventParam("RoleName", vminfo.roleName))
            self.sysinfo.append(
                TelemetryEventParam("RoleInstanceName",
                                    vminfo.roleInstanceName))
            self.sysinfo.append(
                TelemetryEventParam("ContainerId", vminfo.containerId))
        except ProtocolError as e:
            logger.warn("Failed to get system info: {0}", e)

        try:
            vminfo = self.imds_client.get_compute()
            self.sysinfo.append(
                TelemetryEventParam('Location', vminfo.location))
            self.sysinfo.append(
                TelemetryEventParam('SubscriptionId', vminfo.subscriptionId))
            self.sysinfo.append(
                TelemetryEventParam('ResourceGroupName',
                                    vminfo.resourceGroupName))
            self.sysinfo.append(TelemetryEventParam('VMId', vminfo.vmId))
            self.sysinfo.append(
                TelemetryEventParam('ImageOrigin', vminfo.image_origin))
        except (HttpError, ValueError) as e:
            logger.warn("failed to get IMDS info: {0}", e)

    def collect_event(self, evt_file_name):
        try:
            logger.verbose("Found event file: {0}", evt_file_name)
            with open(evt_file_name, "rb") as evt_file:
                # if fail to open or delete the file, throw exception
                data_str = evt_file.read().decode("utf-8", 'ignore')
            logger.verbose("Processed event file: {0}", evt_file_name)
            os.remove(evt_file_name)
            return data_str
        except IOError as e:
            msg = "Failed to process {0}, {1}".format(evt_file_name, e)
            raise EventError(msg)

    def collect_and_send_events(self):
        if self.last_event_collection is None:
            self.last_event_collection = datetime.datetime.utcnow(
            ) - MonitorHandler.EVENT_COLLECTION_PERIOD

        if datetime.datetime.utcnow() >= (
                self.last_event_collection +
                MonitorHandler.EVENT_COLLECTION_PERIOD):
            try:
                event_list = TelemetryEventList()
                event_dir = os.path.join(conf.get_lib_dir(), "events")
                event_files = os.listdir(event_dir)
                for event_file in event_files:
                    if not event_file.endswith(".tld"):
                        continue
                    event_file_path = os.path.join(event_dir, event_file)
                    try:
                        data_str = self.collect_event(event_file_path)
                    except EventError as e:
                        logger.error("{0}", e)
                        continue

                    try:
                        event = parse_event(data_str)
                        self.add_sysinfo(event)
                        event_list.events.append(event)
                    except (ValueError, ProtocolError) as e:
                        logger.warn("Failed to decode event file: {0}", e)
                        continue

                if len(event_list.events) == 0:
                    return

                try:
                    self.protocol.report_event(event_list)
                except ProtocolError as e:
                    logger.error("{0}", e)
            except Exception as e:
                logger.warn("Failed to send events: {0}", e)

            self.last_event_collection = datetime.datetime.utcnow()

    def daemon(self):
        min_delta = min(MonitorHandler.TELEMETRY_HEARTBEAT_PERIOD,
                        MonitorHandler.CGROUP_TELEMETRY_PERIOD,
                        MonitorHandler.EVENT_COLLECTION_PERIOD,
                        MonitorHandler.HOST_PLUGIN_HEARTBEAT_PERIOD,
                        MonitorHandler.IMDS_HEARTBEAT_PERIOD).seconds
        while self.should_run:
            self.send_telemetry_heartbeat()
            self.send_cgroup_telemetry()
            self.collect_and_send_events()
            self.send_host_plugin_heartbeat()
            self.send_imds_heartbeat()
            self.log_altered_network_configuration()
            time.sleep(min_delta)

    def add_sysinfo(self, event):
        sysinfo_names = [v.name for v in self.sysinfo]
        for param in event.parameters:
            if param.name in sysinfo_names:
                logger.verbose("Remove existing event parameter: [{0}:{1}]",
                               param.name, param.value)
                event.parameters.remove(param)
        event.parameters.extend(self.sysinfo)

    def send_imds_heartbeat(self):
        """
        Send a health signal every IMDS_HEARTBEAT_PERIOD. The signal is 'Healthy' when we have
        successfully called and validated a response in the last IMDS_HEALTH_PERIOD.
        """

        if self.last_imds_heartbeat is None:
            self.last_imds_heartbeat = datetime.datetime.utcnow(
            ) - MonitorHandler.IMDS_HEARTBEAT_PERIOD

        if datetime.datetime.utcnow() >= (
                self.last_imds_heartbeat +
                MonitorHandler.IMDS_HEARTBEAT_PERIOD):
            try:
                is_currently_healthy, response = self.imds_client.validate()

                if is_currently_healthy:
                    self.imds_errorstate.reset()
                else:
                    self.imds_errorstate.incr()

                is_healthy = self.imds_errorstate.is_triggered() is False
                logger.verbose("IMDS health: {0} [{1}]", is_healthy, response)

                self.health_service.report_imds_status(is_healthy, response)

            except Exception as e:
                msg = "Exception sending imds heartbeat: {0}".format(ustr(e))
                add_event(name=AGENT_NAME,
                          version=CURRENT_VERSION,
                          op=WALAEventOperation.ImdsHeartbeat,
                          is_success=False,
                          message=msg,
                          log_event=False)

            self.last_imds_heartbeat = datetime.datetime.utcnow()

    def send_host_plugin_heartbeat(self):
        """
        Send a health signal every HOST_PLUGIN_HEARTBEAT_PERIOD. The signal is 'Healthy' when we have been able to
        communicate with HostGAPlugin at least once in the last HOST_PLUGIN_HEALTH_PERIOD.
        """
        if self.last_host_plugin_heartbeat is None:
            self.last_host_plugin_heartbeat = datetime.datetime.utcnow(
            ) - MonitorHandler.HOST_PLUGIN_HEARTBEAT_PERIOD

        if datetime.datetime.utcnow() >= (
                self.last_host_plugin_heartbeat +
                MonitorHandler.HOST_PLUGIN_HEARTBEAT_PERIOD):
            try:
                host_plugin = self.protocol.client.get_host_plugin()
                host_plugin.ensure_initialized()
                is_currently_healthy = host_plugin.get_health()

                if is_currently_healthy:
                    self.host_plugin_errorstate.reset()
                else:
                    self.host_plugin_errorstate.incr()

                is_healthy = self.host_plugin_errorstate.is_triggered(
                ) is False
                logger.verbose("HostGAPlugin health: {0}", is_healthy)

                self.health_service.report_host_plugin_heartbeat(is_healthy)

            except Exception as e:
                msg = "Exception sending host plugin heartbeat: {0}".format(
                    ustr(e))
                add_event(name=AGENT_NAME,
                          version=CURRENT_VERSION,
                          op=WALAEventOperation.HostPluginHeartbeat,
                          is_success=False,
                          message=msg,
                          log_event=False)

            self.last_host_plugin_heartbeat = datetime.datetime.utcnow()

    def send_telemetry_heartbeat(self):

        if self.last_telemetry_heartbeat is None:
            self.last_telemetry_heartbeat = datetime.datetime.utcnow(
            ) - MonitorHandler.TELEMETRY_HEARTBEAT_PERIOD

        if datetime.datetime.utcnow() >= (
                self.last_telemetry_heartbeat +
                MonitorHandler.TELEMETRY_HEARTBEAT_PERIOD):
            try:
                incarnation = self.protocol.get_incarnation()
                dropped_packets = self.osutil.get_firewall_dropped_packets(
                    self.protocol.endpoint)
                msg = "{0};{1};{2};{3}".format(incarnation, self.counter,
                                               self.heartbeat_id,
                                               dropped_packets)

                add_event(name=AGENT_NAME,
                          version=CURRENT_VERSION,
                          op=WALAEventOperation.HeartBeat,
                          is_success=True,
                          message=msg,
                          log_event=False)

                self.counter += 1

                io_errors = IOErrorCounter.get_and_reset()
                hostplugin_errors = io_errors.get("hostplugin")
                protocol_errors = io_errors.get("protocol")
                other_errors = io_errors.get("other")

                if hostplugin_errors > 0 or protocol_errors > 0 or other_errors > 0:
                    msg = "hostplugin:{0};protocol:{1};other:{2}".format(
                        hostplugin_errors, protocol_errors, other_errors)
                    add_event(name=AGENT_NAME,
                              version=CURRENT_VERSION,
                              op=WALAEventOperation.HttpErrors,
                              is_success=True,
                              message=msg,
                              log_event=False)
            except Exception as e:
                logger.warn("Failed to send heartbeat: {0}", e)

            self.last_telemetry_heartbeat = datetime.datetime.utcnow()

    @staticmethod
    def init_cgroups():
        # Track metrics for the roll-up cgroup and for the agent cgroup
        try:
            CGroupsTelemetry.track_cgroup(CGroups.for_extension(""))
            CGroupsTelemetry.track_agent()
        except Exception as e:
            logger.error(
                "monitor: Exception tracking wrapper and agent: {0} [{1}]", e,
                traceback.format_exc())

    def send_cgroup_telemetry(self):
        if self.last_cgroup_telemetry is None:
            self.last_cgroup_telemetry = datetime.datetime.utcnow()

        if datetime.datetime.utcnow() >= (
                self.last_telemetry_heartbeat +
                MonitorHandler.CGROUP_TELEMETRY_PERIOD):
            try:
                for cgroup_name, metrics in CGroupsTelemetry.collect_all_tracked(
                ).items():
                    for metric_group, metric_name, value in metrics:
                        if value > 0:
                            report_metric(metric_group, metric_name,
                                          cgroup_name, value)
            except Exception as e:
                logger.warn("Failed to collect performance metrics: {0} [{1}]",
                            e, traceback.format_exc())

            # Look for extension cgroups we're not already tracking and track them
            try:
                CGroupsTelemetry.update_tracked(
                    self.protocol.client.get_current_handlers())
            except Exception as e:
                logger.warn(
                    "Monitor: updating tracked extensions raised {0}: {1}", e,
                    traceback.format_exc())

            self.last_cgroup_telemetry = datetime.datetime.utcnow()

    def log_altered_network_configuration(self):
        """
        Check various pieces of network configuration and, if altered since the last check, log the new state.
        """
        raw_route_list = self.osutil.read_route_table()
        digest = hash_strings(raw_route_list)
        if digest != self.last_route_table_hash:
            self.last_route_table_hash = digest
            route_list = self.osutil.get_list_of_routes(raw_route_list)
            logger.info("Route table: [{0}]".format(",".join(
                map(networkutil.RouteEntry.to_json, route_list))))

        nic_state = self.osutil.get_nic_state()
        if nic_state != self.last_nic_state:
            description = "Initial" if self.last_nic_state == {} else "Updated"
            logger.info("{0} NIC state: [{1}]".format(
                description, ", ".join(map(str, nic_state.values()))))
            self.last_nic_state = nic_state
Example #8
0
class ExtHandlersHandler(object):
    def __init__(self):
        self.protocol_util = get_protocol_util()
        self.protocol = None
        self.ext_handlers = None
        self.last_etag = None
        self.log_report = False
        self.log_etag = True
        self.log_process = False

        self.report_status_error_state = ErrorState()
        self.get_artifact_error_state = ErrorState(min_timedelta=ERROR_STATE_DELTA_INSTALL)

    def run(self):
        self.ext_handlers, etag = None, None
        try:
            self.protocol = self.protocol_util.get_protocol()
            self.ext_handlers, etag = self.protocol.get_ext_handlers()
            self.get_artifact_error_state.reset()
        except Exception as e:
            msg = u"Exception retrieving extension handlers: {0}".format(ustr(e))
            detailed_msg = '{0} {1}'.format(msg, traceback.format_exc())

            self.get_artifact_error_state.incr()

            if self.get_artifact_error_state.is_triggered():
                add_event(AGENT_NAME,
                          version=CURRENT_VERSION,
                          op=WALAEventOperation.GetArtifactExtended,
                          is_success=False,
                          message="Failed to get extension artifact for over "
                                  "{0): {1}".format(self.get_artifact_error_state.min_timedelta, msg))
                self.get_artifact_error_state.reset()
            else:
                logger.warn(msg)

            add_event(AGENT_NAME,
                      version=CURRENT_VERSION,
                      op=WALAEventOperation.ExtensionProcessing,
                      is_success=False,
                      message=detailed_msg)
            return

        try:
            msg = u"Handle extensions updates for incarnation {0}".format(etag)
            logger.verbose(msg)
            # Log status report success on new config
            self.log_report = True
            self.handle_ext_handlers(etag)
            self.last_etag = etag

            self.report_ext_handlers_status()
            self.cleanup_outdated_handlers()
        except Exception as e:
            msg = u"Exception processing extension handlers: {0}".format(
                ustr(e))
            logger.warn(msg)
            add_event(AGENT_NAME,
                      version=CURRENT_VERSION,
                      op=WALAEventOperation.ExtensionProcessing,
                      is_success=False,
                      message=msg)
            return

    def cleanup_outdated_handlers(self):
        handlers = []
        pkgs = []

        # Build a collection of uninstalled handlers and orphaned packages
        # Note:
        # -- An orphaned package is one without a corresponding handler
        #    directory
        for item in os.listdir(conf.get_lib_dir()):
            path = os.path.join(conf.get_lib_dir(), item)

            if version.is_agent_package(path) or version.is_agent_path(path):
                continue

            if os.path.isdir(path):
                if re.match(HANDLER_NAME_PATTERN, item) is None:
                    continue
                try:
                    eh = ExtHandler()

                    separator = item.rfind('-')

                    eh.name = item[0:separator]
                    eh.properties.version = str(FlexibleVersion(item[separator+1:]))

                    handler = ExtHandlerInstance(eh, self.protocol)
                except Exception:
                    continue
                if handler.get_handler_state() != ExtHandlerState.NotInstalled:
                    continue
                handlers.append(handler)

            elif os.path.isfile(path) and \
                    not os.path.isdir(path[0:-len(HANDLER_PKG_EXT)]):
                if not re.match(HANDLER_PKG_PATTERN, item):
                    continue
                pkgs.append(path)

        # Then, remove the orphaned packages
        for pkg in pkgs:
            try:
                os.remove(pkg)
                logger.verbose("Removed orphaned extension package {0}".format(pkg))
            except OSError as e:
                logger.warn("Failed to remove orphaned package {0}: {1}".format(pkg, e.strerror))

        # Finally, remove the directories and packages of the
        # uninstalled handlers
        for handler in handlers:
            handler.rm_ext_handler_dir()
            pkg = os.path.join(conf.get_lib_dir(), handler.get_full_name() + HANDLER_PKG_EXT)
            if os.path.isfile(pkg):
                try:
                    os.remove(pkg)
                    logger.verbose("Removed extension package {0}".format(pkg))
                except OSError as e:
                    logger.warn("Failed to remove extension package {0}: {1}".format(pkg, e.strerror))
   
    def handle_ext_handlers(self, etag=None):
        if not conf.get_extensions_enabled():
            logger.verbose("Extension handling is disabled")
            return

        if self.ext_handlers.extHandlers is None or \
                len(self.ext_handlers.extHandlers) == 0:
            logger.verbose("No extension handler config found")
            return

        if conf.get_enable_overprovisioning():
            artifacts_profile = self.protocol.get_artifacts_profile()
            if artifacts_profile and artifacts_profile.is_on_hold():
                logger.info("Extension handling is on hold")
                return

        self.ext_handlers.extHandlers.sort(key=operator.methodcaller('sort_key'))
        for ext_handler in self.ext_handlers.extHandlers:
            # TODO: handle install in sequence, enable in parallel
            self.handle_ext_handler(ext_handler, etag)
    
    def handle_ext_handler(self, ext_handler, etag):
        ext_handler_i = ExtHandlerInstance(ext_handler, self.protocol)

        try:
            state = ext_handler.properties.state
            if ext_handler_i.decide_version(target_state=state) is None:
                version = ext_handler_i.ext_handler.properties.version
                name = ext_handler_i.ext_handler.name
                err_msg = "Unable to find version {0} in manifest for extension {1}".format(version, name)
                ext_handler_i.set_operation(WALAEventOperation.Download)
                ext_handler_i.set_handler_status(message=ustr(err_msg), code=-1)
                ext_handler_i.report_event(message=ustr(err_msg), is_success=False)
                return

            self.get_artifact_error_state.reset()
            if not ext_handler_i.is_upgrade and self.last_etag == etag:
                if self.log_etag:
                    ext_handler_i.logger.verbose("Version {0} is current for etag {1}",
                                                 ext_handler_i.pkg.version,
                                                 etag)
                    self.log_etag = False
                return

            self.log_etag = True

            ext_handler_i.logger.info("Target handler state: {0}", state)
            if state == u"enabled":
                self.handle_enable(ext_handler_i)
            elif state == u"disabled":
                self.handle_disable(ext_handler_i)
            elif state == u"uninstall":
                self.handle_uninstall(ext_handler_i)
            else:
                message = u"Unknown ext handler state:{0}".format(state)
                raise ExtensionError(message)
        except ExtensionError as e:
            self.handle_handle_ext_handler_error(ext_handler_i, e, e.code)
        except Exception as e:
            self.handle_handle_ext_handler_error(ext_handler_i, e)

    def handle_handle_ext_handler_error(self, ext_handler_i, e, code=-1):
        msg = ustr(e)
        ext_handler_i.set_handler_status(message=msg, code=code)

        self.get_artifact_error_state.incr()
        if self.get_artifact_error_state.is_triggered():
            report_event(op=WALAEventOperation.GetArtifactExtended,
                         message="Failed to get artifact for over "
                                 "{0}: {1}".format(self.get_artifact_error_state.min_timedelta, msg),
                         is_success=False)
            self.get_artifact_error_state.reset()
        else:
            ext_handler_i.logger.warn(msg)
    
    def handle_enable(self, ext_handler_i):
        self.log_process = True
        old_ext_handler_i = ext_handler_i.get_installed_ext_handler()

        handler_state = ext_handler_i.get_handler_state()
        ext_handler_i.logger.info("[Enable] current handler state is: {0}",
                                  handler_state.lower())
        if handler_state == ExtHandlerState.NotInstalled:
            ext_handler_i.set_handler_state(ExtHandlerState.NotInstalled)
            ext_handler_i.download()
            ext_handler_i.update_settings()
            if old_ext_handler_i is None:
                ext_handler_i.install()
            elif ext_handler_i.version_ne(old_ext_handler_i):
                old_ext_handler_i.disable()
                ext_handler_i.copy_status_files(old_ext_handler_i)
                if ext_handler_i.version_gt(old_ext_handler_i):
                    ext_handler_i.update()
                else:
                    old_ext_handler_i.update(version=ext_handler_i.ext_handler.properties.version)
                old_ext_handler_i.uninstall()
                old_ext_handler_i.rm_ext_handler_dir()
                ext_handler_i.update_with_install()
        else:
            ext_handler_i.update_settings()

        ext_handler_i.enable() 

    def handle_disable(self, ext_handler_i):
        self.log_process = True
        handler_state = ext_handler_i.get_handler_state()
        ext_handler_i.logger.info("[Disable] current handler state is: {0}",
                                  handler_state.lower())
        if handler_state == ExtHandlerState.Enabled:
            ext_handler_i.disable()

    def handle_uninstall(self, ext_handler_i):
        self.log_process = True
        handler_state = ext_handler_i.get_handler_state()
        ext_handler_i.logger.info("[Uninstall] current handler state is: {0}",
                                  handler_state.lower())
        if handler_state != ExtHandlerState.NotInstalled:
            if handler_state == ExtHandlerState.Enabled:
                ext_handler_i.disable()
            ext_handler_i.uninstall()
        ext_handler_i.rm_ext_handler_dir()

    def report_ext_handlers_status(self):
        """
        Go through handler_state dir, collect and report status
        """
        vm_status = VMStatus(status="Ready", message="Guest Agent is running")
        if self.ext_handlers is not None:
            for ext_handler in self.ext_handlers.extHandlers:
                try:
                    self.report_ext_handler_status(vm_status, ext_handler)
                except ExtensionError as e:
                    add_event(
                        AGENT_NAME,
                        version=CURRENT_VERSION,
                        op=WALAEventOperation.ExtensionProcessing,
                        is_success=False,
                        message=ustr(e))

        logger.verbose("Report vm agent status")
        try:
            self.protocol.report_vm_status(vm_status)
            if self.log_report:
                logger.verbose("Completed vm agent status report")
            self.report_status_error_state.reset()
        except ProtocolNotFoundError as e:
            self.report_status_error_state.incr()
            message = "Failed to report vm agent status: {0}".format(e)
            logger.verbose(message)
        except ProtocolError as e:
            self.report_status_error_state.incr()
            message = "Failed to report vm agent status: {0}".format(e)
            add_event(AGENT_NAME,
                      version=CURRENT_VERSION,
                      op=WALAEventOperation.ExtensionProcessing,
                      is_success=False,
                      message=message)

        if self.report_status_error_state.is_triggered():
            message = "Failed to report vm agent status for more than {0}"\
                .format(self.report_status_error_state.min_timedelta)

            add_event(AGENT_NAME,
                      version=CURRENT_VERSION,
                      op=WALAEventOperation.ReportStatusExtended,
                      is_success=False,
                      message=message)

            self.report_status_error_state.reset()

    def report_ext_handler_status(self, vm_status, ext_handler):
        ext_handler_i = ExtHandlerInstance(ext_handler, self.protocol)
        
        handler_status = ext_handler_i.get_handler_status() 
        if handler_status is None:
            return

        handler_state = ext_handler_i.get_handler_state()
        if handler_state != ExtHandlerState.NotInstalled:
            try:
                active_exts = ext_handler_i.report_ext_status()
                handler_status.extensions.extend(active_exts)
            except ExtensionError as e:
                ext_handler_i.set_handler_status(message=ustr(e), code=e.code)

            try:
                heartbeat = ext_handler_i.collect_heartbeat()
                if heartbeat is not None:
                    handler_status.status = heartbeat.get('status')
            except ExtensionError as e:
                ext_handler_i.set_handler_status(message=ustr(e), code=e.code)

        vm_status.vmAgent.extensionHandlers.append(handler_status)
Example #9
0
class MonitorHandler(object):
    EVENT_COLLECTION_PERIOD = datetime.timedelta(minutes=1)
    TELEMETRY_HEARTBEAT_PERIOD = datetime.timedelta(minutes=30)
    CGROUP_TELEMETRY_PERIOD = datetime.timedelta(minutes=5)
    # host plugin
    HOST_PLUGIN_HEARTBEAT_PERIOD = datetime.timedelta(minutes=1)
    HOST_PLUGIN_HEALTH_PERIOD = datetime.timedelta(minutes=5)
    # imds
    IMDS_HEARTBEAT_PERIOD = datetime.timedelta(minutes=1)
    IMDS_HEALTH_PERIOD = datetime.timedelta(minutes=3)

    def __init__(self):
        self.osutil = get_osutil()
        self.protocol_util = get_protocol_util()
        self.imds_client = get_imds_client()

        self.event_thread = None
        self.last_event_collection = None
        self.last_telemetry_heartbeat = None
        self.last_cgroup_telemetry = None
        self.last_host_plugin_heartbeat = None
        self.last_imds_heartbeat = None
        self.protocol = None
        self.health_service = None
        self.last_route_table_hash = b''
        self.last_nic_state = {}

        self.counter = 0
        self.sysinfo = []
        self.should_run = True
        self.heartbeat_id = str(uuid.uuid4()).upper()
        self.host_plugin_errorstate = ErrorState(min_timedelta=MonitorHandler.HOST_PLUGIN_HEALTH_PERIOD)
        self.imds_errorstate = ErrorState(min_timedelta=MonitorHandler.IMDS_HEALTH_PERIOD)

    def run(self):
        self.init_protocols()
        self.init_sysinfo()
        self.init_cgroups()
        self.start()

    def stop(self):
        self.should_run = False
        if self.is_alive():
            self.event_thread.join()

    def init_protocols(self):
        self.protocol = self.protocol_util.get_protocol()
        self.health_service = HealthService(self.protocol.endpoint)

    def is_alive(self):
        return self.event_thread is not None and self.event_thread.is_alive()

    def start(self):
        self.event_thread = threading.Thread(target=self.daemon)
        self.event_thread.setDaemon(True)
        self.event_thread.start()

    def init_sysinfo(self):
        osversion = "{0}:{1}-{2}-{3}:{4}".format(platform.system(),
                                                 DISTRO_NAME,
                                                 DISTRO_VERSION,
                                                 DISTRO_CODE_NAME,
                                                 platform.release())
        self.sysinfo.append(TelemetryEventParam("OSVersion", osversion))
        self.sysinfo.append(
            TelemetryEventParam("GAVersion", CURRENT_AGENT))

        try:
            ram = self.osutil.get_total_mem()
            processors = self.osutil.get_processor_cores()
            self.sysinfo.append(TelemetryEventParam("RAM", ram))
            self.sysinfo.append(TelemetryEventParam("Processors", processors))
        except OSUtilError as e:
            logger.warn("Failed to get system info: {0}", e)

        try:
            vminfo = self.protocol.get_vminfo()
            self.sysinfo.append(TelemetryEventParam("VMName",
                                                    vminfo.vmName))
            self.sysinfo.append(TelemetryEventParam("TenantName",
                                                    vminfo.tenantName))
            self.sysinfo.append(TelemetryEventParam("RoleName",
                                                    vminfo.roleName))
            self.sysinfo.append(TelemetryEventParam("RoleInstanceName",
                                                    vminfo.roleInstanceName))
            self.sysinfo.append(TelemetryEventParam("ContainerId",
                                                    vminfo.containerId))
        except ProtocolError as e:
            logger.warn("Failed to get system info: {0}", e)

        try:
            vminfo = self.imds_client.get_compute()
            self.sysinfo.append(TelemetryEventParam('Location',
                                                    vminfo.location))
            self.sysinfo.append(TelemetryEventParam('SubscriptionId',
                                                    vminfo.subscriptionId))
            self.sysinfo.append(TelemetryEventParam('ResourceGroupName',
                                                    vminfo.resourceGroupName))
            self.sysinfo.append(TelemetryEventParam('VMId',
                                                    vminfo.vmId))
            self.sysinfo.append(TelemetryEventParam('ImageOrigin',
                                                    vminfo.image_origin))
        except (HttpError, ValueError) as e:
            logger.warn("failed to get IMDS info: {0}", e)

    def collect_event(self, evt_file_name):
        try:
            logger.verbose("Found event file: {0}", evt_file_name)
            with open(evt_file_name, "rb") as evt_file:
                # if fail to open or delete the file, throw exception
                data_str = evt_file.read().decode("utf-8", 'ignore')
            logger.verbose("Processed event file: {0}", evt_file_name)
            os.remove(evt_file_name)
            return data_str
        except IOError as e:
            msg = "Failed to process {0}, {1}".format(evt_file_name, e)
            raise EventError(msg)

    def collect_and_send_events(self):
        if self.last_event_collection is None:
            self.last_event_collection = datetime.datetime.utcnow() - MonitorHandler.EVENT_COLLECTION_PERIOD

        if datetime.datetime.utcnow() >= (self.last_event_collection + MonitorHandler.EVENT_COLLECTION_PERIOD):
            try:
                event_list = TelemetryEventList()
                event_dir = os.path.join(conf.get_lib_dir(), "events")
                event_files = os.listdir(event_dir)
                for event_file in event_files:
                    if not event_file.endswith(".tld"):
                        continue
                    event_file_path = os.path.join(event_dir, event_file)
                    try:
                        data_str = self.collect_event(event_file_path)
                    except EventError as e:
                        logger.error("{0}", e)
                        continue

                    try:
                        event = parse_event(data_str)
                        self.add_sysinfo(event)
                        event_list.events.append(event)
                    except (ValueError, ProtocolError) as e:
                        logger.warn("Failed to decode event file: {0}", e)
                        continue

                if len(event_list.events) == 0:
                    return

                try:
                    self.protocol.report_event(event_list)
                except ProtocolError as e:
                    logger.error("{0}", e)
            except Exception as e:
                logger.warn("Failed to send events: {0}", e)

            self.last_event_collection = datetime.datetime.utcnow()

    def daemon(self):
        min_delta = min(MonitorHandler.TELEMETRY_HEARTBEAT_PERIOD,
                        MonitorHandler.CGROUP_TELEMETRY_PERIOD,
                        MonitorHandler.EVENT_COLLECTION_PERIOD,
                        MonitorHandler.HOST_PLUGIN_HEARTBEAT_PERIOD,
                        MonitorHandler.IMDS_HEARTBEAT_PERIOD).seconds
        while self.should_run:
            self.send_telemetry_heartbeat()
            self.send_cgroup_telemetry()
            self.collect_and_send_events()
            self.send_host_plugin_heartbeat()
            self.send_imds_heartbeat()
            self.log_altered_network_configuration()
            time.sleep(min_delta)

    def add_sysinfo(self, event):
        sysinfo_names = [v.name for v in self.sysinfo]
        for param in event.parameters:
            if param.name in sysinfo_names:
                logger.verbose("Remove existing event parameter: [{0}:{1}]",
                               param.name,
                               param.value)
                event.parameters.remove(param)
        event.parameters.extend(self.sysinfo)

    def send_imds_heartbeat(self):
        """
        Send a health signal every IMDS_HEARTBEAT_PERIOD. The signal is 'Healthy' when we have
        successfully called and validated a response in the last IMDS_HEALTH_PERIOD.
        """

        if self.last_imds_heartbeat is None:
            self.last_imds_heartbeat = datetime.datetime.utcnow() - MonitorHandler.IMDS_HEARTBEAT_PERIOD

        if datetime.datetime.utcnow() >= (self.last_imds_heartbeat + MonitorHandler.IMDS_HEARTBEAT_PERIOD):
            try:
                is_currently_healthy, response = self.imds_client.validate()

                if is_currently_healthy:
                    self.imds_errorstate.reset()
                else:
                    self.imds_errorstate.incr()

                is_healthy = self.imds_errorstate.is_triggered() is False
                logger.verbose("IMDS health: {0} [{1}]", is_healthy, response)

                self.health_service.report_imds_status(is_healthy, response)

            except Exception as e:
                msg = "Exception sending imds heartbeat: {0}".format(ustr(e))
                add_event(
                    name=AGENT_NAME,
                    version=CURRENT_VERSION,
                    op=WALAEventOperation.ImdsHeartbeat,
                    is_success=False,
                    message=msg,
                    log_event=False)

            self.last_imds_heartbeat = datetime.datetime.utcnow()

    def send_host_plugin_heartbeat(self):
        """
        Send a health signal every HOST_PLUGIN_HEARTBEAT_PERIOD. The signal is 'Healthy' when we have been able to
        communicate with HostGAPlugin at least once in the last HOST_PLUGIN_HEALTH_PERIOD.
        """
        if self.last_host_plugin_heartbeat is None:
            self.last_host_plugin_heartbeat = datetime.datetime.utcnow() - MonitorHandler.HOST_PLUGIN_HEARTBEAT_PERIOD

        if datetime.datetime.utcnow() >= (
                self.last_host_plugin_heartbeat + MonitorHandler.HOST_PLUGIN_HEARTBEAT_PERIOD):
            try:
                host_plugin = self.protocol.client.get_host_plugin()
                host_plugin.ensure_initialized()
                is_currently_healthy = host_plugin.get_health()

                if is_currently_healthy:
                    self.host_plugin_errorstate.reset()
                else:
                    self.host_plugin_errorstate.incr()

                is_healthy = self.host_plugin_errorstate.is_triggered() is False
                logger.verbose("HostGAPlugin health: {0}", is_healthy)

                self.health_service.report_host_plugin_heartbeat(is_healthy)

                if not is_healthy:
                    add_event(
                        name=AGENT_NAME,
                        version=CURRENT_VERSION,
                        op=WALAEventOperation.HostPluginHeartbeatExtended,
                        is_success=False,
                        message='{0} since successful heartbeat'.format(self.host_plugin_errorstate.fail_time),
                        log_event=False)

            except Exception as e:
                msg = "Exception sending host plugin heartbeat: {0}".format(ustr(e))
                add_event(
                    name=AGENT_NAME,
                    version=CURRENT_VERSION,
                    op=WALAEventOperation.HostPluginHeartbeat,
                    is_success=False,
                    message=msg,
                    log_event=False)

            self.last_host_plugin_heartbeat = datetime.datetime.utcnow()

    def send_telemetry_heartbeat(self):

        if self.last_telemetry_heartbeat is None:
            self.last_telemetry_heartbeat = datetime.datetime.utcnow() - MonitorHandler.TELEMETRY_HEARTBEAT_PERIOD

        if datetime.datetime.utcnow() >= (self.last_telemetry_heartbeat + MonitorHandler.TELEMETRY_HEARTBEAT_PERIOD):
            try:
                incarnation = self.protocol.get_incarnation()
                dropped_packets = self.osutil.get_firewall_dropped_packets(self.protocol.endpoint)
                msg = "{0};{1};{2};{3}".format(incarnation, self.counter, self.heartbeat_id, dropped_packets)

                add_event(
                    name=AGENT_NAME,
                    version=CURRENT_VERSION,
                    op=WALAEventOperation.HeartBeat,
                    is_success=True,
                    message=msg,
                    log_event=False)

                self.counter += 1

                io_errors = IOErrorCounter.get_and_reset()
                hostplugin_errors = io_errors.get("hostplugin")
                protocol_errors = io_errors.get("protocol")
                other_errors = io_errors.get("other")

                if hostplugin_errors > 0 or protocol_errors > 0 or other_errors > 0:
                    msg = "hostplugin:{0};protocol:{1};other:{2}".format(hostplugin_errors,
                                                                         protocol_errors,
                                                                         other_errors)
                    add_event(
                        name=AGENT_NAME,
                        version=CURRENT_VERSION,
                        op=WALAEventOperation.HttpErrors,
                        is_success=True,
                        message=msg,
                        log_event=False)
            except Exception as e:
                logger.warn("Failed to send heartbeat: {0}", e)

            self.last_telemetry_heartbeat = datetime.datetime.utcnow()

    @staticmethod
    def init_cgroups():
        # Track metrics for the roll-up cgroup and for the agent cgroup
        try:
            CGroupsTelemetry.track_cgroup(CGroups.for_extension(""))
            CGroupsTelemetry.track_agent()
        except Exception as e:
            # when a hierarchy is not mounted, we raise an exception
            # and we should therefore only issue a warning, since this
            # is not unexpected
            logger.warn("Monitor: cgroups not initialized: {0}", ustr(e))
            logger.verbose(traceback.format_exc())

    def send_cgroup_telemetry(self):
        if self.last_cgroup_telemetry is None:
            self.last_cgroup_telemetry = datetime.datetime.utcnow()

        if datetime.datetime.utcnow() >= (self.last_telemetry_heartbeat + MonitorHandler.CGROUP_TELEMETRY_PERIOD):
            try:
                metric_reported, metric_threshold = CGroupsTelemetry.collect_all_tracked()
                for cgroup_name, metrics in metric_reported.items():
                    thresholds = metric_threshold[cgroup_name]

                    for metric_group, metric_name, value in metrics:
                        if value > 0:
                            report_metric(metric_group, metric_name, cgroup_name, value)

                        if metric_group == "Memory":
                            if value >= thresholds["memory"]:
                                msg = "CGroup {0}: Crossed the Memory Threshold. Current Value:{1}, Threshold:{2}.".format(
                                    cgroup_name, value, thresholds["memory"])
                                add_event(name=AGENT_NAME,
                                          version=CURRENT_VERSION,
                                          op=WALAEventOperation.CGroupsLimitsCrossed,
                                          is_success=True,
                                          message=msg,
                                          log_event=True)

                        if metric_group == "Process":
                            if value >= thresholds["cpu"]:
                                msg = "CGroup {0}: Crossed the Processor Threshold. Current Value:{1}, Threshold:{2}.".format(
                                    cgroup_name, value, thresholds["cpu"])
                                add_event(name=AGENT_NAME,
                                          version=CURRENT_VERSION,
                                          op=WALAEventOperation.CGroupsLimitsCrossed,
                                          is_success=True,
                                          message=msg,
                                          log_event=True)

            except Exception as e:
                logger.warn("Monitor: failed to collect cgroups performance metrics: {0}", ustr(e))
                logger.verbose(traceback.format_exc())

            # Look for extension cgroups we're not already tracking and track them
            try:
                CGroupsTelemetry.update_tracked(self.protocol.client.get_current_handlers())
            except Exception as e:
                logger.warn("Monitor: failed to update cgroups tracked extensions: {0}", ustr(e))
                logger.verbose(traceback.format_exc())

            self.last_cgroup_telemetry = datetime.datetime.utcnow()

    def log_altered_network_configuration(self):
        """
        Check various pieces of network configuration and, if altered since the last check, log the new state.
        """
        raw_route_list = self.osutil.read_route_table()
        digest = hash_strings(raw_route_list)
        if digest != self.last_route_table_hash:
            self.last_route_table_hash = digest
            route_list = self.osutil.get_list_of_routes(raw_route_list)
            logger.info("Route table: [{0}]".format(",".join(map(networkutil.RouteEntry.to_json, route_list))))

        nic_state = self.osutil.get_nic_state()
        if nic_state != self.last_nic_state:
            description = "Initial" if self.last_nic_state == {} else "Updated"
            logger.info("{0} NIC state: [{1}]".format(description, ", ".join(map(str, nic_state.values()))))
            self.last_nic_state = nic_state
class MonitorHandler(object):
    EVENT_COLLECTION_PERIOD = datetime.timedelta(minutes=1)
    TELEMETRY_HEARTBEAT_PERIOD = datetime.timedelta(minutes=30)
    # extension metrics period
    CGROUP_TELEMETRY_POLLING_PERIOD = datetime.timedelta(minutes=5)
    CGROUP_TELEMETRY_REPORTING_PERIOD = datetime.timedelta(minutes=30)
    # host plugin
    HOST_PLUGIN_HEARTBEAT_PERIOD = datetime.timedelta(minutes=1)
    HOST_PLUGIN_HEALTH_PERIOD = datetime.timedelta(minutes=5)
    # imds
    IMDS_HEARTBEAT_PERIOD = datetime.timedelta(minutes=1)
    IMDS_HEALTH_PERIOD = datetime.timedelta(minutes=3)

    # Resetting loggers period
    RESET_LOGGERS_PERIOD = datetime.timedelta(hours=12)

    def __init__(self):
        self.osutil = get_osutil()
        self.protocol_util = get_protocol_util()
        self.imds_client = get_imds_client()

        self.event_thread = None
        self.last_reset_loggers_time = None
        self.last_event_collection = None
        self.last_telemetry_heartbeat = None
        self.last_cgroup_polling_telemetry = None
        self.last_cgroup_report_telemetry = None
        self.last_host_plugin_heartbeat = None
        self.last_imds_heartbeat = None
        self.protocol = None
        self.health_service = None
        self.last_route_table_hash = b''
        self.last_nic_state = {}

        self.counter = 0
        self.sysinfo = []
        self.should_run = True
        self.heartbeat_id = str(uuid.uuid4()).upper()
        self.host_plugin_errorstate = ErrorState(
            min_timedelta=MonitorHandler.HOST_PLUGIN_HEALTH_PERIOD)
        self.imds_errorstate = ErrorState(
            min_timedelta=MonitorHandler.IMDS_HEALTH_PERIOD)

    def run(self):
        self.init_protocols()
        self.init_sysinfo()
        self.start()

    def stop(self):
        self.should_run = False
        if self.is_alive():
            self.event_thread.join()

    def init_protocols(self):
        self.protocol = self.protocol_util.get_protocol()
        self.health_service = HealthService(self.protocol.endpoint)

    def is_alive(self):
        return self.event_thread is not None and self.event_thread.is_alive()

    def start(self):
        self.event_thread = threading.Thread(target=self.daemon)
        self.event_thread.setDaemon(True)
        self.event_thread.setName("MonitorHandler")
        self.event_thread.start()

    def init_sysinfo(self):
        osversion = "{0}:{1}-{2}-{3}:{4}".format(platform.system(),
                                                 DISTRO_NAME, DISTRO_VERSION,
                                                 DISTRO_CODE_NAME,
                                                 platform.release())
        self.sysinfo.append(TelemetryEventParam("OSVersion", osversion))
        self.sysinfo.append(
            TelemetryEventParam("ExecutionMode", AGENT_EXECUTION_MODE))

        try:
            ram = self.osutil.get_total_mem()
            processors = self.osutil.get_processor_cores()
            self.sysinfo.append(TelemetryEventParam("RAM", ram))
            self.sysinfo.append(TelemetryEventParam("Processors", processors))
        except OSUtilError as e:
            logger.warn("Failed to get system info: {0}", ustr(e))

        try:
            vminfo = self.protocol.get_vminfo()
            self.sysinfo.append(TelemetryEventParam("VMName", vminfo.vmName))
            self.sysinfo.append(
                TelemetryEventParam("TenantName", vminfo.tenantName))
            self.sysinfo.append(
                TelemetryEventParam("RoleName", vminfo.roleName))
            self.sysinfo.append(
                TelemetryEventParam("RoleInstanceName",
                                    vminfo.roleInstanceName))
        except ProtocolError as e:
            logger.warn("Failed to get system info: {0}", ustr(e))

        try:
            vminfo = self.imds_client.get_compute()
            self.sysinfo.append(
                TelemetryEventParam('Location', vminfo.location))
            self.sysinfo.append(
                TelemetryEventParam('SubscriptionId', vminfo.subscriptionId))
            self.sysinfo.append(
                TelemetryEventParam('ResourceGroupName',
                                    vminfo.resourceGroupName))
            self.sysinfo.append(TelemetryEventParam('VMId', vminfo.vmId))
            self.sysinfo.append(
                TelemetryEventParam('ImageOrigin', vminfo.image_origin))
        except (HttpError, ValueError) as e:
            logger.warn("failed to get IMDS info: {0}", ustr(e))

    @staticmethod
    def collect_event(evt_file_name):
        try:
            logger.verbose("Found event file: {0}", evt_file_name)
            with open(evt_file_name, "rb") as evt_file:
                # if fail to open or delete the file, throw exception
                data_str = evt_file.read().decode("utf-8")
            logger.verbose("Processed event file: {0}", evt_file_name)
            os.remove(evt_file_name)
            return data_str
        except (IOError, UnicodeDecodeError) as e:
            os.remove(evt_file_name)
            msg = "Failed to process {0}, {1}".format(evt_file_name, e)
            raise EventError(msg)

    def collect_and_send_events(self):
        """
        Periodically read, parse, and send events located in the events folder. Currently, this is done every minute.
        Any .tld file dropped in the events folder will be emitted. These event files can be created either by the
        agent or the extensions. We don't have control over extension's events parameters, but we will override
        any values they might have set for sys_info parameters.
        """
        if self.last_event_collection is None:
            self.last_event_collection = datetime.datetime.utcnow(
            ) - MonitorHandler.EVENT_COLLECTION_PERIOD

        if datetime.datetime.utcnow() >= (
                self.last_event_collection +
                MonitorHandler.EVENT_COLLECTION_PERIOD):
            try:
                event_list = TelemetryEventList()
                event_dir = os.path.join(conf.get_lib_dir(), "events")
                event_files = os.listdir(event_dir)
                for event_file in event_files:
                    if not event_file.endswith(".tld"):
                        continue
                    event_file_path = os.path.join(event_dir, event_file)
                    try:
                        data_str = self.collect_event(event_file_path)
                    except EventError as e:
                        logger.error("{0}", ustr(e))
                        continue

                    try:
                        event = parse_event(data_str)
                        self.add_sysinfo(event)
                        event_list.events.append(event)
                    except (ValueError, ProtocolError) as e:
                        logger.warn("Failed to decode event file: {0}",
                                    ustr(e))
                        continue

                if len(event_list.events) == 0:
                    return

                try:
                    self.protocol.report_event(event_list)
                except ProtocolError as e:
                    logger.error("{0}", ustr(e))
            except Exception as e:
                logger.warn("Failed to send events: {0}", ustr(e))

            self.last_event_collection = datetime.datetime.utcnow()

    def daemon(self):
        min_delta = min(MonitorHandler.TELEMETRY_HEARTBEAT_PERIOD,
                        MonitorHandler.CGROUP_TELEMETRY_POLLING_PERIOD,
                        MonitorHandler.CGROUP_TELEMETRY_REPORTING_PERIOD,
                        MonitorHandler.EVENT_COLLECTION_PERIOD,
                        MonitorHandler.HOST_PLUGIN_HEARTBEAT_PERIOD,
                        MonitorHandler.IMDS_HEARTBEAT_PERIOD).seconds
        while self.should_run:
            self.send_telemetry_heartbeat()
            self.poll_telemetry_metrics()
            self.send_telemetry_metrics()
            self.collect_and_send_events()
            self.send_host_plugin_heartbeat()
            self.send_imds_heartbeat()
            self.log_altered_network_configuration()
            self.reset_loggers()
            time.sleep(min_delta)

    def reset_loggers(self):
        """
        The loggers maintain hash-tables in memory and they need to be cleaned up from time to time.
        For reference, please check azurelinuxagent.common.logger.Logger and
        azurelinuxagent.common.event.EventLogger classes
        """
        time_now = datetime.datetime.utcnow()
        if not self.last_reset_loggers_time:
            self.last_reset_loggers_time = time_now

        if time_now >= (self.last_reset_loggers_time +
                        MonitorHandler.RESET_LOGGERS_PERIOD):
            try:
                logger.reset_periodic()
            finally:
                self.last_reset_loggers_time = time_now

    def add_sysinfo(self, event):
        """
        This method is called after parsing the event file in the events folder and before emitting it. This means
        all events, either coming from the agent or from the extensions, are passed through this method. The purpose
        is to add a static list of sys_info parameters such as VMName, Region, RAM, etc. If the sys_info parameters
        are already populated in the event, they will be overwritten by the sys_info values obtained from the agent.
        Since the ContainerId parameter is only populated on the fly for the agent events because it is not a static
        sys_info parameter, an event coming from an extension will not have it, so we explicitly add it.
        :param event: Event to be enriched with sys_info parameters
        :return: Event with all parameters added, ready to be reported
        """
        sysinfo_names = [v.name for v in self.sysinfo]
        final_parameters = []

        # Refer: azurelinuxagent.common.event.EventLogger.add_default_parameters_to_event for agent specific values.
        #
        # Default fields are only populated by Agent and not the extension. Agent will fill up any event if they don't
        # have the default params. Example: GAVersion and ContainerId are populated for agent events on the fly,
        # but not for extension events. Add it if it's missing.
        default_values = [("ContainerId", get_container_id_from_env()),
                          ("GAVersion", CURRENT_AGENT), ("OpcodeName", ""),
                          ("EventTid", 0), ("EventPid", 0), ("TaskName", ""),
                          ("KeywordName", "")]

        for param in event.parameters:
            # Discard any sys_info parameters already in the event, since they will be overwritten
            if param.name in sysinfo_names:
                continue
            final_parameters.append(param)

        # Add sys_info params populated by the agent
        final_parameters.extend(self.sysinfo)

        for default_value in default_values:
            if default_value[0] not in event:
                final_parameters.append(
                    TelemetryEventParam(default_value[0], default_value[1]))

        event.parameters = final_parameters

    def send_imds_heartbeat(self):
        """
        Send a health signal every IMDS_HEARTBEAT_PERIOD. The signal is 'Healthy' when we have
        successfully called and validated a response in the last IMDS_HEALTH_PERIOD.
        """

        if self.last_imds_heartbeat is None:
            self.last_imds_heartbeat = datetime.datetime.utcnow(
            ) - MonitorHandler.IMDS_HEARTBEAT_PERIOD

        if datetime.datetime.utcnow() >= (
                self.last_imds_heartbeat +
                MonitorHandler.IMDS_HEARTBEAT_PERIOD):
            try:
                is_currently_healthy, response = self.imds_client.validate()

                if is_currently_healthy:
                    self.imds_errorstate.reset()
                else:
                    self.imds_errorstate.incr()

                is_healthy = self.imds_errorstate.is_triggered() is False
                logger.verbose("IMDS health: {0} [{1}]", is_healthy, response)

                self.health_service.report_imds_status(is_healthy, response)

            except Exception as e:
                msg = "Exception sending imds heartbeat: {0}".format(ustr(e))
                add_event(name=AGENT_NAME,
                          version=CURRENT_VERSION,
                          op=WALAEventOperation.ImdsHeartbeat,
                          is_success=False,
                          message=msg,
                          log_event=False)

            self.last_imds_heartbeat = datetime.datetime.utcnow()

    def send_host_plugin_heartbeat(self):
        """
        Send a health signal every HOST_PLUGIN_HEARTBEAT_PERIOD. The signal is 'Healthy' when we have been able to
        communicate with HostGAPlugin at least once in the last HOST_PLUGIN_HEALTH_PERIOD.
        """
        if self.last_host_plugin_heartbeat is None:
            self.last_host_plugin_heartbeat = datetime.datetime.utcnow(
            ) - MonitorHandler.HOST_PLUGIN_HEARTBEAT_PERIOD

        if datetime.datetime.utcnow() >= (
                self.last_host_plugin_heartbeat +
                MonitorHandler.HOST_PLUGIN_HEARTBEAT_PERIOD):
            try:
                host_plugin = self.protocol.client.get_host_plugin()
                host_plugin.ensure_initialized()
                is_currently_healthy = host_plugin.get_health()

                if is_currently_healthy:
                    self.host_plugin_errorstate.reset()
                else:
                    self.host_plugin_errorstate.incr()

                is_healthy = self.host_plugin_errorstate.is_triggered(
                ) is False
                logger.verbose("HostGAPlugin health: {0}", is_healthy)

                self.health_service.report_host_plugin_heartbeat(is_healthy)

                if not is_healthy:
                    add_event(
                        name=AGENT_NAME,
                        version=CURRENT_VERSION,
                        op=WALAEventOperation.HostPluginHeartbeatExtended,
                        is_success=False,
                        message='{0} since successful heartbeat'.format(
                            self.host_plugin_errorstate.fail_time),
                        log_event=False)

            except Exception as e:
                msg = "Exception sending host plugin heartbeat: {0}".format(
                    ustr(e))
                add_event(name=AGENT_NAME,
                          version=CURRENT_VERSION,
                          op=WALAEventOperation.HostPluginHeartbeat,
                          is_success=False,
                          message=msg,
                          log_event=False)

            self.last_host_plugin_heartbeat = datetime.datetime.utcnow()

    def send_telemetry_heartbeat(self):

        if self.last_telemetry_heartbeat is None:
            self.last_telemetry_heartbeat = datetime.datetime.utcnow(
            ) - MonitorHandler.TELEMETRY_HEARTBEAT_PERIOD

        if datetime.datetime.utcnow() >= (
                self.last_telemetry_heartbeat +
                MonitorHandler.TELEMETRY_HEARTBEAT_PERIOD):
            try:
                incarnation = self.protocol.get_incarnation()
                dropped_packets = self.osutil.get_firewall_dropped_packets(
                    self.protocol.endpoint)
                msg = "{0};{1};{2};{3}".format(incarnation, self.counter,
                                               self.heartbeat_id,
                                               dropped_packets)

                add_event(name=AGENT_NAME,
                          version=CURRENT_VERSION,
                          op=WALAEventOperation.HeartBeat,
                          is_success=True,
                          message=msg,
                          log_event=False)

                self.counter += 1

                io_errors = IOErrorCounter.get_and_reset()
                hostplugin_errors = io_errors.get("hostplugin")
                protocol_errors = io_errors.get("protocol")
                other_errors = io_errors.get("other")

                if hostplugin_errors > 0 or protocol_errors > 0 or other_errors > 0:
                    msg = "hostplugin:{0};protocol:{1};other:{2}".format(
                        hostplugin_errors, protocol_errors, other_errors)
                    add_event(name=AGENT_NAME,
                              version=CURRENT_VERSION,
                              op=WALAEventOperation.HttpErrors,
                              is_success=True,
                              message=msg,
                              log_event=False)
            except Exception as e:
                logger.warn("Failed to send heartbeat: {0}", ustr(e))

            self.last_telemetry_heartbeat = datetime.datetime.utcnow()

    def poll_telemetry_metrics(self):
        time_now = datetime.datetime.utcnow()
        if not self.last_cgroup_polling_telemetry:
            self.last_cgroup_polling_telemetry = time_now

        if time_now >= (self.last_cgroup_polling_telemetry +
                        MonitorHandler.CGROUP_TELEMETRY_POLLING_PERIOD):
            CGroupsTelemetry.poll_all_tracked()
            self.last_cgroup_polling_telemetry = time_now

    def send_telemetry_metrics(self):
        time_now = datetime.datetime.utcnow()

        if not self.last_cgroup_report_telemetry:
            self.last_cgroup_report_telemetry = time_now

        if time_now >= (self.last_cgroup_report_telemetry +
                        MonitorHandler.CGROUP_TELEMETRY_REPORTING_PERIOD):
            performance_metrics = CGroupsTelemetry.report_all_tracked()
            self.last_cgroup_report_telemetry = time_now

            if performance_metrics:
                message = generate_extension_metrics_telemetry_dictionary(
                    schema_version=1.0,
                    performance_metrics=performance_metrics)
                add_event(name=AGENT_NAME,
                          version=CURRENT_VERSION,
                          op=WALAEventOperation.ExtensionMetricsData,
                          is_success=True,
                          message=ustr(message),
                          log_event=False)

    def log_altered_network_configuration(self):
        """
        Check various pieces of network configuration and, if altered since the last check, log the new state.
        """
        raw_route_list = self.osutil.read_route_table()
        digest = hash_strings(raw_route_list)
        if digest != self.last_route_table_hash:
            self.last_route_table_hash = digest
            route_list = self.osutil.get_list_of_routes(raw_route_list)
            logger.info("Route table: [{0}]".format(",".join(
                map(networkutil.RouteEntry.to_json, route_list))))

        nic_state = self.osutil.get_nic_state()
        if nic_state != self.last_nic_state:
            description = "Initial" if self.last_nic_state == {} else "Updated"
            logger.info("{0} NIC state: [{1}]".format(
                description, ", ".join(map(str, nic_state.values()))))
            self.last_nic_state = nic_state
Example #11
0
class MonitorHandler(object):
    # telemetry
    EVENT_COLLECTION_PERIOD = datetime.timedelta(minutes=1)
    # host plugin
    HOST_PLUGIN_HEARTBEAT_PERIOD = datetime.timedelta(minutes=1)
    HOST_PLUGIN_HEALTH_PERIOD = datetime.timedelta(minutes=5)
    # imds
    IMDS_HEARTBEAT_PERIOD = datetime.timedelta(minutes=1)
    IMDS_HEALTH_PERIOD = datetime.timedelta(minutes=3)

    def __init__(self):
        self.osutil = get_osutil()
        self.imds_client = None

        self.event_thread = None
        self._periodic_operations = [
            ResetPeriodicLogMessagesOperation(),
            PeriodicOperation("collect_and_send_events",
                              self.collect_and_send_events,
                              self.EVENT_COLLECTION_PERIOD),
            ReportNetworkErrorsOperation(),
            PollResourceUsageOperation(),
            PeriodicOperation("send_host_plugin_heartbeat",
                              self.send_host_plugin_heartbeat,
                              self.HOST_PLUGIN_HEARTBEAT_PERIOD),
            PeriodicOperation("send_imds_heartbeat", self.send_imds_heartbeat,
                              self.IMDS_HEARTBEAT_PERIOD),
            ReportNetworkConfigurationChangesOperation(),
        ]
        self.protocol = None
        self.protocol_util = None
        self.health_service = None

        self.should_run = True
        self.heartbeat_id = str(uuid.uuid4()).upper()
        self.host_plugin_errorstate = ErrorState(
            min_timedelta=MonitorHandler.HOST_PLUGIN_HEALTH_PERIOD)
        self.imds_errorstate = ErrorState(
            min_timedelta=MonitorHandler.IMDS_HEALTH_PERIOD)

    def run(self):
        self.start(init_data=True)

    def stop(self):
        self.should_run = False
        if self.is_alive():
            self.join()

    def join(self):
        self.event_thread.join()

    def stopped(self):
        return not self.should_run

    def init_protocols(self):
        # The initialization of ProtocolUtil for the Monitor thread should be done within the thread itself rather
        # than initializing it in the ExtHandler thread. This is done to avoid any concurrency issues as each
        # thread would now have its own ProtocolUtil object as per the SingletonPerThread model.
        self.protocol_util = get_protocol_util()
        self.protocol = self.protocol_util.get_protocol()
        self.health_service = HealthService(self.protocol.get_endpoint())

    def init_imds_client(self):
        wireserver_endpoint = self.protocol_util.get_wireserver_endpoint()
        self.imds_client = get_imds_client(wireserver_endpoint)

    def is_alive(self):
        return self.event_thread is not None and self.event_thread.is_alive()

    def start(self, init_data=False):
        self.event_thread = threading.Thread(target=self.daemon,
                                             args=(init_data, ))
        self.event_thread.setDaemon(True)
        self.event_thread.setName("MonitorHandler")
        self.event_thread.start()

    def daemon(self, init_data=False):
        try:
            if init_data:
                self.init_protocols()
                self.init_imds_client()

            while not self.stopped():
                try:
                    self.protocol.update_host_plugin_from_goal_state()

                    for op in self._periodic_operations:
                        op.run()

                except Exception as e:
                    logger.error(
                        "An error occurred in the monitor thread main loop; will skip the current iteration.\n{0}",
                        ustr(e))
                finally:
                    PeriodicOperation.sleep_until_next_operation(
                        self._periodic_operations)
        except Exception as e:
            logger.error(
                "An error occurred in the monitor thread; will exit the thread.\n{0}",
                ustr(e))

    def collect_and_send_events(self):
        """
        Periodically send any events located in the events folder
        """
        event_list = collect_events()

        if len(event_list.events) > 0:
            self.protocol.report_event(event_list)

    def send_imds_heartbeat(self):
        """
        Send a health signal every IMDS_HEARTBEAT_PERIOD. The signal is 'Healthy' when we have
        successfully called and validated a response in the last IMDS_HEALTH_PERIOD.
        """
        try:
            is_currently_healthy, response = self.imds_client.validate()

            if is_currently_healthy:
                self.imds_errorstate.reset()
            else:
                self.imds_errorstate.incr()

            is_healthy = self.imds_errorstate.is_triggered() is False
            logger.verbose("IMDS health: {0} [{1}]", is_healthy, response)

            self.health_service.report_imds_status(is_healthy, response)

        except Exception as e:
            msg = "Exception sending imds heartbeat: {0}".format(ustr(e))
            add_event(name=AGENT_NAME,
                      version=CURRENT_VERSION,
                      op=WALAEventOperation.ImdsHeartbeat,
                      is_success=False,
                      message=msg,
                      log_event=False)

    def send_host_plugin_heartbeat(self):
        """
        Send a health signal every HOST_PLUGIN_HEARTBEAT_PERIOD. The signal is 'Healthy' when we have been able to
        communicate with HostGAPlugin at least once in the last HOST_PLUGIN_HEALTH_PERIOD.
        """
        try:
            host_plugin = self.protocol.client.get_host_plugin()
            host_plugin.ensure_initialized()
            is_currently_healthy = host_plugin.get_health()

            if is_currently_healthy:
                self.host_plugin_errorstate.reset()
            else:
                self.host_plugin_errorstate.incr()

            is_healthy = self.host_plugin_errorstate.is_triggered() is False
            logger.verbose("HostGAPlugin health: {0}", is_healthy)

            self.health_service.report_host_plugin_heartbeat(is_healthy)

            if not is_healthy:
                add_event(name=AGENT_NAME,
                          version=CURRENT_VERSION,
                          op=WALAEventOperation.HostPluginHeartbeatExtended,
                          is_success=False,
                          message='{0} since successful heartbeat'.format(
                              self.host_plugin_errorstate.fail_time),
                          log_event=False)

        except Exception as e:
            msg = "Exception sending host plugin heartbeat: {0}".format(
                ustr(e))
            add_event(name=AGENT_NAME,
                      version=CURRENT_VERSION,
                      op=WALAEventOperation.HostPluginHeartbeat,
                      is_success=False,
                      message=msg,
                      log_event=False)
class ExtHandlersHandler(object):
    def __init__(self):
        self.protocol_util = get_protocol_util()
        self.protocol = None
        self.ext_handlers = None
        self.last_etag = None
        self.log_report = False
        self.log_etag = True
        self.log_process = False

        self.report_status_error_state = ErrorState()
        self.get_artifact_error_state = ErrorState(
            min_timedelta=ERROR_STATE_DELTA_INSTALL)

    def run(self):
        self.ext_handlers, etag = None, None
        try:
            self.protocol = self.protocol_util.get_protocol()
            self.ext_handlers, etag = self.protocol.get_ext_handlers()
            self.get_artifact_error_state.reset()
        except Exception as e:
            msg = u"Exception retrieving extension handlers: {0}".format(
                ustr(e))
            detailed_msg = '{0} {1}'.format(msg, traceback.format_exc())

            self.get_artifact_error_state.incr()

            if self.get_artifact_error_state.is_triggered():
                add_event(AGENT_NAME,
                          version=CURRENT_VERSION,
                          op=WALAEventOperation.GetArtifactExtended,
                          is_success=False,
                          message="Failed to get extension artifact for over "
                          "{0): {1}".format(
                              self.get_artifact_error_state.min_timedelta,
                              msg))
                self.get_artifact_error_state.reset()
            else:
                logger.warn(msg)

            add_event(AGENT_NAME,
                      version=CURRENT_VERSION,
                      op=WALAEventOperation.ExtensionProcessing,
                      is_success=False,
                      message=detailed_msg)
            return

        try:
            msg = u"Handle extensions updates for incarnation {0}".format(etag)
            logger.verbose(msg)
            # Log status report success on new config
            self.log_report = True
            self.handle_ext_handlers(etag)
            self.last_etag = etag

            self.report_ext_handlers_status()
            self.cleanup_outdated_handlers()
        except Exception as e:
            msg = u"Exception processing extension handlers: {0}".format(
                ustr(e))
            logger.warn(msg)
            add_event(AGENT_NAME,
                      version=CURRENT_VERSION,
                      op=WALAEventOperation.ExtensionProcessing,
                      is_success=False,
                      message=msg)
            return

    def cleanup_outdated_handlers(self):
        handlers = []
        pkgs = []

        # Build a collection of uninstalled handlers and orphaned packages
        # Note:
        # -- An orphaned package is one without a corresponding handler
        #    directory
        for item in os.listdir(conf.get_lib_dir()):
            path = os.path.join(conf.get_lib_dir(), item)

            if version.is_agent_package(path) or version.is_agent_path(path):
                continue

            if os.path.isdir(path):
                if re.match(HANDLER_NAME_PATTERN, item) is None:
                    continue
                try:
                    eh = ExtHandler()

                    separator = item.rfind('-')

                    eh.name = item[0:separator]
                    eh.properties.version = str(
                        FlexibleVersion(item[separator + 1:]))

                    handler = ExtHandlerInstance(eh, self.protocol)
                except Exception:
                    continue
                if handler.get_handler_state() != ExtHandlerState.NotInstalled:
                    continue
                handlers.append(handler)

            elif os.path.isfile(path) and \
                    not os.path.isdir(path[0:-len(HANDLER_PKG_EXT)]):
                if not re.match(HANDLER_PKG_PATTERN, item):
                    continue
                pkgs.append(path)

        # Then, remove the orphaned packages
        for pkg in pkgs:
            try:
                os.remove(pkg)
                logger.verbose(
                    "Removed orphaned extension package {0}".format(pkg))
            except OSError as e:
                logger.warn(
                    "Failed to remove orphaned package {0}: {1}".format(
                        pkg, e.strerror))

        # Finally, remove the directories and packages of the
        # uninstalled handlers
        for handler in handlers:
            handler.rm_ext_handler_dir()
            pkg = os.path.join(conf.get_lib_dir(),
                               handler.get_full_name() + HANDLER_PKG_EXT)
            if os.path.isfile(pkg):
                try:
                    os.remove(pkg)
                    logger.verbose("Removed extension package {0}".format(pkg))
                except OSError as e:
                    logger.warn(
                        "Failed to remove extension package {0}: {1}".format(
                            pkg, e.strerror))

    def handle_ext_handlers(self, etag=None):
        if not conf.get_extensions_enabled():
            logger.verbose("Extension handling is disabled")
            return

        if self.ext_handlers.extHandlers is None or \
                len(self.ext_handlers.extHandlers) == 0:
            logger.verbose("No extension handler config found")
            return

        if conf.get_enable_overprovisioning():
            artifacts_profile = self.protocol.get_artifacts_profile()
            if artifacts_profile and artifacts_profile.is_on_hold():
                logger.info("Extension handling is on hold")
                return

        self.ext_handlers.extHandlers.sort(
            key=operator.methodcaller('sort_key'))
        for ext_handler in self.ext_handlers.extHandlers:
            # TODO: handle install in sequence, enable in parallel
            self.handle_ext_handler(ext_handler, etag)

    def handle_ext_handler(self, ext_handler, etag):
        ext_handler_i = ExtHandlerInstance(ext_handler, self.protocol)

        try:
            state = ext_handler.properties.state
            if ext_handler_i.decide_version(target_state=state) is None:
                version = ext_handler_i.ext_handler.properties.version
                name = ext_handler_i.ext_handler.name
                err_msg = "Unable to find version {0} in manifest for extension {1}".format(
                    version, name)
                ext_handler_i.set_operation(WALAEventOperation.Download)
                ext_handler_i.set_handler_status(message=ustr(err_msg),
                                                 code=-1)
                ext_handler_i.report_event(message=ustr(err_msg),
                                           is_success=False)
                return

            self.get_artifact_error_state.reset()
            if not ext_handler_i.is_upgrade and self.last_etag == etag:
                if self.log_etag:
                    ext_handler_i.logger.verbose(
                        "Version {0} is current for etag {1}",
                        ext_handler_i.pkg.version, etag)
                    self.log_etag = False
                return

            self.log_etag = True

            ext_handler_i.logger.info("Target handler state: {0}", state)
            if state == u"enabled":
                self.handle_enable(ext_handler_i)
            elif state == u"disabled":
                self.handle_disable(ext_handler_i)
            elif state == u"uninstall":
                self.handle_uninstall(ext_handler_i)
            else:
                message = u"Unknown ext handler state:{0}".format(state)
                raise ExtensionError(message)
        except ExtensionError as e:
            self.handle_handle_ext_handler_error(ext_handler_i, e, e.code)
        except Exception as e:
            self.handle_handle_ext_handler_error(ext_handler_i, e)

    def handle_handle_ext_handler_error(self, ext_handler_i, e, code=-1):
        msg = ustr(e)
        ext_handler_i.set_handler_status(message=msg, code=code)

        self.get_artifact_error_state.incr()
        if self.get_artifact_error_state.is_triggered():
            report_event(op=WALAEventOperation.GetArtifactExtended,
                         message="Failed to get artifact for over "
                         "{0}: {1}".format(
                             self.get_artifact_error_state.min_timedelta, msg),
                         is_success=False)
            self.get_artifact_error_state.reset()
        else:
            ext_handler_i.logger.warn(msg)

    def handle_enable(self, ext_handler_i):
        self.log_process = True
        old_ext_handler_i = ext_handler_i.get_installed_ext_handler()

        handler_state = ext_handler_i.get_handler_state()
        ext_handler_i.logger.info("[Enable] current handler state is: {0}",
                                  handler_state.lower())
        if handler_state == ExtHandlerState.NotInstalled:
            ext_handler_i.set_handler_state(ExtHandlerState.NotInstalled)
            ext_handler_i.download()
            ext_handler_i.update_settings()
            if old_ext_handler_i is None:
                ext_handler_i.install()
            elif ext_handler_i.version_ne(old_ext_handler_i):
                old_ext_handler_i.disable()
                ext_handler_i.copy_status_files(old_ext_handler_i)
                if ext_handler_i.version_gt(old_ext_handler_i):
                    ext_handler_i.update()
                else:
                    old_ext_handler_i.update(
                        version=ext_handler_i.ext_handler.properties.version)
                old_ext_handler_i.uninstall()
                old_ext_handler_i.rm_ext_handler_dir()
                ext_handler_i.update_with_install()
        else:
            ext_handler_i.update_settings()

        ext_handler_i.enable()

    def handle_disable(self, ext_handler_i):
        self.log_process = True
        handler_state = ext_handler_i.get_handler_state()
        ext_handler_i.logger.info("[Disable] current handler state is: {0}",
                                  handler_state.lower())
        if handler_state == ExtHandlerState.Enabled:
            ext_handler_i.disable()

    def handle_uninstall(self, ext_handler_i):
        self.log_process = True
        handler_state = ext_handler_i.get_handler_state()
        ext_handler_i.logger.info("[Uninstall] current handler state is: {0}",
                                  handler_state.lower())
        if handler_state != ExtHandlerState.NotInstalled:
            if handler_state == ExtHandlerState.Enabled:
                ext_handler_i.disable()
            ext_handler_i.uninstall()
        ext_handler_i.rm_ext_handler_dir()

    def report_ext_handlers_status(self):
        """
        Go through handler_state dir, collect and report status
        """
        vm_status = VMStatus(status="Ready", message="Guest Agent is running")
        if self.ext_handlers is not None:
            for ext_handler in self.ext_handlers.extHandlers:
                try:
                    self.report_ext_handler_status(vm_status, ext_handler)
                except ExtensionError as e:
                    add_event(AGENT_NAME,
                              version=CURRENT_VERSION,
                              op=WALAEventOperation.ExtensionProcessing,
                              is_success=False,
                              message=ustr(e))

        logger.verbose("Report vm agent status")
        try:
            self.protocol.report_vm_status(vm_status)
            if self.log_report:
                logger.verbose("Completed vm agent status report")
            self.report_status_error_state.reset()
        except ProtocolNotFoundError as e:
            self.report_status_error_state.incr()
            message = "Failed to report vm agent status: {0}".format(e)
            logger.verbose(message)
        except ProtocolError as e:
            self.report_status_error_state.incr()
            message = "Failed to report vm agent status: {0}".format(e)
            add_event(AGENT_NAME,
                      version=CURRENT_VERSION,
                      op=WALAEventOperation.ExtensionProcessing,
                      is_success=False,
                      message=message)

        if self.report_status_error_state.is_triggered():
            message = "Failed to report vm agent status for more than {0}"\
                .format(self.report_status_error_state.min_timedelta)

            add_event(AGENT_NAME,
                      version=CURRENT_VERSION,
                      op=WALAEventOperation.ReportStatusExtended,
                      is_success=False,
                      message=message)

            self.report_status_error_state.reset()

    def report_ext_handler_status(self, vm_status, ext_handler):
        ext_handler_i = ExtHandlerInstance(ext_handler, self.protocol)

        handler_status = ext_handler_i.get_handler_status()
        if handler_status is None:
            return

        handler_state = ext_handler_i.get_handler_state()
        if handler_state != ExtHandlerState.NotInstalled:
            try:
                active_exts = ext_handler_i.report_ext_status()
                handler_status.extensions.extend(active_exts)
            except ExtensionError as e:
                ext_handler_i.set_handler_status(message=ustr(e), code=e.code)

            try:
                heartbeat = ext_handler_i.collect_heartbeat()
                if heartbeat is not None:
                    handler_status.status = heartbeat.get('status')
            except ExtensionError as e:
                ext_handler_i.set_handler_status(message=ustr(e), code=e.code)

        vm_status.vmAgent.extensionHandlers.append(handler_status)