Esempio n. 1
0
    def run(self, child_args=None):
        """
        Log version information, prepare the process environment and then keep invoking the daemon.

        self.daemon() is called repeatedly for as long as self.running is true. When it ends with an
        exception, the failure is reported as an UnhandledError event and the loop pauses for 15 seconds
        before the next attempt.

        :param child_args: forwarded unchanged to self.daemon()
        """
        #
        # NOTE: The Container ID included in telemetry events is taken from the goal state, and the goal
        # state can only be fetched after protocol detection (done during provisioning). Telemetry events
        # emitted before that point will not include the Container ID.
        #
        startup_banner = (
            ("{0} Version:{1}", (AGENT_LONG_NAME, AGENT_VERSION)),
            ("OS: {0} {1}", (DISTRO_NAME, DISTRO_VERSION)),
            ("Python: {0}.{1}.{2}", (PY_VERSION_MAJOR, PY_VERSION_MINOR, PY_VERSION_MICRO)),
        )
        for template, values in startup_banner:
            logger.info(template, *values)

        self.check_pid()
        self.initialize_environment()

        # When FIPS is enabled, export the OpenSSL FIPS variable.
        # Subprocesses inherit the current environment.
        if conf.get_fips_enabled():
            os.environ[OPENSSL_FIPS_ENVIRONMENT] = "1"

        while self.running:
            try:
                self.daemon(child_args)
            except Exception as daemon_error:
                failure_details = textutil.format_exception(daemon_error)
                add_event(
                    name=AGENT_NAME,
                    is_success=False,
                    message=ustr(failure_details),
                    op=WALAEventOperation.UnhandledError)
                logger.warn("Daemon ended with exception -- Sleep 15 seconds and restart daemon")
                time.sleep(15)
    def __log_network_setup_service_logs(self):
        """
        Fetch the journalctl output of the network setup service and report it in a telemetry event.

        The event's success flag reflects whether the service was found to have failed; the event's
        message is either the service logs or a description of why the logs could not be fetched.
        """
        # journalctl reference: https://www.freedesktop.org/software/systemd/man/journalctl.html
        journalctl_cmd = ["journalctl", "-u", self._network_setup_service_name, "-b", "--utc"]
        service_failed = self.__verify_network_setup_service_failed()
        try:
            journal_output = shellutil.run_command(journalctl_cmd)
            event_message = ustr("Logs from the {0} since system boot:\n {1}").format(
                self._network_setup_service_name, journal_output)
            logger.info(event_message)
        except CommandError as cmd_error:
            event_message = "Unable to fetch service logs, Command: {0} failed with ExitCode: {1}\nStdout: {2}\nStderr: {3}".format(
                ' '.join(journalctl_cmd), cmd_error.returncode, cmd_error.stdout, cmd_error.stderr)
            logger.warn(event_message)
        except Exception as unexpected_error:
            event_message = "Ran into unexpected error when getting logs for {0} service. Error: {1}".format(
                self._network_setup_service_name, textutil.format_exception(unexpected_error))
            logger.warn(event_message)

        # The event carries the service logs when they could be fetched, otherwise the fetch error.
        # log_event=False: the message has already been written to the local log above.
        add_event(
            op=WALAEventOperation.PersistFirewallRules,
            is_success=(not service_failed),
            message=event_message,
            log_event=False)
Esempio n. 3
0
    def test_no_primary_does_not_throw(self):
        """
        get_first_if() must not raise when _get_net_info is patched to return a fixed
        (interface, ip, mac) tuple.
        """
        freebsdosutil = FreeBSDOSUtil()

        with patch.object(freebsdosutil, '_get_net_info', return_value=('em0', '10.0.0.1', 'e5:f0:38:aa:da:52')):
            exception = False
            try:
                freebsdosutil.get_first_if()[0]
            except Exception as e:
                print(textutil.format_exception(e))
                exception = True
            # Previously the flag was set but never checked, so the test could never fail.
            self.assertFalse(exception)
Esempio n. 4
0
 def test_no_primary_does_not_throw(self):
     """
     get_first_if() must not raise when get_primary_interface reports an empty interface name.
     """
     with patch.object(osutil.DefaultOSUtil, 'get_primary_interface', return_value=''):
         raised = False
         try:
             osutil.DefaultOSUtil().get_first_if()[0]
         except Exception as error:
             print(textutil.format_exception(error))
             raised = True
         self.assertFalse(raised)
 def _get_events_in_queue(self, first_event):
     """
     Generator that yields first_event and then every event it can take from self._queue
     without blocking, marking each one as done as it is taken.

     Errors raised while fetching or yielding an event are logged and do not end the iteration.
     """
     yield first_event
     while True:
         if self._queue.empty():
             break
         try:
             queued_event = self._queue.get_nowait()
             self._queue.task_done()
             yield queued_event
         except Exception as fetch_error:
             details = textutil.format_exception(fetch_error)
             logger.error("Some exception when fetching event from queue: {0}".format(details))
    def __init__(self, etag, json_text):
        """
        Build the extensions goal state from a vmSettings document.

        :param etag: stored as the id of this goal state and included in error messages
        :param json_text: text of the vmSettings document; stored and then parsed
        :raises VmSettingsError: if parsing or the common validations fail
        """
        super(_ExtensionsGoalStateFromVmSettings, self).__init__()
        self._id = etag
        self._text = json_text

        try:
            self._parse_vm_settings(json_text)
            self._do_common_validations()
        except Exception as parse_error:
            # The redacted text is appended to the message to help diagnose the failure.
            failure = "Error parsing vmSettings (etag: {0}): {1}\n{2}".format(
                etag, format_exception(parse_error), self.get_redacted_text())
            raise VmSettingsError(failure)
    def __init__(self, incarnation, xml_text):
        """
        Build the extensions goal state from an ExtensionsConfig document.

        :param incarnation: stored as the id of this goal state and included in error messages
        :param xml_text: text of the ExtensionsConfig document; stored and then parsed
        :raises ExtensionsConfigError: if parsing or the common validations fail
        """
        super(_ExtensionsGoalStateFromExtensionsConfig, self).__init__()
        self._id = incarnation
        self._text = xml_text

        try:
            self._parse_extensions_config(xml_text)
            self._do_common_validations()
        except Exception as parse_error:
            # The redacted text is appended to the message to help diagnose the failure.
            failure = "Error parsing ExtensionsConfig (incarnation: {0}): {1}\n{2}".format(
                incarnation, format_exception(parse_error), self.get_redacted_text())
            raise ExtensionsConfigError(failure)
Esempio n. 8
0
 def run(self):
     """
     Fetch the remote access information and handle it, but only when JIT is enabled on the OS util.

     Any exception is reported as a failed RemoteAccessHandling event instead of being propagated.
     """
     try:
         if not self._os_util.jit_enabled:
             return

         # Handle remote access if any.
         self._remote_access = self._protocol.client.get_remote_access()
         self._handle_remote_access()
     except Exception as error:
         failure = u"Exception processing goal state for remote access users: {0}".format(
             textutil.format_exception(error))
         add_event(
             AGENT_NAME,
             version=CURRENT_VERSION,
             op=WALAEventOperation.RemoteAccessHandling,
             is_success=False,
             message=failure)
    def _process_telemetry_thread(self):
        """
        Main loop of the thread: wait for events and send them until the handler has been stopped
        and the queue is empty.

        An exception escaping the loop ends the thread and is reported as an UnhandledError event.
        """
        thread_started_msg = "Successfully started the {0} thread".format(self.get_thread_name())
        logger.info(thread_started_msg)
        try:
            # Processing starts as soon as data is available in the queue. In the worst case the wait is
            # re-checked every SendTelemetryEventsHandler._MAX_TIMEOUT secs to avoid uninterruptible waits.
            # If the service is stopped while events are still queued, they are sent before the thread ends.
            while not (self.stopped() and self._queue.empty()):
                first_event = self._wait_for_event_in_queue()
                if not first_event:
                    # Nothing was retrieved from the queue; go back to waiting.
                    continue
                self._send_events_in_queue(first_event)

        except Exception as loop_error:
            failure = "An unknown error occurred in the {0} thread main loop, stopping thread.{1}".format(
                self.get_thread_name(), textutil.format_exception(loop_error))
            add_event(
                op=WALAEventOperation.UnhandledError,
                message=failure,
                is_success=False)
Esempio n. 10
0
def main(args=None):
    """
    Entry point: parse the command line (via parse_args) and run the requested agent command.

    When args is None or empty, the arguments are taken from sys.argv[1:]. Version, Help and Start are
    handled directly; every other command is executed on an Agent instance, and any exception raised
    while doing so is logged rather than propagated.
    """
    if args is None or len(args) == 0:
        args = sys.argv[1:]

    (command, force, verbose, debug, conf_file_path,
     log_collector_full_mode, firewall_metadata) = parse_args(args)

    # Commands that do not need an Agent instance.
    if command == AgentCommands.Version:
        version()
        return
    if command == AgentCommands.Help:
        print(usage())
        return
    if command == AgentCommands.Start:
        start(conf_file_path=conf_file_path)
        return

    try:
        agent = Agent(verbose, conf_file_path=conf_file_path)
        # Ordered (command, action) pairs; the first pair equal to the requested command is executed.
        agent_actions = (
            (AgentCommands.DeprovisionUser, lambda: agent.deprovision(force, deluser=True)),
            (AgentCommands.Deprovision, lambda: agent.deprovision(force, deluser=False)),
            (AgentCommands.Provision, lambda: agent.provision()),
            (AgentCommands.RegisterService, lambda: agent.register_service()),
            (AgentCommands.Daemon, lambda: agent.daemon()),
            (AgentCommands.RunExthandlers, lambda: agent.run_exthandlers(debug)),
            (AgentCommands.ShowConfig, lambda: agent.show_configuration()),
            (AgentCommands.CollectLogs, lambda: agent.collect_logs(log_collector_full_mode)),
            (AgentCommands.SetupFirewall, lambda: agent.setup_firewall(firewall_metadata)),
        )
        for known_command, action in agent_actions:
            if command == known_command:
                action()
                break
    except Exception as error:
        logger.error(u"Failed to run '{0}': {1}", command,
                     textutil.format_exception(error))
    def _operation(self):
        """
        Run one collection pass over the extension event directories.

        The pass is skipped entirely when the send-telemetry handler is stopped. Otherwise every
        (handler name, events directory) pair is handed to _capture_extension_events, and afterwards
        the event directories are emptied unless a ServiceStoppedError interrupted the pass.
        """
        telemetry_handler = self._send_telemetry_events_handler
        if telemetry_handler.stopped():
            logger.warn("{0} service is not running, skipping current iteration".format(
                telemetry_handler.get_thread_name()))
            return

        should_purge_event_dirs = True
        extension_handler_with_event_dirs = []

        try:
            extension_handler_with_event_dirs = self._get_extension_events_dir_with_handler_name(
                conf.get_ext_log_dir())

            if not extension_handler_with_event_dirs:
                logger.verbose("No Extension events directory exist")
                return

            for handler_entry in extension_handler_with_event_dirs:
                # Each entry holds the handler name at index 0 and its events directory at index 1.
                self._capture_extension_events(handler_entry[0], handler_entry[1])
        except ServiceStoppedError:
            # The service stopped: keep the extension files so that sending can be retried once the
            # telemetry service is back up.
            should_purge_event_dirs = False
        except Exception as collect_error:
            failure = "Unknown error occurred when trying to collect extension events:{0}".format(
                textutil.format_exception(collect_error))
            add_event(
                op=WALAEventOperation.ExtensionTelemetryEventProcessing,
                message=failure,
                is_success=False)
        finally:
            # The events directories are emptied on every run, even if an error prevented them from
            # being processed; the only exception is when the Telemetry Service is stopped.
            if should_purge_event_dirs:
                self._ensure_all_events_directories_empty(extension_handler_with_event_dirs)
Esempio n. 12
0
def read_response_error(resp):
    """
    Build an ASCII-only description of a failed HTTP response.

    :param resp: response object exposing status, reason and read(), or None
    :return: '' when resp is None; otherwise the text built from the response. If an error occurs
             while building the text, the error is logged and whatever was built so far is returned.
    """
    if resp is None:
        return ''

    result = ''
    try:
        result = "[HTTP Failed] [{0}: {1}] {2}".format(resp.status, resp.reason, resp.read())

        # The returned string is passed upstream to several methods which raise HttpError() or
        # format() it in some way, so it cannot contain any unicode characters.
        if PY_VERSION_MAJOR >= 3:
            ascii_bytes = result.encode(encoding='ascii', errors='ignore')
            result = ascii_bytes.decode(encoding='ascii', errors='ignore')
        else:
            result = ustr(result, encoding='ascii', errors='ignore')

        result = textutil.replace_non_ascii(result)

    except Exception as error:
        logger.warn(textutil.format_exception(error))
    return result
    def _capture_extension_events(self, handler_name, handler_event_dir_path):
        """
        Process the event files in a handler's events directory, passing each one to
        self._enqueue_events_and_get_count.

        Only files whose names match _EXTENSION_EVENT_FILE_NAME_REGEX are considered. Every file that is
        visited is removed from disk afterwards, whether or not it was processed successfully. Processing
        stops once the per-period maximum number of events for the handler has been reached.
        :param handler_name: Complete Handler Name. Eg: Microsoft.CPlat.Core.RunCommandLinux
        :param handler_event_dir_path: Full path. Eg: '/var/log/azure/Microsoft.CPlat.Core.RunCommandLinux/events'
        :raises ServiceStoppedError: re-raised unchanged when raised while an event file is being processed
        """

        # Filter out the files that do not follow the pre-defined EXTENSION_EVENT_FILE_NAME_REGEX
        event_files = [
            event_file
            for event_file in os.listdir(handler_event_dir_path) if re.match(
                self._EXTENSION_EVENT_FILE_NAME_REGEX, event_file) is not None
        ]
        # Pick the latest files first, we'll discard older events if len(events) > MAX_EVENT_COUNT
        # NOTE(review): this is a descending sort on file names; it assumes names sort chronologically - confirm.
        event_files.sort(reverse=True)

        # Running total of captured events for this handler, updated after each file.
        captured_extension_events_count = 0
        # Drop reason -> number of events dropped for that reason; passed to the enqueue helper below.
        dropped_events_with_error_count = defaultdict(int)

        try:
            for event_file in event_files:

                event_file_path = os.path.join(handler_event_dir_path,
                                               event_file)
                try:
                    logger.verbose("Processing event file: {0}",
                                   event_file_path)

                    # Files rejected by the size check are skipped; the inner 'finally' still removes them.
                    if not self._event_file_size_allowed(event_file_path):
                        continue

                    # We support multiple events in a file, read the file and parse events.
                    captured_extension_events_count = self._enqueue_events_and_get_count(
                        handler_name, event_file_path,
                        captured_extension_events_count,
                        dropped_events_with_error_count)

                    # We only allow MAX_NUMBER_OF_EVENTS_PER_EXTENSION_PER_PERIOD=300 maximum events per period per handler
                    if captured_extension_events_count >= self._MAX_NUMBER_OF_EVENTS_PER_EXTENSION_PER_PERIOD:
                        msg = "Reached max count for the extension: {0}; Max Limit: {1}. Skipping the rest.".format(
                            handler_name, self.
                            _MAX_NUMBER_OF_EVENTS_PER_EXTENSION_PER_PERIOD)
                        logger.warn(msg)
                        add_log_event(level=logger.LogLevel.WARNING,
                                      message=msg,
                                      forced=True)
                        # The remaining files are neither processed nor removed by this method.
                        break
                except ServiceStoppedError:
                    # Not logging here as already logged once, re-raising
                    # Since we already started processing this file, deleting it as we could've already sent some events out
                    # This is a trade-off between data replication vs data loss.
                    raise
                except Exception as error:
                    # A failure in one file is reported and the loop moves on to the next file.
                    msg = "Failed to process event file {0}:{1}".format(
                        event_file, textutil.format_exception(error))
                    logger.warn(msg)
                    add_log_event(level=logger.LogLevel.WARNING,
                                  message=msg,
                                  forced=True)
                finally:
                    # Todo: We should delete files after ensuring that we sent the data to Wireserver successfully
                    # from our end rather than deleting first and sending later. This is to ensure the data reliability
                    # of the agent telemetry pipeline.
                    # NOTE: an exception raised by os.remove propagates out of this method from here.
                    os.remove(event_file_path)

        finally:
            # The summary below is emitted on every exit path, including when an exception is propagating.
            if dropped_events_with_error_count:
                msg = "Dropped events for Extension: {0}; Details:\n\t{1}".format(
                    handler_name, '\n\t'.join([
                        "Reason: {0}; Dropped Count: {1}".format(k, v)
                        for k, v in dropped_events_with_error_count.items()
                    ]))
                logger.warn(msg)
                add_log_event(level=logger.LogLevel.WARNING,
                              message=msg,
                              forced=True)

            if captured_extension_events_count > 0:
                logger.info("Collected {0} events for extension: {1}".format(
                    captured_extension_events_count, handler_name))