Example #1
def report_status(output, status, current_time):
    print >> output, 'Scalyr Agent status.  See https://www.scalyr.com/help/scalyr-agent-2 for help'
    print >> output, ''
    print >> output, 'Current time:     %s' % scalyr_util.format_time(
        current_time)
    print >> output, 'Agent started at: %s' % scalyr_util.format_time(
        status.launch_time)
    print >> output, 'Version:          %s' % status.version
    print >> output, 'Agent running as: %s' % status.user
    print >> output, 'Agent log:        %s' % status.log_path
    print >> output, 'ServerHost:       %s' % status.server_host
    print >> output, ''
    server = status.scalyr_server
    # We default to https://agent.scalyr.com for the Scalyr server, but to see the status on the web,
    # you should go to https://www.scalyr.com.  So, we do a little clean up before sticking it in
    # the URL.  Same goes for https://log.scalyr.com -- it really is just https://www.scalyr.com
    server = server.replace('https://agent.', 'https://www.')
    server = server.replace('https://log.', 'https://www.')
    print >> output, 'View data from this agent at: %s/events?filter=$serverHost%%3D%%27%s%%27' % (
        server, status.server_host)
    print >> output, ''
    print >> output, ''

    # Configuration file status:
    print >> output, 'Agent configuration:'
    print >> output, '===================='
    print >> output, ''
    if len(status.config_status.additional_paths) == 0:
        print >> output, 'Configuration file:    %s' % status.config_status.path
    else:
        print >> output, 'Configuration files:   %s' % status.config_status.path
        for x in status.config_status.additional_paths:
            print >> output, '                       %s' % x

    if status.config_status.last_error is None:
        print >> output, 'Status:                Good (files parsed successfully)'
    else:
        print >> output, 'Status:                Bad (could not parse, using last good version)'
    print >> output, 'Last checked:          %s' % scalyr_util.format_time(
        status.config_status.last_check_time)
    print >> output, 'Last changed observed: %s' % scalyr_util.format_time(
        status.config_status.last_read_time)

    if status.config_status.last_error is not None:
        print >> output, 'Parsing error:         %s' % str(
            status.config_status.last_error)

    if status.copying_manager_status is not None:
        print >> output, ''
        print >> output, ''
        __report_copying_manager(output, status.copying_manager_status,
                                 status.log_path,
                                 status.config_status.last_read_time)

    if status.monitor_manager_status is not None:
        print >> output, ''
        print >> output, ''
        __report_monitor_manager(output, status.monitor_manager_status,
                                 status.config_status.last_read_time)
Example #2
def report_status(output, status, current_time):
    print >>output, 'Scalyr Agent status.  See https://www.scalyr.com/help/scalyr-agent-2 for help'
    print >>output, ''
    print >>output, 'Current time:     %s' % scalyr_util.format_time(current_time)
    print >>output, 'Agent started at: %s' % scalyr_util.format_time(status.launch_time)
    print >>output, 'Version:          %s' % status.version
    print >>output, 'Agent running as: %s' % status.user
    print >>output, 'Agent log:        %s' % status.log_path
    print >>output, 'ServerHost:       %s' % status.server_host
    print >>output, ''
    server = status.scalyr_server
    # We default to https://agent.scalyr.com for the Scalyr server, but to see the status on the web,
    # you should go to https://www.scalyr.com.  So, we do a little clean up before sticking it in
    # the URL.  Same goes for https://log.scalyr.com -- it really is just https://www.scalyr.com
    server = server.replace('https://agent.', 'https://www.')
    server = server.replace('https://log.', 'https://www.')
    print >>output, 'View data from this agent at: %s/events?filter=$serverHost%%3D%%27%s%%27' % (
        server, status.server_host)
    print >>output, ''
    print >>output, ''

    # Configuration file status:
    print >>output, 'Agent configuration:'
    print >>output, '===================='
    print >>output, ''
    if len(status.config_status.additional_paths) == 0:
        print >>output, 'Configuration file:    %s' % status.config_status.path
    else:
        print >>output, 'Configuration files:   %s' % status.config_status.path
        for x in status.config_status.additional_paths:
            print >>output, '                       %s' % x

    if status.config_status.last_error is None:
        print >>output, 'Status:                Good (files parsed successfully)'
    else:
        print >>output, 'Status:                Bad (could not parse, using last good version)'
    print >>output, 'Last checked:          %s' % scalyr_util.format_time(status.config_status.last_check_time)
    print >>output, 'Last changed observed: %s' % scalyr_util.format_time(status.config_status.last_read_time)

    if status.config_status.last_error is not None:
        print >>output, 'Parsing error:         %s' % str(status.config_status.last_error)

    if status.copying_manager_status is not None:
        print >>output, ''
        print >>output, ''
        __report_copying_manager(output, status.copying_manager_status, status.log_path,
                                 status.config_status.last_read_time)

    if status.monitor_manager_status is not None:
        print >>output, ''
        print >>output, ''
        __report_monitor_manager(output, status.monitor_manager_status, status.config_status.last_read_time)
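One detail worth calling out in the format string above: the doubled percent signs are literal escapes, so after %-formatting the link still carries the URL-encoded characters %3D ('=') and %27 (a single quote), i.e. the view filters on $serverHost == '<host>'. A quick check:

server = 'https://www.scalyr.com'
host = 'my-host'
url = '%s/events?filter=$serverHost%%3D%%27%s%%27' % (server, host)
print(url)  # https://www.scalyr.com/events?filter=$serverHost%3D%27my-host%27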
Example #3
def __report_monitor_manager(output, manager_status, read_time):
    print >> output, "Monitors:"
    print >> output, "========="
    print >> output, ""
    print >> output, "(these statistics cover the period from %s)" % scalyr_util.format_time(
        read_time)
    print >> output, ""
    if manager_status.total_alive_monitors < len(
            manager_status.monitors_status):
        print >> output, "Running monitors:"
        padding = "  "
    else:
        padding = ""

    for entry in manager_status.monitors_status:
        if entry.is_alive:
            print >> output, "%s%s: %d lines emitted, %d errors" % (
                padding,
                entry.monitor_name,
                entry.reported_lines,
                entry.errors,
            )

    dead_monitors = (len(manager_status.monitors_status) -
                     manager_status.total_alive_monitors)
    if dead_monitors > 0:
        print >> output, ""
        print >> output, "Failed monitors:"
        for entry in manager_status.monitors_status:
            if not entry.is_alive:
                print >> output, "  %s %d lines emitted, %d errors" % (
                    entry.monitor_name,
                    entry.reported_lines,
                    entry.errors,
                )
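The manager_status argument is never defined in these examples; a minimal stand-in with just the fields __report_monitor_manager reads (the attribute names come from the code above, the classes themselves are invented for illustration):

class FakeMonitorStatus(object):
    def __init__(self, name, lines, errors, alive):
        self.monitor_name = name
        self.reported_lines = lines
        self.errors = errors
        self.is_alive = alive

class FakeMonitorManagerStatus(object):
    def __init__(self, monitors):
        self.monitors_status = monitors
        # Drives the "Running monitors:" header and the dead-monitor count.
        self.total_alive_monitors = sum(1 for m in monitors if m.is_alive)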
Example #4
def __report_monitor_manager(output, manager_status, read_time):
    print >>output, 'Monitors:'
    print >>output, '========='
    print >>output, ''
    print >>output, '(these statistics cover the period from %s)' % scalyr_util.format_time(read_time)
    print >>output, ''
    if manager_status.total_alive_monitors < len(manager_status.monitors_status):
        print >>output, 'Running monitors:'
        padding = '  '
    else:
        padding = ''

    for entry in manager_status.monitors_status:
        if entry.is_alive:
            print >>output, '%s%s: %d lines emitted, %d errors' % (
                padding, entry.monitor_name, entry.reported_lines, entry.errors)

    dead_monitors = len(manager_status.monitors_status) - manager_status.total_alive_monitors
    if dead_monitors > 0:
        print >>output, ''
        print >>output, 'Failed monitors:'
        for entry in manager_status.monitors_status:
            if not entry.is_alive:
                print >>output, '  %s %d lines emitted, %d errors' % (
                    entry.monitor_name, entry.reported_lines, entry.errors)
Example #5
def __report_monitor_manager(output, manager_status, read_time):
    print >> output, 'Monitors:'
    print >> output, '========='
    print >> output, ''
    print >> output, '(these statistics cover the period from %s)' % scalyr_util.format_time(
        read_time)
    print >> output, ''
    if manager_status.total_alive_monitors < len(
            manager_status.monitors_status):
        print >> output, 'Running monitors:'
        padding = '  '
    else:
        padding = ''

    for entry in manager_status.monitors_status:
        if entry.is_alive:
            print >> output, '%s%s: %d lines emitted, %d errors' % (
                padding, entry.monitor_name, entry.reported_lines,
                entry.errors)

    dead_monitors = len(
        manager_status.monitors_status) - manager_status.total_alive_monitors
    if dead_monitors > 0:
        print >> output, ''
        print >> output, 'Failed monitors:'
        for entry in manager_status.monitors_status:
            if not entry.is_alive:
                print >> output, '  %s %d lines emitted, %d errors' % (
                    entry.monitor_name, entry.reported_lines, entry.errors)
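For reference, with one alive and one dead monitor the function above would render roughly the following (monitor names, counts, and the timestamp are invented):

Monitors:
=========

(these statistics cover the period from 2021-01-01 00:00:00 UTC)

Running monitors:
  linux_system_metrics: 120 lines emitted, 0 errors

Failed monitors:
  shell_monitor 5 lines emitted, 2 errors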
Example #6
def __report_copying_manager(output, manager_status, agent_log_file_path,
                             read_time):
    print >> output, "Log transmission:"
    print >> output, "================="
    print >> output, ""
    print >> output, "(these statistics cover the period from %s)" % scalyr_util.format_time(
        read_time)
    print >> output, ""

    print >> output, "Bytes uploaded successfully:               %ld" % manager_status.total_bytes_uploaded
    print >> output, "Last successful communication with Scalyr: %s" % scalyr_util.format_time(
        manager_status.last_success_time)
    print >> output, "Last attempt:                              %s" % scalyr_util.format_time(
        manager_status.last_attempt_time)
    if manager_status.last_attempt_size is not None:
        print >> output, "Last copy request size:                    %ld" % manager_status.last_attempt_size
    if manager_status.last_response is not None:
        print >> output, "Last copy response size:                   %ld" % len(
            manager_status.last_response)
        print >> output, "Last copy response status:                 %s" % manager_status.last_response_status
        if manager_status.last_response_status != "success":
            print >> output, "Last copy response:                        %s" % scalyr_util.remove_newlines_and_truncate(
                manager_status.last_response, 1000)
    if manager_status.total_errors > 0:
        print >> output, "Total responses with errors:               %d (see '%s' for details)" % (
            manager_status.total_errors,
            agent_log_file_path,
        )
    print >> output, ""

    for matcher_status in manager_status.log_matchers:
        if not matcher_status.is_glob:
            if len(matcher_status.log_processors_status) == 0:
                # This is an absolute file path (no wildcards) and there are no matches.
                print >> output, "Path %s: no matching readable file, last checked %s" % (
                    matcher_status.log_path,
                    scalyr_util.format_time(matcher_status.last_check_time),
                )
            else:
                # We have a match.  matcher_status.log_processors_status should really only have one
                # entry, but we loop anyway.
                for processor_status in matcher_status.log_processors_status:
                    output.write(
                        "Path %s: copied %ld bytes (%ld lines), %ld bytes pending, "
                        % (
                            processor_status.log_path,
                            processor_status.total_bytes_copied,
                            processor_status.total_lines_copied,
                            processor_status.total_bytes_pending,
                        ))
                    if processor_status.total_bytes_skipped > 0:
                        output.write("%ld bytes skipped, " %
                                     processor_status.total_bytes_skipped)
                    if processor_status.total_bytes_failed > 0:
                        output.write("%ld bytes failed, " %
                                     processor_status.total_bytes_failed)
                    if processor_status.total_bytes_dropped_by_sampling > 0:
                        output.write(
                            "%ld bytes dropped by sampling (%ld lines), " % (
                                processor_status.total_bytes_dropped_by_sampling,
                                processor_status.total_lines_dropped_by_sampling,
                            ))

                    if processor_status.total_redactions > 0:
                        output.write("%ld redactions, " %
                                     processor_status.total_redactions)
                    output.write("last checked %s" % scalyr_util.format_time(
                        processor_status.last_scan_time))
                    output.write("\n")
                    output.flush()

    need_to_add_extra_line = True
    for matcher_status in manager_status.log_matchers:
        if matcher_status.is_glob:
            if need_to_add_extra_line:
                need_to_add_extra_line = False
                print >> output, ""
            print >> output, "Glob: %s:: last scanned for glob matches at %s" % (
                matcher_status.log_path,
                scalyr_util.format_time(matcher_status.last_check_time),
            )

            for processor_status in matcher_status.log_processors_status:
                output.write(
                    "  %s: copied %ld bytes (%ld lines), %ld bytes pending, " %
                    (
                        processor_status.log_path,
                        processor_status.total_bytes_copied,
                        processor_status.total_lines_copied,
                        processor_status.total_bytes_pending,
                    ))
                if processor_status.total_bytes_skipped > 0:
                    output.write("%ld bytes skipped, " %
                                 processor_status.total_bytes_skipped)
                if processor_status.total_bytes_failed > 0:
                    output.write("%ld bytes failed, " %
                                 processor_status.total_bytes_failed)
                if processor_status.total_bytes_dropped_by_sampling > 0:
                    output.write(
                        "%ld bytes dropped by sampling (%ld lines), " % (
                            processor_status.total_bytes_dropped_by_sampling,
                            processor_status.total_lines_dropped_by_sampling,
                        ))

                if processor_status.total_redactions > 0:
                    output.write("%ld redactions, " %
                                 processor_status.total_redactions)
                output.write(
                    "last checked %s" %
                    scalyr_util.format_time(processor_status.last_scan_time))
                output.write("\n")
                output.flush()
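Each processor_status above is likewise only read for a handful of counters; a stand-in class listing them (invented for illustration, with defaults chosen so the optional '> 0' branches stay quiet):

class FakeLogProcessorStatus(object):
    def __init__(self, log_path):
        self.log_path = log_path
        self.total_bytes_copied = 0
        self.total_lines_copied = 0
        self.total_bytes_pending = 0
        self.total_bytes_skipped = 0              # printed only when > 0
        self.total_bytes_failed = 0               # printed only when > 0
        self.total_bytes_dropped_by_sampling = 0  # printed only when > 0
        self.total_lines_dropped_by_sampling = 0
        self.total_redactions = 0                 # printed only when > 0
        self.last_scan_time = 0.0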
Example #7
def report_status(output, status, current_time):
    print >> output, "Scalyr Agent status.  See https://www.scalyr.com/help/scalyr-agent-2 for help"
    print >> output, ""
    print >> output, "Current time:     %s" % scalyr_util.format_time(
        current_time)
    print >> output, "Agent started at: %s" % scalyr_util.format_time(
        status.launch_time)
    print >> output, "Version:          %s" % status.version
    print >> output, "Agent running as: %s" % status.user
    print >> output, "Agent log:        %s" % status.log_path
    print >> output, "ServerHost:       %s" % status.server_host
    print >> output, ""
    server = status.scalyr_server
    # We default to https://agent.scalyr.com for the Scalyr server, but to see the status on the web,
    # you should go to https://www.scalyr.com.  So, we do a little clean up before sticking it in
    # the URL.  Same goes for https://log.scalyr.com -- it really is just https://www.scalyr.com
    server = server.replace("https://agent.", "https://www.")
    server = server.replace("https://log.", "https://www.")
    print >> output, "View data from this agent at: %s/events?filter=$serverHost%%3D%%27%s%%27" % (
        server,
        status.server_host,
    )
    print >> output, ""
    print >> output, ""

    # Configuration file status:
    print >> output, "Agent configuration:"
    print >> output, "===================="
    print >> output, ""
    if len(status.config_status.additional_paths) == 0:
        print >> output, "Configuration file:    %s" % status.config_status.path
    else:
        print >> output, "Configuration files:   %s" % status.config_status.path
        for x in status.config_status.additional_paths:
            print >> output, "                       %s" % x

    if status.config_status.last_error is None:
        print >> output, "Status:                Good (files parsed successfully)"
    else:
        print >> output, "Status:                Bad (could not parse, using last good version)"
    print >> output, "Last checked:          %s" % scalyr_util.format_time(
        status.config_status.last_check_time)
    print >> output, "Last changed observed: %s" % scalyr_util.format_time(
        status.config_status.last_read_time)

    if status.config_status.last_error is not None:
        print >> output, "Parsing error:         %s" % str(
            status.config_status.last_error)

    def print_environment():

        # Print scalyr-related env variables in sorted order with critical variables up top. Redact API key.
        main_keys = ["SCALYR_API_KEY", "SCALYR_SERVER"]
        special_case_keys = set(["K8S_EVENT_DISABLE"])
        redacted_keys = set(["SCALYR_API_KEY"])

        # Make a map of uppercase keys -> relevant environment vars (beginning with SCALYR)
        upper2actualkey = {}
        for k in os.environ.keys():
            kup = k.upper()
            if kup.startswith("SCALYR") or kup in special_case_keys:
                upper2actualkey[kup] = k

        # Sorted list of all scalyr keys, including main_keys which may not be present
        # Sort order does not consider letter case.
        sorted_upperkeys = main_keys + sorted(
            set(upper2actualkey.keys()) - set(main_keys))

        print >> output, ""
        row = 0
        for kup in sorted_upperkeys:
            key = upper2actualkey.get(kup, kup)
            val = os.getenv(key)
            if not val:
                val = "<Missing>"
            elif key.upper() in redacted_keys:
                val = "<Redacted>"

            if row == 0:
                print >> output, "Environment variables: %s = %s" % (key, val)
            else:
                print >> output, "                       %s = %s" % (key, val)
            row += 1

    print_environment()

    if status.copying_manager_status is not None:
        print >> output, ""
        print >> output, ""
        __report_copying_manager(
            output,
            status.copying_manager_status,
            status.log_path,
            status.config_status.last_read_time,
        )

    if status.monitor_manager_status is not None:
        print >> output, ""
        print >> output, ""
        __report_monitor_manager(output, status.monitor_manager_status,
                                 status.config_status.last_read_time)
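For reference, print_environment always lists the two main_keys first, even when unset, and redacts the API key; with SCALYR_API_KEY set plus one extra SCALYR_DEBUG_LEVEL variable, the block would come out roughly like this (values invented):

Environment variables: SCALYR_API_KEY = <Redacted>
                       SCALYR_SERVER = <Missing>
                       SCALYR_DEBUG_LEVEL = 5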
Example #8
def __report_copying_manager(output, manager_status, agent_log_file_path, read_time):
    print >>output, 'Log transmission:'
    print >>output, '================='
    print >>output, ''
    print >>output, '(these statistics cover the period from %s)' % scalyr_util.format_time(read_time)
    print >>output, ''

    print >>output, 'Bytes uploaded successfully:               %ld' % manager_status.total_bytes_uploaded
    print >>output, 'Last successful communication with Scalyr: %s' % scalyr_util.format_time(
        manager_status.last_success_time)
    print >>output, 'Last attempt:                              %s' % scalyr_util.format_time(
        manager_status.last_attempt_time)
    if manager_status.last_attempt_size is not None:
        print >>output, 'Last copy request size:                    %ld' % manager_status.last_attempt_size
    if manager_status.last_response is not None:
        print >>output, 'Last copy response size:                   %ld' % len(manager_status.last_response)
        print >>output, 'Last copy response status:                 %s' % manager_status.last_response_status
        if manager_status.last_response_status != 'success':
            print >>output, 'Last copy response:                        %s' % scalyr_util.remove_newlines_and_truncate(
                manager_status.last_response, 1000)
    if manager_status.total_errors > 0:
        print >>output, 'Total responses with errors:               %d (see \'%s\' for details)' % (
            manager_status.total_errors, agent_log_file_path)
    print >>output, ''

    for matcher_status in manager_status.log_matchers:
        if not matcher_status.is_glob:
            if len(matcher_status.log_processors_status) == 0:
                # This is an absolute file path (no wildcards) and there are no matches.
                print >>output, 'Path %s: no matching readable file, last checked %s' % (
                    matcher_status.log_path, scalyr_util.format_time(matcher_status.last_check_time))
            else:
                # We have a match.  matcher_status.log_processors_status should really only have one
                # entry, but we loop anyway.
                for processor_status in matcher_status.log_processors_status:
                    output.write('Path %s: copied %ld bytes (%ld lines), %ld bytes pending, ' % (
                        processor_status.log_path, processor_status.total_bytes_copied,
                        processor_status.total_lines_copied, processor_status.total_bytes_pending))
                    if processor_status.total_bytes_skipped > 0:
                        output.write('%ld bytes skipped, ' % processor_status.total_bytes_skipped)
                    if processor_status.total_bytes_failed > 0:
                        output.write('%ld bytes failed, ' % processor_status.total_bytes_failed)
                    if processor_status.total_bytes_dropped_by_sampling > 0:
                        output.write('%ld bytes dropped by sampling (%ld lines), ' % (
                            processor_status.total_bytes_dropped_by_sampling,
                            processor_status.total_lines_dropped_by_sampling))

                    if processor_status.total_redactions > 0:
                        output.write('%ld redactions, ' % processor_status.total_redactions)
                    output.write('last checked %s' % scalyr_util.format_time(processor_status.last_scan_time))
                    output.write('\n')
                    output.flush()

    need_to_add_extra_line = True
    for matcher_status in manager_status.log_matchers:
        if matcher_status.is_glob:
            if need_to_add_extra_line:
                need_to_add_extra_line = False
                print >>output, ''
            print >>output, 'Glob: %s:: last scanned for glob matches at %s' % (
                matcher_status.log_path, scalyr_util.format_time(matcher_status.last_check_time))

            for processor_status in matcher_status.log_processors_status:
                output.write('  %s: copied %ld bytes (%ld lines), %ld bytes pending, ' % (
                    processor_status.log_path, processor_status.total_bytes_copied,
                    processor_status.total_lines_copied, processor_status.total_bytes_pending))
                if processor_status.total_bytes_skipped > 0:
                    output.write('%ld bytes skipped, ' % processor_status.total_bytes_skipped)
                if processor_status.total_bytes_failed > 0:
                    output.write('%ld bytes failed, ' % processor_status.total_bytes_failed)
                if processor_status.total_bytes_dropped_by_sampling > 0:
                    output.write('%ld bytes dropped by sampling (%ld lines), ' % (
                        processor_status.total_bytes_dropped_by_sampling,
                        processor_status.total_lines_dropped_by_sampling))

                if processor_status.total_redactions > 0:
                    output.write('%ld redactions, ' % processor_status.total_redactions)
                output.write('last checked %s' % scalyr_util.format_time(processor_status.last_scan_time))
                output.write('\n')
                output.flush()
Example #9
    def run(self):
        """Processes the log files as requested by the configuration, looking for matching log files, reading their
        bytes, applying redaction and sampling rules, and then sending them to the server.

        This method will not terminate until the thread has been stopped.
        """
        # So, the scanning works like this.  Every scan:
        #   - See if any of the loggers have new files that are being matched
        #   - Update the file length counts of all current scanners
        #   - Then pick up where you left off, getting X bytes as determined by that abstraction
        #   - Send it to the client
        #   - Determine success or not; if success, update it
        #   - Sleep
        # noinspection PyBroadException
        try:
            # Try to read the checkpoint state from disk.
            current_time = time.time()
            checkpoints_state = self.__read_checkpoint_state()
            if checkpoints_state is None:
                log.info('The checkpoints could not be read.  All logs will be copied starting at their current end')
            elif (current_time - checkpoints_state['time']) > self.__config.max_allowed_checkpoint_age:
                log.warn('The current checkpoint is too stale (written at "%s").  Ignoring it.  All log files will be '
                         'copied starting at their current end.', scalyr_util.format_time(
                         checkpoints_state['time']), error_code='staleCheckpointFile')
                checkpoints_state = None

            if checkpoints_state is None:
                checkpoints = None
            else:
                checkpoints = checkpoints_state['checkpoints']

            # Do the initial scan for any log files that match the configured logs we should be copying.  If there
            # are checkpoints for them, make sure we start copying from the position we left off at.
            self.__scan_for_new_logs_if_necessary(current_time=current_time,
                                                  checkpoints=checkpoints,
                                                  logs_initial_positions=self.__logs_initial_positions)

            # The copying params that tell us how much we are allowed to send and how long we have to wait between
            # attempts.
            copying_params = CopyingParameters(self.__config)

            # Just initialize the last time we had a success to now.  Make the logic below easier.
            last_success = time.time()

            # We are about to start copying.  We can tell waiting threads.
            self.__copying_semaphore.release()

            while self._run_state.is_running():
                log.log(scalyr_logging.DEBUG_LEVEL_1, 'At top of copy log files loop.')
                current_time = time.time()
                # noinspection PyBroadException
                try:
                    # If we have a pending request and it's taken too long to send it, just drop it
                    # on the ground and advance.
                    if current_time - last_success > self.__config.max_retry_time:
                        if self.__pending_add_events_task is not None:
                            self.__pending_add_events_task.completion_callback(LogFileProcessor.FAIL_AND_DROP)
                            self.__pending_add_events_task = None
                        # Tell all of the processors to go to the end of the current log file.  We will start copying
                        # from there.
                        for processor in self.__log_processors:
                            processor.skip_to_end('Too long since last successful request to server.',
                                                  'skipNoServerSuccess', current_time=current_time)

                    # Check for new logs.  If we do detect some new log files, they must have been created since our
                    # last scan.  In this case, we start copying them from byte zero instead of the end of the file.
                    self.__scan_for_new_logs_if_necessary(current_time=current_time, copy_at_index_zero=True)

                    # Collect log lines to send if we don't have one already.
                    if self.__pending_add_events_task is None:
                        log.log(scalyr_logging.DEBUG_LEVEL_1, 'Getting next batch of events to send.')
                        self.__pending_add_events_task = self.__get_next_add_events_task(
                            copying_params.current_bytes_allowed_to_send)
                    else:
                        log.log(scalyr_logging.DEBUG_LEVEL_1, 'Have pending batch of events, retrying to send.')
                        # Take a look at the file system and see if there are any new bytes pending.  This updates the
                        # statistics for each pending file.  This is important to do for status purposes if we have
                        # not tried to invoke get_next_send_events_task in a while (since that already updates the
                        # statistics).
                        self.__scan_for_new_bytes(current_time=current_time)

                    # Try to send the request if we have one.
                    if self.__pending_add_events_task is not None:
                        (result, bytes_sent, full_response) = self.__send_events(self.__pending_add_events_task)

                        log.log(scalyr_logging.DEBUG_LEVEL_1, 'Sent %ld bytes and received response with status="%s".',
                                bytes_sent, result)

                        if result == 'success' or 'discardBuffer' in result or 'requestTooLarge' in result:
                            if result == 'success':
                                self.__pending_add_events_task.completion_callback(LogFileProcessor.SUCCESS)
                            elif 'discardBuffer' in result:
                                self.__pending_add_events_task.completion_callback(LogFileProcessor.FAIL_AND_DROP)
                            else:
                                self.__pending_add_events_task.completion_callback(LogFileProcessor.FAIL_AND_RETRY)
                            self.__pending_add_events_task = None
                            self.__write_checkpoint_state()

                        if result == 'success':
                            last_success = current_time
                    else:
                        result = 'failedReadingLogs'
                        bytes_sent = 0
                        full_response = ''

                        log.error('Failed to read logs for copying.  Will re-try')

                    # Update the statistics and our copying parameters.
                    self.__lock.acquire()
                    copying_params.update_params(result, bytes_sent)
                    self.__last_attempt_time = current_time
                    self.__last_success_time = last_success
                    self.__last_attempt_size = bytes_sent
                    self.__last_response = full_response
                    self.__last_response_status = result
                    if result == 'success':
                        self.__total_bytes_uploaded += bytes_sent
                    self.__lock.release()

                except Exception:
                    # TODO: Do not catch Exception here.  That is too broad.  Disabling warning for now.
                    log.exception('Failed while attempting to scan and transmit logs')
                    log.log(scalyr_logging.DEBUG_LEVEL_1, 'Failed while attempting to scan and transmit logs')
                    self.__lock.acquire()
                    self.__last_attempt_time = current_time
                    self.__total_errors += 1
                    self.__lock.release()

                self._run_state.sleep_but_awaken_if_stopped(copying_params.current_sleep_interval)
        except Exception:
            # If we got an exception here, it is caused by a bug in the program, so let's just terminate.
            log.exception('Log copying failed due to exception')
            sys.exit(1)
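The completion_callback in the loop above is always handed one of three LogFileProcessor outcomes. As a hypothetical stand-in (the constant names appear in the code; the values and comments here are illustrative, not the agent's real definitions):

class LogFileProcessor(object):
    SUCCESS = 1         # the server accepted the bytes; advance the checkpoints
    FAIL_AND_DROP = 2   # give up on this batch and discard it
    FAIL_AND_RETRY = 3  # keep the batch and retry on a later iteration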
Example #10
def report_status(output, status, current_time):
    print >>output, 'Scalyr Agent status.  See https://www.scalyr.com/help/scalyr-agent-2 for help'
    print >>output, ''
    print >>output, 'Current time:     %s' % scalyr_util.format_time(current_time)
    print >>output, 'Agent started at: %s' % scalyr_util.format_time(status.launch_time)
    print >>output, 'Version:          %s' % status.version
    print >>output, 'Agent running as: %s' % status.user
    print >>output, 'Agent log:        %s' % status.log_path
    print >>output, 'ServerHost:       %s' % status.server_host
    print >>output, ''
    server = status.scalyr_server
    # We default to https://agent.scalyr.com for the Scalyr server, but to see the status on the web,
    # you should go to https://www.scalyr.com.  So, we do a little clean up before sticking it in
    # the URL.  Same goes for https://log.scalyr.com -- it really is just https://www.scalyr.com
    server = server.replace('https://agent.', 'https://www.')
    server = server.replace('https://log.', 'https://www.')
    print >>output, 'View data from this agent at: %s/events?filter=$serverHost%%3D%%27%s%%27' % (
        server, status.server_host)
    print >>output, ''
    print >>output, ''

    # Configuration file status:
    print >>output, 'Agent configuration:'
    print >>output, '===================='
    print >>output, ''
    if len(status.config_status.additional_paths) == 0:
        print >>output, 'Configuration file:    %s' % status.config_status.path
    else:
        print >>output, 'Configuration files:   %s' % status.config_status.path
        for x in status.config_status.additional_paths:
            print >>output, '                       %s' % x

    if status.config_status.last_error is None:
        print >>output, 'Status:                Good (files parsed successfully)'
    else:
        print >>output, 'Status:                Bad (could not parse, using last good version)'
    print >>output, 'Last checked:          %s' % scalyr_util.format_time(status.config_status.last_check_time)
    print >>output, 'Last changed observed: %s' % scalyr_util.format_time(status.config_status.last_read_time)

    if status.config_status.last_error is not None:
        print >>output, 'Parsing error:         %s' % str(status.config_status.last_error)

    def print_environment():

        # Print scalyr-related env variables in sorted order with critical variables up top. Redact API key.
        main_keys = ['SCALYR_API_KEY', 'SCALYR_SERVER']
        special_case_keys = set(['K8S_EVENT_DISABLE'])
        redacted_keys = set(['SCALYR_API_KEY'])

        # Make a map of uppercase keys -> relevant environment vars (beginning with SCALYR)
        upper2actualkey = {}
        for k in os.environ.keys():
            kup = k.upper()
            if kup.startswith('SCALYR') or kup in special_case_keys:
                upper2actualkey[kup] = k

        # Sorted list of all scalyr keys, including main_keys which may not be present
        # Sort order does not consider letter case.
        sorted_upperkeys = main_keys + sorted(set(upper2actualkey.keys()) - set(main_keys))

        print >>output, ''
        row = 0
        for kup in sorted_upperkeys:
            key = upper2actualkey.get(kup, kup)
            val = os.getenv(key)
            if not val:
                val = '<Missing>'
            elif key.upper() in redacted_keys:
                val = '<Redacted>'

            if row == 0:
                print >>output, 'Environment variables: %s = %s' % (key, val)
            else:
                print >>output, '                       %s = %s' % (key, val)
            row += 1

    print_environment()

    if status.copying_manager_status is not None:
        print >>output, ''
        print >>output, ''
        __report_copying_manager(output, status.copying_manager_status, status.log_path,
                                 status.config_status.last_read_time)

    if status.monitor_manager_status is not None:
        print >>output, ''
        print >>output, ''
        __report_monitor_manager(output, status.monitor_manager_status, status.config_status.last_read_time)
Example #11
def __report_copying_manager(output, manager_status, agent_log_file_path,
                             read_time):
    # type: (TextIO, CopyingManagerStatus, six.text_type, float) -> None
    print("Log transmission:", file=output)
    print("=================", file=output)
    print("", file=output)
    print(
        "(these statistics cover the period from %s)" %
        scalyr_util.format_time(read_time),
        file=output,
    )
    print("", file=output)
    workers = manager_status.workers

    # if it is a default configuration, then we just print the stats of the single worker session.
    if manager_status.is_single_worker_session():
        worker_session = workers[-1].sessions[-1]
        _report_worker_session(output,
                               worker_session,
                               manager_status,
                               agent_log_file_path,
                               indent=0)
    else:
        # print some overall information from all workers.
        print(
            "Total bytes uploaded:                            %ld" %
            manager_status.total_bytes_uploaded,
            file=output,
        )

        # print the overall health check.
        health_check_message = __get_overall_health_check(manager_status)
        if health_check_message:
            # if message is not empty, write it; otherwise we don't yet have all the health check data.
            _indent_print(
                "Overall health check:                            %s" %
                health_check_message,
                file=output,
                indent=0,
            )
        if manager_status.total_errors > 0:
            print(
                "Total errors occurred:                           %d" %
                manager_status.total_errors,
                file=output,
            )

        print("", file=output)

        # show statistics for every session in every worker.
        print("Uploads statistics by worker:", file=output)
        for worker in manager_status.workers:
            if not worker.has_files:
                # skip worker if there are no log files.
                continue
            print(" Worker %s:" % worker.worker_id, file=output)
            for worker_session in worker.sessions:
                print("    Session %s:" % worker_session.session_id,
                      file=output)
                _report_worker_session(
                    output,
                    worker_session,
                    manager_status,
                    agent_log_file_path,
                    indent=6,
                )

        # Show in which worker and session each file is located.
        _indent_print(" Log files associated with workers:",
                      file=output,
                      indent=0)
        for worker in manager_status.workers:
            # skip worker if there are no log files.
            if not worker.has_files:
                continue
            print("  Worker %s:" % worker.worker_id, file=output)
            for worker_session in worker.sessions:
                if len(worker.sessions) > 1:
                    print("    Session %s:" % worker_session.session_id,
                          file=output)
                for log_processor in worker_session.log_processors:
                    _indent_print(log_processor.log_path,
                                  file=output,
                                  indent=8)

        print("", file=output)

    for matcher_status in manager_status.log_matchers:
        if not matcher_status.is_glob:
            if len(matcher_status.log_processors_status) == 0:
                # This is an absolute file path (no wildcards) and there are no matches.
                print(
                    "Path %s: no matching readable file, last checked %s" % (
                        matcher_status.log_path,
                        scalyr_util.format_time(
                            matcher_status.last_check_time),
                    ),
                    file=output,
                )
            else:
                # We have a match.  matcher_status.log_processors_status should really only have one
                # entry, but we loop anyway.
                for processor_status in matcher_status.log_processors_status:
                    output.write(
                        "Path %s: copied %ld bytes (%ld lines), %ld bytes pending, "
                        % (
                            processor_status.log_path,
                            processor_status.total_bytes_copied,
                            processor_status.total_lines_copied,
                            processor_status.total_bytes_pending,
                        ))
                    if processor_status.total_bytes_skipped > 0:
                        output.write("%ld bytes skipped, " %
                                     processor_status.total_bytes_skipped)
                    if processor_status.total_bytes_failed > 0:
                        output.write("%ld bytes failed, " %
                                     processor_status.total_bytes_failed)
                    if processor_status.total_bytes_dropped_by_sampling > 0:
                        output.write(
                            "%ld bytes dropped by sampling (%ld lines), " % (
                                processor_status.total_bytes_dropped_by_sampling,
                                processor_status.total_lines_dropped_by_sampling,
                            ))

                    if processor_status.total_redactions > 0:
                        output.write("%ld redactions, " %
                                     processor_status.total_redactions)
                    output.write("last checked %s" % scalyr_util.format_time(
                        processor_status.last_scan_time))
                    output.write("\n")
                    output.flush()

    need_to_add_extra_line = True
    for matcher_status in manager_status.log_matchers:
        if matcher_status.is_glob:
            if need_to_add_extra_line:
                need_to_add_extra_line = False
                print("", file=output)
            print(
                "Glob: %s:: last scanned for glob matches at %s" % (
                    matcher_status.log_path,
                    scalyr_util.format_time(matcher_status.last_check_time),
                ),
                file=output,
            )

            for processor_status in matcher_status.log_processors_status:
                output.write(
                    "  %s: copied %ld bytes (%ld lines), %ld bytes pending, " %
                    (
                        processor_status.log_path,
                        processor_status.total_bytes_copied,
                        processor_status.total_lines_copied,
                        processor_status.total_bytes_pending,
                    ))
                if processor_status.total_bytes_skipped > 0:
                    output.write("%ld bytes skipped, " %
                                 processor_status.total_bytes_skipped)
                if processor_status.total_bytes_failed > 0:
                    output.write("%ld bytes failed, " %
                                 processor_status.total_bytes_failed)
                if processor_status.total_bytes_dropped_by_sampling > 0:
                    output.write(
                        "%ld bytes dropped by sampling (%ld lines), " % (
                            processor_status.total_bytes_dropped_by_sampling,
                            processor_status.total_lines_dropped_by_sampling,
                        ))

                if processor_status.total_redactions > 0:
                    output.write("%ld redactions, " %
                                 processor_status.total_redactions)
                output.write(
                    "last checked %s" %
                    scalyr_util.format_time(processor_status.last_scan_time))
                output.write("\n")
                output.flush()
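_indent_print is used throughout these newer examples but never shown. A plausible minimal implementation, consistent with its call sites (file= and indent= keyword arguments), might look like this, though the agent's real helper may differ:

def _indent_print(message, file=None, indent=0):
    # Left-pad the message with the requested number of spaces, then emit it
    # as a single line on the given file-like object.
    file.write(" " * indent)
    file.write(message)
    file.write("\n")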
Example #12
def _report_worker_session(output, worker_session, manager_status,
                           agent_log_file_path, indent):
    # type: (TextIO, CopyingManagerWorkerSessionStatus, CopyingManagerStatus, six.text_type, int) -> None
    """
    Write worker session status to the file-like object.
    :param output: file-like object.
    :param worker_session: worker session status object.
    :param agent_log_file_path:
    :param indent: Number of spaces to indent on each new line.
    :param manager_status: the manager status to produce the health check.
    :return:
    """
    _indent_print(
        "Bytes uploaded successfully:               %ld" %
        worker_session.total_bytes_uploaded,
        file=output,
        indent=indent,
    )
    _indent_print(
        "Last successful communication with Scalyr: %s" %
        scalyr_util.format_time(worker_session.last_success_time),
        file=output,
        indent=indent,
    )
    _indent_print(
        "Last attempt:                              %s" %
        scalyr_util.format_time(worker_session.last_attempt_time),
        file=output,
        indent=indent,
    )
    if worker_session.last_attempt_size is not None:
        _indent_print(
            "Last copy request size:                    %ld" %
            worker_session.last_attempt_size,
            file=output,
            indent=indent,
        )
    if worker_session.last_response is not None:
        _indent_print(
            "Last copy response size:                   %ld" %
            len(worker_session.last_response),
            file=output,
            indent=indent,
        )
        _indent_print(
            "Last copy response status:                 %s" %
            worker_session.last_response_status,
            file=output,
            indent=indent,
        )
        if worker_session.last_response_status != "success":
            _indent_print(
                "Last copy response:                        %s" %
                scalyr_util.remove_newlines_and_truncate(
                    worker_session.last_response, 1000),
                file=output,
                indent=indent,
            )
    if worker_session.total_errors > 0:
        _indent_print(
            "Total responses with errors:               %d (see '%s' for details)"
            % (
                worker_session.total_errors,
                agent_log_file_path,
            ),
            file=output,
            indent=indent,
        )

    if manager_status.is_single_worker_session():
        health_check_message = __get_overall_health_check(manager_status)
    else:
        health_check_message = worker_session.health_check_result

    if health_check_message:
        # if message is not empty, write it; otherwise we don't yet have all the health check data.
        _indent_print(
            "Health check:                              %s" %
            health_check_message,
            file=output,
            indent=indent,
        )
    print("", file=output)
Example #13
def report_status(output, status, current_time):
    print(
        "Scalyr Agent status.  See https://www.scalyr.com/help/scalyr-agent-2 for help",
        file=output,
    )
    print("", file=output)
    print(
        "Current time:            %s" % scalyr_util.format_time(current_time),
        file=output,
    )
    print(
        "Agent started at:        %s" %
        scalyr_util.format_time(status.launch_time),
        file=output,
    )

    parent_process_pid = os.getpid()
    print("Main process pid:        %s" % (os.getpid()), file=output)

    # If the parent and worker pids are the same, we are using a single worker or not using the
    # multi-process functionality, so we don't report pids for child worker processes.
    if status.copying_manager_status:
        worker_processes_pids = status.copying_manager_status.get_all_worker_pids()
    else:
        worker_processes_pids = []

    if (len(worker_processes_pids) >= 1
            and parent_process_pid not in worker_processes_pids):
        worker_processes_pids = ", ".join(
            [str(pid) for pid in worker_processes_pids])
        print("Worker processes pids:   %s" % worker_processes_pids,
              file=output)

    print("Version:                 %s" % status.version, file=output)
    print("VCS revision:            %s" % status.revision, file=output)
    print("Python version:          %s" % status.python_version, file=output)
    print("Agent running as:        %s" % status.user, file=output)
    print("Agent log:               %s" % status.log_path, file=output)
    print("ServerHost:              %s" % status.server_host, file=output)
    print("Compression algorithm:   %s" % status.compression_type, file=output)
    print("Compression level:       %s" % status.compression_level,
          file=output)
    print("", file=output)
    server = scalyr_util.get_web_url_from_upload_url(status.scalyr_server)
    # We default to https://agent.scalyr.com for the Scalyr server, but to see the status on the web,
    # you should go to https://www.scalyr.com.  So, we do a little clean up before sticking it in
    # the URL.  Same goes for https://log.scalyr.com -- it really is just https://www.scalyr.com
    print(
        "View data from this agent at: %s/events?filter=$serverHost%%3D%%27%s%%27"
        % (
            server,
            quote_plus(status.server_host),
        ),
        file=output,
    )
    print("", file=output)
    print("", file=output)

    # Configuration file status:
    print("Agent configuration:", file=output)
    print("====================", file=output)
    print("", file=output)
    if len(status.config_status.additional_paths) == 0:
        print("Configuration file:    %s" % status.config_status.path,
              file=output)
    else:
        print("Configuration files:   %s" % status.config_status.path,
              file=output)
        for x in status.config_status.additional_paths:
            print("                       %s" % x, file=output)

    if status.config_status.last_error is None:
        print("Status:                Good (files parsed successfully)",
              file=output)
    else:
        print(
            "Status:                Bad (could not parse, using last good version)",
            file=output,
        )
    print(
        "Last checked:          %s" %
        scalyr_util.format_time(status.config_status.last_check_time),
        file=output,
    )
    print(
        "Last changed observed: %s" %
        scalyr_util.format_time(status.config_status.last_read_time),
        file=output,
    )

    if status.config_status.last_error is not None:
        print(
            "Parsing error:         %s" %
            six.text_type(status.config_status.last_error),
            file=output,
        )

    def print_environment():

        # Print scalyr-related env variables in sorted order with critical variables up top. Redact API key.
        main_keys = ["SCALYR_API_KEY", "SCALYR_SERVER"]
        special_case_keys = set(["K8S_EVENT_DISABLE"])
        redacted_keys = set(["SCALYR_API_KEY"])

        # Make a map of uppercase keys -> relevant environment vars (beginning with SCALYR)
        upper2actualkey = {}
        for k in os.environ.keys():
            kup = k.upper()
            if kup.startswith("SCALYR") or kup in special_case_keys:
                upper2actualkey[kup] = k

        # Sorted list of all scalyr keys, including main_keys which may not be present
        # Sort order does not consider letter case.
        sorted_upperkeys = main_keys + sorted(
            set(upper2actualkey.keys()) - set(main_keys))

        print("", file=output)
        row = 0
        for kup in sorted_upperkeys:
            key = upper2actualkey.get(kup, kup)
            val = compat.os_getenv_unicode(key)
            if not val:
                val = "<Missing>"
            elif key.upper() in redacted_keys:
                val = "<Redacted>"

            if row == 0:
                print("Environment variables: %s = %s" % (key, val),
                      file=output)
            else:
                print("                       %s = %s" % (key, val),
                      file=output)
            row += 1

    print_environment()

    if status.copying_manager_status is not None:
        print("", file=output)
        print("", file=output)
        __report_copying_manager(
            output,
            status.copying_manager_status,
            status.log_path,
            status.config_status.last_read_time,
        )

    if status.monitor_manager_status is not None:
        print("", file=output)
        print("", file=output)
        __report_monitor_manager(output, status.monitor_manager_status,
                                 status.config_status.last_read_time)
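Note that this Python 3 variant percent-encodes the host with quote_plus before building the link; in the agent it presumably comes from six.moves.urllib.parse, while on plain Python 3 it lives in urllib.parse:

from urllib.parse import quote_plus

print(quote_plus("host name/1"))  # -> host+name%2F1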
Example #14
    def run(self):
        """Processes the log files as requested by the configuration, looking for matching log files, reading their
        bytes, applying redaction and sampling rules, and then sending them to the server.

        This method will not terminate until the thread has been stopped.
        """
        # So, the scanning works like this.  Every scan:
        #   - See if any of the loggers have new files that are being matched
        #   - Update the file length counts of all current scanners
        #   - Then pick up where you left off, getting X bytes as determined by that abstraction
        #   - Send it to the client
        #   - Determine success or not; if success, update it
        #   - Sleep

        # Make sure start_manager was invoked to start the thread and we have a scalyr client instance.
        assert self.__scalyr_client is not None

        if self.__config.copying_thread_profile_interval > 0:
            import cProfile
            profiler = cProfile.Profile()
            profiler.enable()
            profile_dump_interval = self.__config.copying_thread_profile_interval
        else:
            profiler = None
            profile_dump_interval = 0

        try:
            # noinspection PyBroadException
            try:
                # Try to read the checkpoint state from disk.
                current_time = time.time()
                checkpoints_state = self.__read_checkpoint_state()
                if checkpoints_state is None:
                    log.info(
                        'The checkpoints were not read.  All logs will be copied starting at their current end')
                elif (current_time - checkpoints_state['time']) > self.__config.max_allowed_checkpoint_age:
                    log.warn('The current checkpoint is too stale (written at "%s").  Ignoring it.  All log files will '
                             'be copied starting at their current end.', scalyr_util.format_time(
                             checkpoints_state['time']), error_code='staleCheckpointFile')
                    checkpoints_state = None

                if checkpoints_state is None:
                    checkpoints = None
                else:
                    checkpoints = checkpoints_state['checkpoints']

                # Do the initial scan for any log files that match the configured logs we should be copying.  If there
                # are checkpoints for them, make sure we start copying from the position we left off at.
                self.__scan_for_new_logs_if_necessary(current_time=current_time,
                                                      checkpoints=checkpoints,
                                                      logs_initial_positions=self.__logs_initial_positions)

                # The copying params that tell us how much we are allowed to send and how long we have to wait between
                # attempts.
                copying_params = CopyingParameters(self.__config)

                # Just initialize the last time we had a success to now.  Make the logic below easier.
                last_success = time.time()

                pipeline_byte_threshold = self.__config.pipeline_threshold * float(self.__config.max_allowed_request_size)

                # We are about to start copying.  We can tell waiting threads.
                self.__copying_semaphore.release()

                while self._run_state.is_running():
                    log.log(scalyr_logging.DEBUG_LEVEL_1, 'At top of copy log files loop.')
                    current_time = time.time()
                    pipeline_time = 0.0
                    # noinspection PyBroadException
                    try:
                        # If we have a pending request and it's taken too long to send it, just drop it
                        # on the ground and advance.
                        if current_time - last_success > self.__config.max_retry_time:
                            if self.__pending_add_events_task is not None:
                                self.__pending_add_events_task.completion_callback(LogFileProcessor.FAIL_AND_DROP)
                                self.__pending_add_events_task = None
                            # Tell all of the processors to go to the end of the current log file.  We will start
                            # copying from there.
                            for processor in self.__log_processors:
                                processor.skip_to_end('Too long since last successful request to server.',
                                                      'skipNoServerSuccess', current_time=current_time)

                        # Check for new logs.  If we do detect some new log files, they must have been created since our
                        # last scan.  In this case, we start copying them from byte zero instead of the end of the file.
                        self.__scan_for_new_logs_if_necessary(current_time=current_time, copy_at_index_zero=True)

                        # Collect log lines to send if we don't have one already.
                        if self.__pending_add_events_task is None:
                            self.__pending_add_events_task = self.__get_next_add_events_task(
                                copying_params.current_bytes_allowed_to_send)
                        else:
                            log.log(scalyr_logging.DEBUG_LEVEL_1, 'Have pending batch of events, retrying to send.')
                            # Take a look at the file system and see if there are any new bytes pending.  This updates
                            # the statistics for each pending file.  This is important to do for status purposes if we
                            # have not tried to invoke get_next_send_events_task in a while (since that already updates
                            # the statistics).
                            self.__scan_for_new_bytes(current_time=current_time)

                        # Try to send the request if we have one.
                        if self.__pending_add_events_task is not None:
                            log.log(scalyr_logging.DEBUG_LEVEL_1, 'Sending an add event request')
                            # Send the request, but don't block for the response yet.
                            get_response = self._send_events(self.__pending_add_events_task)
                            # If we are sending very large requests, we will try to optimize for future requests
                            # by overlapping building the request with waiting for the response on the current request
                            # (pipelining).
                            if (self.__pending_add_events_task.add_events_request.current_size
                                    >= pipeline_byte_threshold and self.__pending_add_events_task.next_pipelined_task is None):
                                # Time how long it takes us to build it because we will subtract it from how long we
                                # have to wait before we send the next request.
                                pipeline_time = time.time()
                                self.__pending_add_events_task.next_pipelined_task = self.__get_next_add_events_task(
                                    copying_params.current_bytes_allowed_to_send, for_pipelining=True)
                            else:
                                pipeline_time = 0.0

                            # Now block for the response.
                            (result, bytes_sent, full_response) = get_response()

                            if pipeline_time > 0:
                                pipeline_time = time.time() - pipeline_time
                            else:
                                pipeline_time = 0.0

                            log.log(scalyr_logging.DEBUG_LEVEL_1,
                                    'Sent %ld bytes and received response with status="%s".',
                                    bytes_sent, result)

                            if result == 'success' or 'discardBuffer' in result or 'requestTooLarge' in result:
                                next_add_events_task = None
                                try:
                                    if result == 'success':
                                        self.__pending_add_events_task.completion_callback(LogFileProcessor.SUCCESS)
                                        next_add_events_task = self.__pending_add_events_task.next_pipelined_task
                                    elif 'discardBuffer' in result:
                                        self.__pending_add_events_task.completion_callback(
                                            LogFileProcessor.FAIL_AND_DROP)
                                    else:
                                        self.__pending_add_events_task.completion_callback(
                                            LogFileProcessor.FAIL_AND_RETRY)
                                finally:
                                    # No matter what, we want to throw away the current event since the server said we
                                    # could.  We have seen some bugs where we did not throw away the request because
                                    # an exception was thrown during the callback.
                                    self.__pending_add_events_task = next_add_events_task
                                    self.__write_checkpoint_state()

                            if result == 'success':
                                last_success = current_time
                        else:
                            result = 'failedReadingLogs'
                            bytes_sent = 0
                            full_response = ''

                            log.error('Failed to read logs for copying.  Will re-try')

                        # Update the statistics and our copying parameters.
                        self.__lock.acquire()
                        copying_params.update_params(result, bytes_sent)
                        self.__last_attempt_time = current_time
                        self.__last_success_time = last_success
                        self.__last_attempt_size = bytes_sent
                        self.__last_response = full_response
                        self.__last_response_status = result
                        if result == 'success':
                            self.__total_bytes_uploaded += bytes_sent
                        self.__lock.release()

                        if profiler is not None:
                            seconds_past_epoch = int(time.time())
                            if seconds_past_epoch % profile_dump_interval == 0:
                                profiler.disable()
                                profiler.dump_stats('%s%s' % (self.__config.copying_thread_profile_output_path,
                                                              datetime.datetime.now().strftime("%H_%M_%S.out")))
                                profiler.enable()
                    except Exception:
                        # TODO: Do not catch Exception here.  That is too broad.  Disabling warning for now.
                        log.exception('Failed while attempting to scan and transmit logs')
                        log.log(scalyr_logging.DEBUG_LEVEL_1, 'Failed while attempting to scan and transmit logs')
                        self.__lock.acquire()
                        self.__last_attempt_time = current_time
                        self.__total_errors += 1
                        self.__lock.release()

                    if pipeline_time < copying_params.current_sleep_interval:
                        self._sleep_but_awaken_if_stopped(copying_params.current_sleep_interval - pipeline_time)
            except Exception:
                # If we got an exception here, it is caused by a bug in the program, so let's just terminate.
                log.exception('Log copying failed due to exception')
                sys.exit(1)
        finally:
            for processor in self.__log_processors:
                processor.close()
            if profiler is not None:
                profiler.disable()
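
The checkpoint handling at the top of run() reduces to one rule: reuse saved file positions only when the saved state exists and is fresh enough, otherwise start every log at its current end. A minimal sketch of that rule, assuming the dict shape returned by __read_checkpoint_state above (a 'time' timestamp plus a 'checkpoints' mapping):

import time

def usable_checkpoints(checkpoints_state, max_allowed_checkpoint_age):
    # Return the saved per-file checkpoints, or None when the state is missing
    # or stale.  None tells the caller to begin copying at each file's end.
    if checkpoints_state is None:
        return None
    if time.time() - checkpoints_state['time'] > max_allowed_checkpoint_age:
        return None  # too stale: ignoring it is safer than replaying old positions
    return checkpoints_state['checkpoints']
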
Beispiel #15
0
    def run(self):
        """Processes the log files as requested by the configuration, looking for matching log files, reading their
        bytes, applying redaction and sampling rules, and then sending them to the server.

        This method will not terminate until the thread has been stopped.
        """
        # An outline of the copying loop.  On every scan:
        #   - See if any of the loggers have new files that are being matched
        #   - Update the file length counts of all current scanners
        #   - Pick up where we left off, reading as many bytes as the copying parameters allow
        #   - Send them to the client
        #   - Determine success or failure; on success, record the progress
        #   - Sleep

        # Make sure start_manager was invoked to start the thread and we have a scalyr client instance.
        assert self.__scalyr_client is not None

        if self.__config.copying_thread_profile_interval > 0:
            import cProfile
            profiler = cProfile.Profile()
            profiler.enable()
            profile_dump_interval = self.__config.copying_thread_profile_interval
        else:
            profiler = None
            profile_dump_interval = 0

        try:
            # noinspection PyBroadException
            try:
                # Try to read the checkpoint state from disk.
                current_time = time.time()
                checkpoints_state = self.__read_checkpoint_state()
                if checkpoints_state is None:
                    log.info(
                        'The checkpoints were not read.  All logs will be copied starting at their current end'
                    )
                elif (current_time - checkpoints_state['time']) > self.__config.max_allowed_checkpoint_age:
                    log.warn(
                        'The current checkpoint is too stale (written at "%s").  Ignoring it.  All log files will '
                        'be copied starting at their current end.',
                        scalyr_util.format_time(checkpoints_state['time']),
                        error_code='staleCheckpointFile')
                    checkpoints_state = None

                if checkpoints_state is None:
                    checkpoints = None
                else:
                    checkpoints = checkpoints_state['checkpoints']

                # Do the initial scan for any log files that match the configured logs we should be copying.  If there
                # are checkpoints for them, make sure we start copying from the position we left off at.
                self.__scan_for_new_logs_if_necessary(
                    current_time=current_time,
                    checkpoints=checkpoints,
                    logs_initial_positions=self.__logs_initial_positions)

                # The copying params that tell us how much we are allowed to send and how long we have to wait between
                # attempts.
                copying_params = CopyingParameters(self.__config)

                # Just initialize the last time we had a success to now.  Make the logic below easier.
                last_success = time.time()

                pipeline_byte_threshold = self.__config.pipeline_threshold * float(
                    self.__config.max_allowed_request_size)

                # We are about to start copying.  We can tell waiting threads.
                self.__copying_semaphore.release()

                while self._run_state.is_running():
                    log.log(scalyr_logging.DEBUG_LEVEL_1,
                            'At top of copy log files loop.')
                    current_time = time.time()
                    pipeline_time = 0.0
                    # noinspection PyBroadException
                    try:
                        # If we have a pending request and it's taken too long to send it, just drop it
                        # on the ground and advance.
                        if current_time - last_success > self.__config.max_retry_time:
                            if self.__pending_add_events_task is not None:
                                self.__pending_add_events_task.completion_callback(
                                    LogFileProcessor.FAIL_AND_DROP)
                                self.__pending_add_events_task = None
                            # Tell all of the processors to go to the end of the current log file.  We will start
                            # copying from there.
                            for processor in self.__log_processors:
                                processor.skip_to_end(
                                    'Too long since last successful request to server.',
                                    'skipNoServerSuccess',
                                    current_time=current_time)

                        # Check for new logs.  If we do detect some new log files, they must have been created since our
                        # last scan.  In this case, we start copying them from byte zero instead of the end of the file.
                        self.__scan_for_new_logs_if_necessary(
                            current_time=current_time, copy_at_index_zero=True)

                        # Collect log lines to send if we don't have one already.
                        if self.__pending_add_events_task is None:
                            self.__pending_add_events_task = self.__get_next_add_events_task(
                                copying_params.current_bytes_allowed_to_send)
                        else:
                            log.log(
                                scalyr_logging.DEBUG_LEVEL_1,
                                'Have pending batch of events, retrying to send.'
                            )
                            # Take a look at the file system and see if there are any new bytes pending.  This updates
                            # the statistics for each pending file.  This is important to do for status purposes if we
                            # have not tried to invoke get_next_send_events_task in a while (since that already updates
                            # the statistics).
                            self.__scan_for_new_bytes(
                                current_time=current_time)

                        # Try to send the request if we have one.
                        if self.__pending_add_events_task is not None:
                            log.log(scalyr_logging.DEBUG_LEVEL_1,
                                    'Sending an add event request')
                            # Send the request, but don't block for the response yet.
                            get_response = self._send_events(
                                self.__pending_add_events_task)
                            # If we are sending very large requests, we will try to optimize for future requests
                            # by overlapping building the request with waiting for the response on the current request
                            # (pipelining).
                            if (self.__pending_add_events_task.add_events_request.current_size
                                    >= pipeline_byte_threshold
                                    and self.__pending_add_events_task.next_pipelined_task is None):
                                # Time how long it takes us to build it because we will subtract it from how long we
                                # have to wait before we send the next request.
                                pipeline_time = time.time()
                                self.__pending_add_events_task.next_pipelined_task = self.__get_next_add_events_task(
                                    copying_params.current_bytes_allowed_to_send,
                                    for_pipelining=True)
                            else:
                                pipeline_time = 0.0

                            # Now block for the response.
                            (result, bytes_sent, full_response) = get_response()

                            if pipeline_time > 0:
                                pipeline_time = time.time() - pipeline_time
                            else:
                                pipeline_time = 0.0

                            log.log(
                                scalyr_logging.DEBUG_LEVEL_1,
                                'Sent %ld bytes and received response with status="%s".',
                                bytes_sent, result)

                            if result == 'success' or 'discardBuffer' in result or 'requestTooLarge' in result:
                                next_add_events_task = None
                                try:
                                    if result == 'success':
                                        self.__pending_add_events_task.completion_callback(
                                            LogFileProcessor.SUCCESS)
                                        next_add_events_task = self.__pending_add_events_task.next_pipelined_task
                                    elif 'discardBuffer' in result:
                                        self.__pending_add_events_task.completion_callback(
                                            LogFileProcessor.FAIL_AND_DROP)
                                    else:
                                        self.__pending_add_events_task.completion_callback(
                                            LogFileProcessor.FAIL_AND_RETRY)
                                finally:
                                    # No matter what, we want to throw away the current event since the server said we
                                    # could.  We have seen some bugs where we did not throw away the request because
                                    # an exception was thrown during the callback.
                                    self.__pending_add_events_task = next_add_events_task
                                    self.__write_checkpoint_state()

                            if result == 'success':
                                last_success = current_time
                        else:
                            result = 'failedReadingLogs'
                            bytes_sent = 0
                            full_response = ''

                            log.error(
                                'Failed to read logs for copying.  Will re-try'
                            )

                        # Update the statistics and our copying parameters.
                        self.__lock.acquire()
                        copying_params.update_params(result, bytes_sent)
                        self.__last_attempt_time = current_time
                        self.__last_success_time = last_success
                        self.__last_attempt_size = bytes_sent
                        self.__last_response = full_response
                        self.__last_response_status = result
                        if result == 'success':
                            self.__total_bytes_uploaded += bytes_sent
                        self.__lock.release()

                        if profiler is not None:
                            seconds_past_epoch = int(time.time())
                            if seconds_past_epoch % profile_dump_interval == 0:
                                profiler.disable()
                                profiler.dump_stats(
                                    '%s%s' % (self.__config.copying_thread_profile_output_path,
                                              datetime.datetime.now().strftime("%H_%M_%S.out")))
                                profiler.enable()
                    except Exception:
                        # TODO: Do not catch Exception here.  That is too broad.  Disabling warning for now.
                        log.exception(
                            'Failed while attempting to scan and transmit logs'
                        )
                        log.log(
                            scalyr_logging.DEBUG_LEVEL_1,
                            'Failed while attempting to scan and transmit logs'
                        )
                        self.__lock.acquire()
                        self.__last_attempt_time = current_time
                        self.__total_errors += 1
                        self.__lock.release()

                    if pipeline_time < copying_params.current_sleep_interval:
                        self._sleep_but_awaken_if_stopped(
                            copying_params.current_sleep_interval -
                            pipeline_time)
            except Exception:
                # If we got an exception here, it is caused by a bug in the program, so let's just terminate.
                log.exception('Log copying failed due to exception')
                sys.exit(1)
        finally:
            for processor in self.__log_processors:
                processor.close()
            if profiler is not None:
                profiler.disable()
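
The distinctive piece of this variant is pipelining: once a request crosses pipeline_byte_threshold, the next batch is built while the current request is in flight, and the build time is subtracted from the sleep before the next attempt. A sketch of that overlap, with hypothetical send_async and build_next stand-ins for _send_events and __get_next_add_events_task:

import time

def send_with_overlap(send_async, build_next, request_size, threshold, sleep_interval):
    # send_async() starts the request and returns a callable that blocks for the
    # response, matching the _send_events contract in the code above.
    get_response = send_async()
    pipeline_time = 0.0
    next_task = None
    if request_size >= threshold:
        start = time.time()
        next_task = build_next()   # build the next batch while the request is in flight
        pipeline_time = time.time() - start
    result = get_response()        # only now block for the response
    remaining = sleep_interval - pipeline_time
    if remaining > 0:
        time.sleep(remaining)      # time spent building already counts toward the wait
    return result, next_task
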
    def run(self):
        """Processes the log files as requested by the configuration, looking for matching log files, reading their
        bytes, applying redaction and sampling rules, and then sending them to the server.

        This method will not terminate until the thread has been stopped.
        """
        # An outline of the copying loop.  On every scan:
        #   - See if any of the loggers have new files that are being matched
        #   - Update the file length counts of all current scanners
        #   - Pick up where we left off, reading as many bytes as the copying parameters allow
        #   - Send them to the client
        #   - Determine success or failure; on success, record the progress
        #   - Sleep
        # noinspection PyBroadException
        try:
            # Try to read the checkpoint state from disk.
            current_time = time.time()
            checkpoints_state = self.__read_checkpoint_state()
            if checkpoints_state is None:
                log.info(
                    'The checkpoints could not be read.  All logs will be copied starting at their current end'
                )
            elif (current_time - checkpoints_state['time']) > self.__config.max_allowed_checkpoint_age:
                log.warn(
                    'The current checkpoint is too stale (written at "%s").  Ignoring it.  All log files will be '
                    'copied starting at their current end.',
                    scalyr_util.format_time(checkpoints_state['time']),
                    error_code='staleCheckpointFile')
                checkpoints_state = None

            if checkpoints_state is None:
                checkpoints = None
            else:
                checkpoints = checkpoints_state['checkpoints']

            # Do the initial scan for any log files that match the configured logs we should be copying.  If there
            # are checkpoints for them, make sure we start copying from the position we left off at.
            self.__scan_for_new_logs_if_necessary(
                current_time=current_time,
                checkpoints=checkpoints,
                logs_initial_positions=self.__logs_initial_positions)

            # The copying params that tell us how much we are allowed to send and how long we have to wait between
            # attempts.
            copying_params = CopyingParameters(self.__config)

            # Just initialize the last time we had a success to now.  Make the logic below easier.
            last_success = time.time()

            # We are about to start copying.  We can tell waiting threads.
            self.__copying_semaphore.release()

            while self._run_state.is_running():
                log.log(scalyr_logging.DEBUG_LEVEL_1,
                        'At top of copy log files loop.')
                current_time = time.time()
                # noinspection PyBroadException
                try:
                    # If we have a pending request and it's taken too long to send it, just drop it
                    # on the ground and advance.
                    if current_time - last_success > self.__config.max_retry_time:
                        if self.__pending_add_events_task is not None:
                            self.__pending_add_events_task.completion_callback(
                                LogFileProcessor.FAIL_AND_DROP)
                            self.__pending_add_events_task = None
                        # Tell all of the processors to go to the end of the current log file.  We will start copying
                        # from there.
                        for processor in self.__log_processors:
                            processor.skip_to_end(
                                'Too long since last successful request to server.',
                                'skipNoServerSuccess',
                                current_time=current_time)

                    # Check for new logs.  If we do detect some new log files, they must have been created since our
                    # last scan.  In this case, we start copying them from byte zero instead of the end of the file.
                    self.__scan_for_new_logs_if_necessary(
                        current_time=current_time, copy_at_index_zero=True)

                    # Collect log lines to send if we don't have one already.
                    if self.__pending_add_events_task is None:
                        log.log(scalyr_logging.DEBUG_LEVEL_1,
                                'Getting next batch of events to send.')
                        self.__pending_add_events_task = self.__get_next_add_events_task(
                            copying_params.current_bytes_allowed_to_send)
                    else:
                        log.log(
                            scalyr_logging.DEBUG_LEVEL_1,
                            'Have pending batch of events, retrying to send.')
                        # Take a look at the file system and see if there are any new bytes pending.  This updates the
                        # statistics for each pending file.  This is important to do for status purposes if we have
                        # not tried to invoke get_next_send_events_task in a while (since that already updates the
                        # statistics).
                        self.__scan_for_new_bytes(current_time=current_time)

                    # Try to send the request if we have one.
                    if self.__pending_add_events_task is not None:
                        (result, bytes_sent, full_response) = self.__send_events(
                            self.__pending_add_events_task)

                        log.log(
                            scalyr_logging.DEBUG_LEVEL_1,
                            'Sent %ld bytes and received response with status="%s".',
                            bytes_sent, result)

                        if result == 'success' or 'discardBuffer' in result or 'requestTooLarge' in result:
                            if result == 'success':
                                self.__pending_add_events_task.completion_callback(
                                    LogFileProcessor.SUCCESS)
                            elif 'discardBuffer' in result:
                                self.__pending_add_events_task.completion_callback(
                                    LogFileProcessor.FAIL_AND_DROP)
                            else:
                                self.__pending_add_events_task.completion_callback(
                                    LogFileProcessor.FAIL_AND_RETRY)
                            self.__pending_add_events_task = None
                            self.__write_checkpoint_state()

                        if result == 'success':
                            last_success = current_time
                    else:
                        result = 'failedReadingLogs'
                        bytes_sent = 0
                        full_response = ''

                        log.error(
                            'Failed to read logs for copying.  Will re-try')

                    # Update the statistics and our copying parameters.
                    self.__lock.acquire()
                    copying_params.update_params(result, bytes_sent)
                    self.__last_attempt_time = current_time
                    self.__last_success_time = last_success
                    self.__last_attempt_size = bytes_sent
                    self.__last_response = full_response
                    self.__last_response_status = result
                    if result == 'success':
                        self.__total_bytes_uploaded += bytes_sent
                    self.__lock.release()

                except Exception:
                    # TODO: Do not catch Exception here.  That is too broad.  Disabling warning for now.
                    log.exception(
                        'Failed while attempting to scan and transmit logs')
                    log.log(
                        scalyr_logging.DEBUG_LEVEL_1,
                        'Failed while attempting to scan and transmit logs')
                    self.__lock.acquire()
                    self.__last_attempt_time = current_time
                    self.__total_errors += 1
                    self.__lock.release()

                self._run_state.sleep_but_awaken_if_stopped(
                    copying_params.current_sleep_interval)
        except Exception:
            # If we got an exception here, it is caused by a bug in the program, so let's just terminate.
            log.exception('Log copying failed due to exception')
            sys.exit(1)
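
All three variants of run() map the server's status string onto the same three LogFileProcessor outcomes. A condensed sketch of that dispatch, with illustrative integer stand-ins for the LogFileProcessor constants:

# Illustrative stand-ins for LogFileProcessor.SUCCESS / FAIL_AND_DROP / FAIL_AND_RETRY.
SUCCESS, FAIL_AND_DROP, FAIL_AND_RETRY = range(3)

def disposition_for(result):
    # 'success': the batch was accepted, so checkpoints may advance.
    # 'discardBuffer': the server will never accept this batch, so drop it.
    # 'requestTooLarge': fail this attempt but keep the bytes for a retry.
    # Anything else: leave the pending task in place and resend it later.
    if result == 'success':
        return SUCCESS
    if 'discardBuffer' in result:
        return FAIL_AND_DROP
    if 'requestTooLarge' in result:
        return FAIL_AND_RETRY
    return None
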
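Both loop shapes also depend on a stoppable sleep (sleep_but_awaken_if_stopped) so that shutdown does not have to wait out a full interval. The agent's RunState class is not shown on this page; a minimal equivalent can be built on threading.Event:

import threading

class MiniRunState(object):
    """Hypothetical stand-in for the RunState behind _run_state above."""

    def __init__(self):
        self.__stopped = threading.Event()

    def is_running(self):
        return not self.__stopped.is_set()

    def sleep_but_awaken_if_stopped(self, timeout):
        # Wakes immediately once stop() is called; otherwise sleeps the full timeout.
        self.__stopped.wait(timeout)

    def stop(self):
        self.__stopped.set()
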
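CopyingParameters itself does not appear on this page; within the loop its job is to adapt current_bytes_allowed_to_send and current_sleep_interval after each attempt via update_params(result, bytes_sent). A purely hypothetical adapter in that spirit (not the agent's actual policy or constants):

class ToyCopyingParameters(object):
    """Hypothetical: open up after a success, back off after a failure."""

    def __init__(self, max_bytes=1024 * 1024, min_sleep=1.0, max_sleep=60.0):
        self.current_bytes_allowed_to_send = max_bytes
        self.current_sleep_interval = min_sleep
        self.__max_bytes = max_bytes
        self.__min_sleep = min_sleep
        self.__max_sleep = max_sleep

    def update_params(self, result, bytes_sent):
        if result == 'success':
            # Success: restore the full allowance and the shortest wait.
            self.current_bytes_allowed_to_send = self.__max_bytes
            self.current_sleep_interval = self.__min_sleep
        else:
            # Failure: halve the allowance and double the wait, within fixed bounds.
            self.current_bytes_allowed_to_send = max(
                1024, self.current_bytes_allowed_to_send // 2)
            self.current_sleep_interval = min(
                self.__max_sleep, self.current_sleep_interval * 2)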