Example #1
    def test_get_last_request_for_log(self, mock_docker_client):
        mock_docker_client.containers.return_value = []

        monitor_config = {
            "module": "scalyr_agent.builtin_monitors.docker_monitor",
            "log_mode": "syslog",
            "readback_buffer_size": 5 * 1024,
        }

        container_checker = ContainerChecker(
            config=monitor_config,
            logger=mock.Mock(),
            socket_file=None,
            docker_api_version=None,
            host_hostname=None,
            data_path=self._temp_data_dir,
            log_path=self._temp_log_dir,
        )
        # We mock the start time, which is used in the calculation, to make the tests more stable
        mock_start_time = 12345
        mock_start_time_dt = datetime.datetime.utcfromtimestamp(
            mock_start_time)
        container_checker._ContainerChecker__start_time = mock_start_time

        # 1. Existing log file is not available, should use start_time timestamp
        expected_result = scalyr_util.seconds_since_epoch(mock_start_time_dt)

        result = container_checker._ContainerChecker__get_last_request_for_log(
            path="not.exist.123")
        self.assertEqual(result, expected_result)

        # 2. Existing log file, but with no valid data / timestamp, should fall back to start_time
        file_name = "test.1"
        with open(os.path.join(self._temp_log_dir, file_name), "w") as fp:
            fp.write("mock data\n")

        expected_result = scalyr_util.seconds_since_epoch(mock_start_time_dt)
        result = container_checker._ContainerChecker__get_last_request_for_log(
            path=file_name)
        self.assertEqual(result, expected_result)

        # 3. File exists, but it contains an invalid / corrupted timestamp
        file_name = "docker-printer-stdout-invalid-ts.log"

        expected_result = scalyr_util.seconds_since_epoch(mock_start_time_dt)
        result = container_checker._ContainerChecker__get_last_request_for_log(
            path=file_name)
        self.assertEqual(result, expected_result)

        # 4. Existing log file is available, should use the timestamp from the last log line
        file_name = "docker-printer-stdout.log"
        # Last log line in that log file looks like this: "2020-10-27T17:18:12.878281177Z Line 291"
        start_time_dt = datetime.datetime(2020, 10, 27, 17, 18, 12, 878281)

        expected_result = scalyr_util.seconds_since_epoch(start_time_dt)
        result = container_checker._ContainerChecker__get_last_request_for_log(
            path=file_name)
        self.assertEqual(result, expected_result)
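
The tests above depend on a helper that splits Docker's RFC 3339 timestamp off the front of each log line; case 4 shows the nanosecond fraction being truncated to the microseconds a Python datetime can hold (878281177 becomes 878281). A minimal sketch of such a parser, assuming that line format; this is not the agent's actual _split_datetime_from_line:

import datetime

def _split_datetime_from_line(line):
    """Split '2020-10-27T17:18:12.878281177Z Line 291' into a
    (datetime, rest-of-line) tuple, or return (None, line) when the
    line does not start with a timestamp. Sketch only.
    """
    if not line or not line[0].isdigit():
        return None, line
    parts = line.split(' ', 1)
    timestamp = parts[0].rstrip('Z')
    seconds, _, fraction = timestamp.partition('.')
    try:
        dt = datetime.datetime.strptime(seconds, '%Y-%m-%dT%H:%M:%S')
    except ValueError:
        return None, line
    if fraction:
        # Docker emits nanoseconds; keep only the 6 digits datetime supports
        dt = dt.replace(microsecond=int(fraction[:6].ljust(6, '0')))
    return dt, (parts[1] if len(parts) > 1 else '')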
Example #2
    def process_request(self, run_state):
        request = DockerRequest(self.__socket_file)
        request.get('/containers/%s/logs?%s=1&follow=1&tail=%d&timestamps=1' %
                    (self.cid, self.stream, self.__max_previous_lines))

        epoch = datetime.datetime.utcfromtimestamp(0)

        while run_state.is_running():
            line = request.readline()
            while line:
                line = self.strip_docker_header(line)
                dt, log_line = self.split_datetime_from_line(line)
                if not dt:
                    global_log.error('No timestamp found on line: \'%s\'',
                                     line)
                else:
                    timestamp = scalyr_util.seconds_since_epoch(dt, epoch)

                    # see if we log the entire line, including timestamps
                    if self.__log_timestamps:
                        log_line = line

                    # check to make sure the timestamp is >= the last request
                    # Note: we can safely read last_request here because we are the only writer
                    if timestamp >= self.__last_request:
                        self.__logger.info(log_line.strip())

                        # but we need to lock for writing
                        self.__last_request_lock.acquire()
                        self.__last_request = timestamp
                        self.__last_request_lock.release()

                line = request.readline()
            run_state.sleep_but_awaken_if_stopped(0.1)

        # we are shutting down, so update our last request to be slightly later than its current
        # value to prevent duplicate logs when starting up again.
        self.__last_request_lock.acquire()

        # can't be any smaller than 0.01 because the time value is only saved to 2 decimal places
        # on disk
        self.__last_request += 0.01

        self.__last_request_lock.release()
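
The comments above describe a single-writer pattern: process_request() may read __last_request without the lock because it is the only writer, but it takes the lock when writing so other threads can read a consistent value. A sketch of the accessor this pattern implies on the same class (hypothetical; the checkpoint code in Example #5 calls it as logger.last_request()):

    def last_request(self):
        # readers on other threads must take the lock; only
        # process_request() may read without it
        self.__last_request_lock.acquire()
        try:
            return self.__last_request
        finally:
            self.__last_request_lock.release()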
Example #3
    def test_seconds_since_epoch(self):
        dt = datetime.datetime(2015, 8, 6, 14, 40, 56)
        expected = 1438872056.0
        actual = scalyr_util.seconds_since_epoch(dt)
        self.assertEqual(expected, actual)
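
Given that callers above pass epoch = datetime.datetime.utcfromtimestamp(0) and this test expects 1438872056.0 for 2015-08-06 14:40:56 UTC, seconds_since_epoch plausibly reduces to a timedelta conversion. A sketch under those assumptions, not necessarily the real scalyr_util implementation:

import datetime

def seconds_since_epoch(dt, epoch=None):
    """Convert a naive UTC datetime to float seconds since the Unix epoch."""
    if epoch is None:
        epoch = datetime.datetime.utcfromtimestamp(0)
    return (dt - epoch).total_seconds()

# seconds_since_epoch(datetime.datetime(2015, 8, 6, 14, 40, 56)) == 1438872056.0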
Example #4
    def process_request(self, run_state):
        """This function makes a log request on the docker socket for a given container and continues
        to read from the socket until the connection is closed
        """
        try:
            # random delay to prevent all requests from starting at the same time
            # (divide by a float so Python 2 keeps the fractional part)
            delay = random.randint(500, 5000) / 1000.0
            run_state.sleep_but_awaken_if_stopped(delay)

            self.__logger.log(
                scalyr_logging.DEBUG_LEVEL_3,
                'Starting to retrieve logs for cid=%s' % str(self.cid))
            self.__client = DockerClient(base_url=('unix:/%s' %
                                                   self.__socket_file),
                                         version=self.__docker_api_version)

            epoch = datetime.datetime.utcfromtimestamp(0)
            while run_state.is_running():
                self.__logger.log(
                    scalyr_logging.DEBUG_LEVEL_3,
                    'Attempting to retrieve logs for cid=%s' % str(self.cid))
                sout = False
                serr = False
                if self.stream == 'stdout':
                    sout = True
                else:
                    serr = True

                self.__logs = self.__client.logs(
                    container=self.cid,
                    stdout=sout,
                    stderr=serr,
                    stream=True,
                    timestamps=True,
                    tail=self.__max_previous_lines,
                    follow=True)

                # self.__logs is a generator so don't call len(self.__logs)
                self.__logger.log(
                    scalyr_logging.DEBUG_LEVEL_3,
                    'Found log lines for cid=%s' % (str(self.cid)))
                try:
                    for line in self.__logs:
                        # split the docker timestamp from the rest of the line
                        dt, log_line = _split_datetime_from_line(line)
                        if not dt:
                            global_log.error(
                                'No timestamp found on line: \'%s\'', line)
                        else:
                            timestamp = scalyr_util.seconds_since_epoch(
                                dt, epoch)

                            # see if we log the entire line, including timestamps
                            if self.__log_timestamps:
                                log_line = line

                            # check to make sure the timestamp is >= the last request
                            # Note: we can safely read last_request here because we are the only writer
                            if timestamp >= self.__last_request:
                                self.__logger.info(log_line.strip())

                                # but we need to lock for writing
                                self.__last_request_lock.acquire()
                                self.__last_request = timestamp
                                self.__last_request_lock.release()

                        if not run_state.is_running():
                            self.__logger.log(
                                scalyr_logging.DEBUG_LEVEL_3,
                                'Exiting out of container log for cid=%s' %
                                str(self.cid))
                            break
                except ProtocolError as e:
                    if run_state.is_running():
                        global_log.warning(
                            "Stream closed due to protocol error: %s" % str(e))

                if run_state.is_running():
                    global_log.warning(
                        "Log stream has been closed for '%s'.  Check docker.log on the host for possible errors.  Attempting to reconnect, some logs may be lost"
                        % (self.name),
                        limit_once_per_x_secs=300,
                        limit_key='stream-closed-%s' % self.name)
                    delay = random.randint(500, 3000) / 1000.0
                    run_state.sleep_but_awaken_if_stopped(delay)

            # we are shutting down, so update our last request to be slightly later than its current
            # value to prevent duplicate logs when starting up again.
            self.__last_request_lock.acquire()

            # can't be any smaller than 0.01 because the time value is only saved to 2 decimal places
            # on disk
            self.__last_request += 0.01

            self.__last_request_lock.release()
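
Both delays above stagger work with random jitter so that many per-container loggers do not hit the Docker socket in lockstep. The same idea as a tiny standalone helper (hypothetical, for illustration only):

import random
import time

def sleep_with_jitter(min_ms, max_ms):
    """Sleep a random duration in [min_ms, max_ms] milliseconds to
    spread out reconnect attempts across loggers."""
    time.sleep(random.randint(min_ms, max_ms) / 1000.0)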
Example #5
class ContainerChecker(StoppableThread):
    """
        Monitors containers to check when they start and stop running.
    """
    def __init__(self, config, logger, socket_file, docker_api_version,
                 host_hostname, data_path, log_path):

        self._config = config
        self._logger = logger

        self.__delay = self._config.get('container_check_interval')
        self.__log_prefix = self._config.get('docker_log_prefix')
        name = self._config.get('container_name')

        self.__socket_file = socket_file
        self.__docker_api_version = docker_api_version
        self.__client = DockerClient(base_url=('unix:/%s' %
                                               self.__socket_file),
                                     version=self.__docker_api_version)

        self.container_id = self.__get_scalyr_container_id(self.__client, name)

        self.__checkpoint_file = os.path.join(data_path,
                                              "docker-checkpoints.json")
        self.__log_path = log_path

        self.__host_hostname = host_hostname

        self.__readback_buffer_size = self._config.get('readback_buffer_size')

        self.containers = {}
        self.__checkpoints = {}

        self.__log_watcher = None
        self.__module = None
        self.__start_time = time.time()
        self.__thread = StoppableThread(target=self.check_containers,
                                        name="Container Checker")

    def start(self):
        self.__load_checkpoints()
        self.containers = self.get_running_containers(self.__client)

        # if querying the docker api fails, set the container list to empty
        if self.containers is None:
            self.containers = {}

        self.docker_logs = self.__get_docker_logs(self.containers)
        self.docker_loggers = []

        # create and start the DockerLoggers
        self.__start_docker_logs(self.docker_logs)
        self._logger.log(
            scalyr_logging.DEBUG_LEVEL_1,
            "Initialization complete.  Starting docker monitor for Scalyr")
        self.__thread.start()

    def stop(self, wait_on_join=True, join_timeout=5):
        self.__thread.stop(wait_on_join=wait_on_join,
                           join_timeout=join_timeout)

        # stop the DockerLoggers
        for logger in self.docker_loggers:
            if self.__log_watcher:
                self.__log_watcher.remove_log_path(self.__module,
                                                   logger.log_path)
            logger.stop(wait_on_join, join_timeout)
            self._logger.log(scalyr_logging.DEBUG_LEVEL_1,
                             "Stopping %s - %s" % (logger.name, logger.stream))

        self.__update_checkpoints()

    def check_containers(self, run_state):

        while run_state.is_running():
            self.__update_checkpoints()

            self._logger.log(scalyr_logging.DEBUG_LEVEL_2,
                             'Attempting to retrieve list of containers')
            running_containers = self.get_running_containers(self.__client)

            # if running_containers is None, that means querying the docker api failed.
            # rather than resetting the list of running containers to empty,
            # continue using the previous list of containers
            if running_containers is None:
                self._logger.log(scalyr_logging.DEBUG_LEVEL_2,
                                 'Failed to get list of containers')
                running_containers = self.containers

            self._logger.log(scalyr_logging.DEBUG_LEVEL_2,
                             'Found %d containers' % len(running_containers))
            # get the containers that have started since the last sample
            starting = {}
            for cid, info in running_containers.iteritems():
                if cid not in self.containers:
                    self._logger.log(
                        scalyr_logging.DEBUG_LEVEL_1,
                        "Starting logger for container '%s'" % info['name'])
                    starting[cid] = info

            # get the containers that have stopped
            stopping = {}
            for cid, info in self.containers.iteritems():
                if cid not in running_containers:
                    self._logger.log(
                        scalyr_logging.DEBUG_LEVEL_1,
                        "Stopping logger for container '%s'" % info['name'])
                    stopping[cid] = info

            # stop the old loggers
            self.__stop_loggers(stopping)

            # update the list of running containers
            # do this before starting new ones, as starting up new ones
            # will access self.containers
            self.containers = running_containers

            # start the new ones
            self.__start_loggers(starting)

            run_state.sleep_but_awaken_if_stopped(self.__delay)

    def set_log_watcher(self, log_watcher, module):
        self.__log_watcher = log_watcher
        self.__module = module

    def __get_scalyr_container_id(self, client, name):
        """Gets the container id of the scalyr-agent container
        If the config option container_name is empty, then it is assumed that the scalyr agent is running
        on the host and not in a container and None is returned.
        """
        result = None

        regex = None
        if name is not None:
            regex = re.compile(name)

        # get all the containers
        containers = client.containers()

        for container in containers:

            # see if we are checking on names
            if name is not None:
                # if so, loop over all container names for this container
                # Note: containers should only have one name, but the 'Names' field
                # is a list, so iterate over it just in case
                for cname in container['Names']:
                    cname = cname.lstrip('/')
                    # check if the name regex matches
                    m = regex.match(cname)
                    if m:
                        result = container['Id']
                        break
            # not checking container name, so check the Command instead to see if it's the agent
            else:
                if container['Command'].startswith('/usr/sbin/scalyr-agent-2'):
                    result = container['Id']

            if result:
                break

        if not result:
            # only raise an exception if we were looking for a specific name but couldn't find it
            if name is not None:
                raise Exception(
                    "Unable to find a matching container id for container '%s'.  Please make sure that a "
                    "container matching the regular expression '%s' is running."
                    % (name, name))

        return result

    def __update_checkpoints(self):
        """Update the checkpoints for when each docker logger logged a request, and save the checkpoints
        to file.
        """

        for logger in self.docker_loggers:
            last_request = logger.last_request()
            self.__checkpoints[logger.stream_name] = last_request

        # save to disk
        if self.__checkpoints:
            tmp_file = self.__checkpoint_file + '~'
            scalyr_util.atomic_write_dict_as_json_file(self.__checkpoint_file,
                                                       tmp_file,
                                                       self.__checkpoints)

    def __load_checkpoints(self):
        try:
            checkpoints = scalyr_util.read_file_as_json(self.__checkpoint_file)
        except Exception:
            self._logger.info(
                "No checkpoint file '%s' exists.\n\tAll logs will be read starting from their current end.",
                self.__checkpoint_file)
            checkpoints = {}

        if checkpoints:
            for name, last_request in checkpoints.iteritems():
                self.__checkpoints[name] = last_request

    def get_running_containers(self, client, ignore_self=True):
        """Gets a dict of running containers that maps container id to container name
        """
        result = {}
        try:
            response = client.containers()
            for container in response:
                cid = container['Id']

                if ignore_self and cid == self.container_id:
                    continue

                if len(container['Names']) > 0:
                    name = container['Names'][0].lstrip('/')
                    result[cid] = {'name': name}
                else:
                    # fall back to using the id as the name so callers can still read info['name']
                    result[cid] = {'name': cid}

        except Exception:  # container querying failed
            global_log.warning("Error querying running containers",
                               limit_once_per_x_secs=300,
                               limit_key='docker-api-running-containers')
            result = None

        return result

    def __stop_loggers(self, stopping):
        """
        Stops any DockerLoggers in the 'stopping' dict
        @param: stopping - a dict of container ids => container info. Any running logger whose
        container-id is a key in the dict will be stopped.
        """
        if stopping:
            self._logger.log(scalyr_logging.DEBUG_LEVEL_2,
                             'Stopping all docker loggers')
            for logger in self.docker_loggers:
                if logger.cid in stopping:
                    logger.stop(wait_on_join=True, join_timeout=1)
                    if self.__log_watcher:
                        self.__log_watcher.remove_log_path(
                            self.__module, logger.log_path)

            self.docker_loggers[:] = [
                l for l in self.docker_loggers if l.cid not in stopping
            ]
            self.docker_logs[:] = [
                l for l in self.docker_logs if l['cid'] not in stopping
            ]

    def __start_loggers(self, starting):
        """
        Starts loggers for a set of containers
        @param: starting - a dict of container ids => container info for the containers
        that need a logger started
        """
        if starting:
            self._logger.log(scalyr_logging.DEBUG_LEVEL_2,
                             'Starting all docker loggers')
            docker_logs = self.__get_docker_logs(starting)
            self.__start_docker_logs(docker_logs)
            self.docker_logs.extend(docker_logs)

    def __start_docker_logs(self, docker_logs):
        # iterate over the docker_logs passed in, not self.docker_logs, so
        # only the newly started containers get loggers created for them
        for log in docker_logs:
            last_request = self.__get_last_request_for_log(
                log['log_config']['path'])
            if self.__log_watcher:
                log['log_config'] = self.__log_watcher.add_log_config(
                    self.__module, log['log_config'])
            self.docker_loggers.append(
                self.__create_docker_logger(log, last_request))

    def __get_last_request_for_log(self, path):
        # use utcfromtimestamp so the fallback is consistent with the UTC
        # epoch that seconds_since_epoch subtracts
        result = datetime.datetime.utcfromtimestamp(self.__start_time)

        try:
            full_path = os.path.join(self.__log_path, path)
            fp = open(full_path, 'r', self.__readback_buffer_size)

            # seek readback buffer bytes from the end of the file
            fp.seek(0, os.SEEK_END)
            size = fp.tell()
            if size < self.__readback_buffer_size:
                fp.seek(0, os.SEEK_SET)
            else:
                fp.seek(size - self.__readback_buffer_size, os.SEEK_SET)

            first = True
            for line in fp:
                # ignore the first line because the seek probably landed
                # somewhere in the middle of it
                if first:
                    first = False
                    continue

                dt, _ = _split_datetime_from_line(line)
                if dt:
                    result = dt
            fp.close()
        except Exception as e:
            global_log.info("%s", str(e))

        return scalyr_util.seconds_since_epoch(result)
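
The checkpoint save in __update_checkpoints() passes a temp path alongside the real path, which suggests the classic write-then-rename pattern. A sketch of what atomic_write_dict_as_json_file plausibly does; the real scalyr_util function may differ:

import json
import os

def atomic_write_dict_as_json_file(filename, tmp_filename, info):
    # write the JSON to a temporary file first, so a crash mid-write
    # never leaves a truncated checkpoint file behind
    with open(tmp_filename, 'w') as fp:
        json.dump(info, fp)
        fp.flush()
        os.fsync(fp.fileno())
    # on POSIX, renaming over the destination is atomic when both paths
    # are on the same filesystem, so readers see the old or new file
    os.rename(tmp_filename, filename)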