def test_get_last_request_for_log(self, mock_docker_client): mock_docker_client.containers.return_value = [] monitor_config = { "module": "scalyr_agent.builtin_monitors.docker_monitor", "log_mode": "syslog", "readback_buffer_size": 5 * 1024, } container_checker = ContainerChecker( config=monitor_config, logger=mock.Mock(), socket_file=None, docker_api_version=None, host_hostname=None, data_path=self._temp_data_dir, log_path=self._temp_log_dir, ) # We mock start time which is used in the calculation to make tests more stable mock_start_time = 12345 mock_start_time_dt = datetime.datetime.utcfromtimestamp( mock_start_time) container_checker._ContainerChecker__start_time = mock_start_time # 1. Existing log file is not available, should use start_time timestamp expected_result = scalyr_util.seconds_since_epoch(mock_start_time_dt) result = container_checker._ContainerChecker__get_last_request_for_log( path="not.exist.123") self.assertEqual(result, expected_result) # 2. Existing log file, but with no valid data / timestamp, should fall back to start_time file_name = "test.1" with open(os.path.join(self._temp_log_dir, file_name), "w") as fp: fp.write("mock data\n") expected_result = scalyr_util.seconds_since_epoch(mock_start_time_dt) result = container_checker._ContainerChecker__get_last_request_for_log( path=file_name) self.assertEqual(result, expected_result) # 3. File exists, but it contains invalid / corrupted timestamp file_name = "docker-printer-stdout-invalid-ts.log" expected_result = scalyr_util.seconds_since_epoch(mock_start_time_dt) result = container_checker._ContainerChecker__get_last_request_for_log( path=file_name) self.assertEqual(result, expected_result) # 4. Existing log file is always, should use time stamp from the last log line file_name = "docker-printer-stdout.log" # Last log line in that log file looks like this: "2020-10-27T17:18:12.878281177Z Line 291" start_time_dt = datetime.datetime(2020, 10, 27, 17, 18, 12, 878281) expected_result = scalyr_util.seconds_since_epoch(start_time_dt) result = container_checker._ContainerChecker__get_last_request_for_log( path=file_name) self.assertEqual(result, expected_result)
def process_request(self, run_state): request = DockerRequest(self.__socket_file) request.get('/containers/%s/logs?%s=1&follow=1&tail=%d×tamps=1' % (self.cid, self.stream, self.__max_previous_lines)) epoch = datetime.datetime.utcfromtimestamp(0) while run_state.is_running(): line = request.readline() while line: line = self.strip_docker_header(line) dt, log_line = self.split_datetime_from_line(line) if not dt: global_log.error('No timestamp found on line: \'%s\'', line) else: timestamp = scalyr_util.seconds_since_epoch(dt, epoch) #see if we log the entire line including timestamps if self.__log_timestamps: log_line = line #check to make sure timestamp is >= to the last request #Note: we can safely read last_request here because we are the only writer if timestamp >= self.__last_request: self.__logger.info(log_line.strip()) #but we need to lock for writing self.__last_request_lock.acquire() self.__last_request = timestamp self.__last_request_lock.release() line = request.readline() run_state.sleep_but_awaken_if_stopped(0.1) # we are shutting down, so update our last request to be slightly later than it's current # value to prevent duplicate logs when starting up again. self.__last_request_lock.acquire() #can't be any smaller than 0.01 because the time value is only saved to 2 decimal places #on disk self.__last_request += 0.01 self.__last_request_lock.release()
def process_request( self, run_state ): request = DockerRequest( self.__socket_file ) request.get( '/containers/%s/logs?%s=1&follow=1&tail=%d×tamps=1' % (self.cid, self.stream, self.__max_previous_lines) ) epoch = datetime.datetime.utcfromtimestamp( 0 ) while run_state.is_running(): line = request.readline() while line: line = self.strip_docker_header( line ) dt, log_line = self.split_datetime_from_line( line ) if not dt: global_log.error( 'No timestamp found on line: \'%s\'', line ) else: timestamp = scalyr_util.seconds_since_epoch( dt, epoch ) #see if we log the entire line including timestamps if self.__log_timestamps: log_line = line #check to make sure timestamp is >= to the last request #Note: we can safely read last_request here because we are the only writer if timestamp >= self.__last_request: self.__logger.info( log_line.strip() ) #but we need to lock for writing self.__last_request_lock.acquire() self.__last_request = timestamp self.__last_request_lock.release() line = request.readline() run_state.sleep_but_awaken_if_stopped( 0.1 ) # we are shutting down, so update our last request to be slightly later than it's current # value to prevent duplicate logs when starting up again. self.__last_request_lock.acquire() #can't be any smaller than 0.01 because the time value is only saved to 2 decimal places #on disk self.__last_request += 0.01 self.__last_request_lock.release()
def test_seconds_since_epoch( self ): dt = datetime.datetime( 2015, 8, 6, 14, 40, 56 ) expected = 1438872056.0 actual = scalyr_util.seconds_since_epoch( dt ) self.assertEquals( expected, actual )
def test_seconds_since_epoch(self): dt = datetime.datetime(2015, 8, 6, 14, 40, 56) expected = 1438872056.0 actual = scalyr_util.seconds_since_epoch(dt) self.assertEquals(expected, actual)
def process_request(self, run_state): """This function makes a log request on the docker socket for a given container and continues to read from the socket until the connection is closed """ try: # random delay to prevent all requests from starting at the same time delay = random.randint(500, 5000) / 1000 run_state.sleep_but_awaken_if_stopped(delay) self.__logger.log( scalyr_logging.DEBUG_LEVEL_3, 'Starting to retrieve logs for cid=%s' % str(self.cid)) self.__client = DockerClient(base_url=('unix:/%s' % self.__socket_file), version=self.__docker_api_version) epoch = datetime.datetime.utcfromtimestamp(0) while run_state.is_running(): self.__logger.log( scalyr_logging.DEBUG_LEVEL_3, 'Attempting to retrieve logs for cid=%s' % str(self.cid)) sout = False serr = False if self.stream == 'stdout': sout = True else: serr = True self.__logs = self.__client.logs( container=self.cid, stdout=sout, stderr=serr, stream=True, timestamps=True, tail=self.__max_previous_lines, follow=True) # self.__logs is a generator so don't call len( self.__logs ) self.__logger.log( scalyr_logging.DEBUG_LEVEL_3, 'Found log lines for cid=%s' % (str(self.cid))) try: for line in self.__logs: #split the docker timestamp from the frest of the line dt, log_line = _split_datetime_from_line(line) if not dt: global_log.error( 'No timestamp found on line: \'%s\'', line) else: timestamp = scalyr_util.seconds_since_epoch( dt, epoch) #see if we log the entire line including timestamps if self.__log_timestamps: log_line = line #check to make sure timestamp is >= to the last request #Note: we can safely read last_request here because we are the only writer if timestamp >= self.__last_request: self.__logger.info(log_line.strip()) #but we need to lock for writing self.__last_request_lock.acquire() self.__last_request = timestamp self.__last_request_lock.release() if not run_state.is_running(): self.__logger.log( scalyr_logging.DEBUG_LEVEL_3, 'Exiting out of container log for cid=%s' % str(self.cid)) break except ProtocolError, e: if run_state.is_running(): global_log.warning( "Stream closed due to protocol error: %s" % str(e)) if run_state.is_running(): global_log.warning( "Log stream has been closed for '%s'. Check docker.log on the host for possible errors. Attempting to reconnect, some logs may be lost" % (self.name), limit_once_per_x_secs=300, limit_key='stream-closed-%s' % self.name) delay = random.randint(500, 3000) / 1000 run_state.sleep_but_awaken_if_stopped(delay) # we are shutting down, so update our last request to be slightly later than it's current # value to prevent duplicate logs when starting up again. self.__last_request_lock.acquire() #can't be any smaller than 0.01 because the time value is only saved to 2 decimal places #on disk self.__last_request += 0.01 self.__last_request_lock.release()
class ContainerChecker(StoppableThread): """ Monitors containers to check when they start and stop running. """ def __init__(self, config, logger, socket_file, docker_api_version, host_hostname, data_path, log_path): self._config = config self._logger = logger self.__delay = self._config.get('container_check_interval') self.__log_prefix = self._config.get('docker_log_prefix') name = self._config.get('container_name') self.__socket_file = socket_file self.__docker_api_version = docker_api_version self.__client = DockerClient(base_url=('unix:/%s' % self.__socket_file), version=self.__docker_api_version) self.container_id = self.__get_scalyr_container_id(self.__client, name) self.__checkpoint_file = os.path.join(data_path, "docker-checkpoints.json") self.__log_path = log_path self.__host_hostname = host_hostname self.__readback_buffer_size = self._config.get('readback_buffer_size') self.containers = {} self.__checkpoints = {} self.__log_watcher = None self.__module = None self.__start_time = time.time() self.__thread = StoppableThread(target=self.check_containers, name="Container Checker") def start(self): self.__load_checkpoints() self.containers = self.get_running_containers(self.__client) # if querying the docker api fails, set the container list to empty if self.containers == None: self.containers = {} self.docker_logs = self.__get_docker_logs(self.containers) self.docker_loggers = [] #create and start the DockerLoggers self.__start_docker_logs(self.docker_logs) self._logger.log( scalyr_logging.DEBUG_LEVEL_1, "Initialization complete. Starting docker monitor for Scalyr") self.__thread.start() def stop(self, wait_on_join=True, join_timeout=5): self.__thread.stop(wait_on_join=wait_on_join, join_timeout=join_timeout) #stop the DockerLoggers for logger in self.docker_loggers: if self.__log_watcher: self.__log_watcher.remove_log_path(self.__module, logger.log_path) logger.stop(wait_on_join, join_timeout) self._logger.log(scalyr_logging.DEBUG_LEVEL_1, "Stopping %s - %s" % (logger.name, logger.stream)) self.__update_checkpoints() def check_containers(self, run_state): while run_state.is_running(): self.__update_checkpoints() self._logger.log(scalyr_logging.DEBUG_LEVEL_2, 'Attempting to retrieve list of containers') running_containers = self.get_running_containers(self.__client) # if running_containers is None, that means querying the docker api failed. # rather than resetting the list of running containers to empty # continue using the previous list of containers if running_containers == None: self._logger.log(scalyr_logging.DEBUG_LEVEL_2, 'Failed to get list of containers') running_containers = self.containers self._logger.log(scalyr_logging.DEBUG_LEVEL_2, 'Found %d containers' % len(running_containers)) #get the containers that have started since the last sample starting = {} for cid, info in running_containers.iteritems(): if cid not in self.containers: self._logger.log( scalyr_logging.DEBUG_LEVEL_1, "Starting logger for container '%s'" % info['name']) starting[cid] = info #get the containers that have stopped stopping = {} for cid, info in self.containers.iteritems(): if cid not in running_containers: self._logger.log( scalyr_logging.DEBUG_LEVEL_1, "Stopping logger for container '%s'" % info['name']) stopping[cid] = info #stop the old loggers self.__stop_loggers(stopping) #update the list of running containers #do this before starting new ones, as starting up new ones #will access self.containers self.containers = running_containers #start the new ones self.__start_loggers(starting) run_state.sleep_but_awaken_if_stopped(self.__delay) def set_log_watcher(self, log_watcher, module): self.__log_watcher = log_watcher self.__module = module def __get_scalyr_container_id(self, client, name): """Gets the container id of the scalyr-agent container If the config option container_name is empty, then it is assumed that the scalyr agent is running on the host and not in a container and None is returned. """ result = None regex = None if name is not None: regex = re.compile(name) # get all the containers containers = client.containers() for container in containers: # see if we are checking on names if name is not None: # if so, loop over all container names for this container # Note: containers should only have one name, but the 'Names' field # is a list, so iterate over it just in case for cname in container['Names']: cname = cname.lstrip('/') # check if the name regex matches m = regex.match(cname) if m: result = container['Id'] break # not checking container name, so check the Command instead to see if it's the agent else: if container['Command'].startswith('/usr/sbin/scalyr-agent-2'): result = container['Id'] if result: break if not result: # only raise an exception if we were looking for a specific name but couldn't find it if name is not None: raise Exception( "Unable to find a matching container id for container '%s'. Please make sure that a " "container matching the regular expression '%s' is running." % (name, name)) return result def __update_checkpoints(self): """Update the checkpoints for when each docker logger logged a request, and save the checkpoints to file. """ for logger in self.docker_loggers: last_request = logger.last_request() self.__checkpoints[logger.stream_name] = last_request # save to disk if self.__checkpoints: tmp_file = self.__checkpoint_file + '~' scalyr_util.atomic_write_dict_as_json_file(self.__checkpoint_file, tmp_file, self.__checkpoints) def __load_checkpoints(self): try: checkpoints = scalyr_util.read_file_as_json(self.__checkpoint_file) except: self._logger.info( "No checkpoint file '%s' exists.\n\tAll logs will be read starting from their current end.", self.__checkpoint_file) checkpoints = {} if checkpoints: for name, last_request in checkpoints.iteritems(): self.__checkpoints[name] = last_request def get_running_containers(self, client, ignore_self=True): """Gets a dict of running containers that maps container id to container name """ result = {} try: response = client.containers() for container in response: cid = container['Id'] if ignore_self and cid == self.container_id: continue if len(container['Names']) > 0: name = container['Names'][0].lstrip('/') result[cid] = {'name': name} else: result[cid] = cid except: # container querying failed global_log.warning("Error querying running containers", limit_once_per_x_secs=300, limit_key='docker-api-running-containers') result = None return result def __stop_loggers(self, stopping): """ Stops any DockerLoggers in the 'stopping' dict @param: stopping - a dict of container ids => container names. Any running containers that have the same container-id as a key in the dict will be stopped. """ if stopping: self._logger.log(scalyr_logging.DEBUG_LEVEL_2, 'Stopping all docker loggers') for logger in self.docker_loggers: if logger.cid in stopping: logger.stop(wait_on_join=True, join_timeout=1) if self.__log_watcher: self.__log_watcher.remove_log_path( self.__module, logger.log_path) self.docker_loggers[:] = [ l for l in self.docker_loggers if l.cid not in stopping ] self.docker_logs[:] = [ l for l in self.docker_logs if l['cid'] not in stopping ] def __start_loggers(self, starting): """ Starts a list of DockerLoggers @param: starting - a list of DockerLoggers to start """ if starting: self._logger.log(scalyr_logging.DEBUG_LEVEL_2, 'Starting all docker loggers') docker_logs = self.__get_docker_logs(starting) self.__start_docker_logs(docker_logs) self.docker_logs.extend(docker_logs) def __start_docker_logs(self, docker_logs): for log in self.docker_logs: last_request = self.__get_last_request_for_log( log['log_config']['path']) if self.__log_watcher: log['log_config'] = self.__log_watcher.add_log_config( self.__module, log['log_config']) self.docker_loggers.append( self.__create_docker_logger(log, last_request)) def __get_last_request_for_log(self, path): result = datetime.datetime.fromtimestamp(self.__start_time) try: full_path = os.path.join(self.__log_path, path) fp = open(full_path, 'r', self.__readback_buffer_size) # seek readback buffer bytes from the end of the file fp.seek(0, os.SEEK_END) size = fp.tell() if size < self.__readback_buffer_size: fp.seek(0, os.SEEK_SET) else: fp.seek(size - self.__readback_buffer_size, os.SEEK_SET) first = True for line in fp: # ignore the first line because it likely started somewhere randomly # in the line if first: first = False continue dt, _ = _split_datetime_from_line(line) if dt: result = dt fp.close() except Exception, e: global_log.info("%s", str(e)) return scalyr_util.seconds_since_epoch(result)