def test_latency_microseconds(self):
    metrics = MetricsReporter(backend="test", sample_rate=1)
    metrics._sample_log_line_latency = mock.Mock(FakeMetric())
    with metrics.sampled_request():
        pass
    assert metrics._sample_log_line_latency.record.call_count == 1
    # time.time() has a resolution of roughly half a microsecond, so a recorded
    # value below that would mean we are tracking seconds or milliseconds
    # rather than microseconds.
    assert metrics._sample_log_line_latency.record.call_args[1]['value'] > 0.1
def test_fake_metric_functionality(self):
    metrics = MetricsReporter(backend="test", sample_rate=1)
    # Monkeypatch in the fake counters
    metrics._sample_log_line_latency = FakeMetric()
    metrics._total_log_line_sent = FakeMetric()
    assert metrics._sample_counter == 0
    # With sample_rate=1 every request is part of the sample, so the counter
    # resets once the context exits
    with metrics.sampled_request():
        assert metrics._sample_counter == 1
    assert metrics._sample_counter == 0
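# FakeMetric itself is not part of this excerpt; the tests above only need a
# stand-in whose record()/count() calls can be observed.  A hypothetical
# minimal test double (an assumption, not necessarily the repo's actual
# helper) could look like this:
class FakeMetric(object):
    def __init__(self):
        self.recorded_values = []   # every value passed to record()
        self.count_calls = 0        # how many times count() was invoked

    def record(self, value=None, **kwargs):
        # keep recorded latency values so tests can assert on them
        self.recorded_values.append(value)

    def count(self, *args, **kwargs):
        # track counter bumps such as _total_log_line_sent
        self.count_calls += 1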
def test_metrics_reporter_sampling(self):
    metrics = MetricsReporter(backend="test", sample_rate=3)
    assert metrics._sample_counter == 0
    # First try, not part of sample
    with metrics.sampled_request():
        assert metrics._sample_counter == 1
    # Second try, not part of sample
    with metrics.sampled_request():
        assert metrics._sample_counter == 2
    # Third try, part of sample, so the counter resets; checked outside the context
    with metrics.sampled_request():
        assert metrics._sample_counter == 3
    assert metrics._sample_counter == 0
class MonkLogger(object):
    """Wrapper around MonkProducer"""

    def __init__(self, client_id, host=None, port=None, stream_backend_map={}):
        self.stream_prefix = config.monk_stream_prefix
        self.report_status = get_default_reporter()
        self.metrics = MetricsReporter(
            sample_rate=config.metrics_sample_rate,
            backend="monk"
        )
        self.timeout_backoff_s = config.monk_timeout_backoff_ms / 1000
        self.stream_backend_map = stream_backend_map
        self.last_timeout = time.time() - self.timeout_backoff_s
        self.producer = MonkProducer(
            client_id,
            host,
            port,
            timeout_ms=config.monk_timeout_ms,
            collect_metrics=False
        )

    def log_line(self, stream, line):
        # skip streams that are not routed to monk
        backend = self.stream_backend_map.get(stream, config.default_backend)
        if backend not in ('monk', 'dual'):
            return
        # back off if monk timed out recently
        if time.time() < self.last_timeout + self.timeout_backoff_s:
            return
        with self.metrics.sampled_request():
            try:
                self.producer.send_messages(
                    self.stream_prefix + stream, [line], None
                )
            except socket.timeout:
                self.report_status(True, 'Monk took too long to respond')
                self.last_timeout = time.time()
                self.metrics.monk_timeout()
            except Exception as e:
                self.report_status(
                    True,
                    'Exception while sending to monk: %s' % str(e)
                )
                self.metrics.monk_exception()
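# A hedged usage sketch for MonkLogger.  The client id, host, port and stream
# name below are placeholders (assumptions, not values taken from the code
# above); the stream prefix and timeouts come from the module-level config.
monk_logger = MonkLogger(
    'example-client-id',                        # placeholder client_id
    host='localhost',                           # placeholder monk host
    port=1473,                                  # placeholder monk port
    stream_backend_map={'my_stream': 'monk'},   # route this stream to monk
)
monk_logger.log_line('my_stream', b'hello monk')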
def test_zero_sample_rate(self):
    metrics = MetricsReporter(backend="test", sample_rate=0)
    assert metrics._sample_counter == 0
    # Should never sample
    with metrics.sampled_request():
        assert metrics._sample_counter == 1
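# A minimal sketch of the sampling behaviour implied by the tests above; this
# is NOT the library's actual MetricsReporter, just an illustration of the
# contract: the counter increments on every request, and when it reaches
# sample_rate the request's latency is recorded and the counter resets, while
# sample_rate=0 never records.
import contextlib
import time


class SketchMetricsReporter(object):
    def __init__(self, sample_rate):
        self.sample_rate = sample_rate
        self._sample_counter = 0

    @contextlib.contextmanager
    def sampled_request(self):
        self._sample_counter += 1
        start = time.time()
        try:
            yield
        finally:
            if self.sample_rate and self._sample_counter >= self.sample_rate:
                # record latency in microseconds, matching test_latency_microseconds
                self._record_latency((time.time() - start) * 1e6)
                self._sample_counter = 0

    def _record_latency(self, value):
        # placeholder for the real metric backend (e.g. a FakeMetric in tests)
        pass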
class ScribeLogger(object):
    """Implementation that logs to a scribe server. If errors are encountered,
    drop lines and retry occasionally.

    :param host: hostname of the scribe server
    :param port: port number of the scribe server
    :param retry_interval: number of seconds to wait between retries
    :param report_status: a function `report_status(is_error, msg)` which is
        called to print out errors and status messages. The first argument
        indicates whether what is being printed is an error or not, and the
        second argument is the actual message.
    :param logging_timeout: milliseconds to time out scribe logging; "0" means
        blocking (no timeout)
    """

    def __init__(self, host, port, retry_interval, report_status=None,
                 logging_timeout=None, stream_backend_map={}):
        # set up thrift and scribe objects
        timeout = logging_timeout if logging_timeout is not None else config.scribe_logging_timeout
        self.socket = thriftpy.transport.socket.TSocket(six.text_type(host), int(port))
        if timeout:
            self.socket.set_timeout(timeout)
        self.stream_backend_map = stream_backend_map
        self.transport = TFramedTransportFactory().get_transport(self.socket)
        protocol = TBinaryProtocolFactory(strict_read=False).get_protocol(self.transport)
        self.client = TClient(scribe_thrift.scribe, protocol)

        # our own bookkeeping for connection
        self.connected = False      # whether or not we think we're currently connected to the scribe server
        self.last_connect_time = 0  # last time we got disconnected or failed to reconnect
        self.retry_interval = retry_interval
        self.report_status = report_status or get_default_reporter()
        self.__lock = threading.RLock()
        self._birth_pid = os.getpid()
        self.metrics = MetricsReporter(
            sample_rate=config.metrics_sample_rate,
            backend="scribe"
        )

    def _maybe_reconnect(self):
        """Try (re)connecting to the server if it's been long enough since
        our last attempt.
        """
        assert not self.connected

        # don't retry too often
        now = time.time()
        if (now - self.last_connect_time) > self.retry_interval:
            try:
                self.transport.open()
                self.connected = True
            except TTransportException:
                self.last_connect_time = now
                self.report_status(True, 'yelp_clog failed to connect to scribe server')

    def _log_line_no_size_limit(self, stream, line):
        """Log a single line without size limit. It should not include any
        newline characters. Since this method is called in log_line, the line
        should already be utf-8 encoded and smaller than
        MAX_LINE_SIZE_IN_BYTES. We don't limit traceback size.
        """
        with self.__lock, self.metrics.sampled_request():
            if os.getpid() != self._birth_pid:
                raise ScribeIsNotForkSafeError
            if not self.connected:
                self._maybe_reconnect()

            if self.connected:
                log_entry = scribe_thrift.LogEntry(category=scribify(stream),
                                                   message=line + b'\n')
                try:
                    return self.client.Log(messages=[log_entry])
                except Exception as e:
                    try:
                        self.report_status(
                            True,
                            'yelp_clog failed to log to scribe server with '
                            'exception: %s(%s)' % (type(e), six.text_type(e))
                        )
                    finally:
                        self.close()
                        self.last_connect_time = time.time()

                    # Don't reconnect if report_status raises an exception
                    self._maybe_reconnect()

    def log_line(self, stream, line):
        """Log a single line. It should not include any newline characters.
        If the line size is over 50 MB, an exception is raised and the line is
        dropped. If the line size is over 5 MB, a message containing the
        origin stream information is recorded to WHO_CLOG_LARGE_LINE_STREAM
        (in JSON format).
        """
        backend = self.stream_backend_map.get(stream, config.default_backend)
        if backend not in ('scribe', 'dual'):
            return

        # log unicodes as their utf-8 encoded representation
        if isinstance(line, six.text_type):
            line = line.encode('UTF-8')

        # check log line size
        if len(line) <= WARNING_LINE_SIZE_IN_BYTES:
            self._log_line_no_size_limit(stream, line)
        elif len(line) <= MAX_LINE_SIZE_IN_BYTES:
            self._log_line_no_size_limit(stream, line)

            # log the origin of the stream with traceback to WHO_CLOG_LARGE_LINE_STREAM category
            origin_info = {}
            origin_info['stream'] = stream
            origin_info['line_size'] = len(line)
            origin_info['traceback'] = ''.join(traceback.format_stack())
            log_line = json.dumps(origin_info).encode('UTF-8')
            self._log_line_no_size_limit(WHO_CLOG_LARGE_LINE_STREAM, log_line)
            self.report_status(
                False,
                'The log line size is larger than %r bytes (monitored in \'%s\')'
                % (WARNING_LINE_SIZE_IN_BYTES, WHO_CLOG_LARGE_LINE_STREAM)
            )
        else:
            # raise an exception if too large
            self.report_status(
                True,
                'The log line is dropped (line size larger than %r bytes)'
                % MAX_LINE_SIZE_IN_BYTES
            )
            raise LogLineIsTooLongError('The max log line size allowed is %r bytes' % MAX_LINE_SIZE_IN_BYTES)

    def close(self):
        self.transport.close()
        self.connected = False