Example #1
import logging
import os
import signal
from concurrent.futures import Executor, ProcessPoolExecutor
from typing import Optional

# Values os.cpu_count() may return when the CPU count cannot be determined
# (assumed definition; the original module defines CPU_EMPTY_VALUES elsewhere).
CPU_EMPTY_VALUES = (None, 0)

# Module-level singleton, created lazily by get_asyncio_executor().
_executor: Optional[ProcessPoolExecutor] = None


def get_asyncio_executor(cpu_count: Optional[int] = None) -> Executor:
    """
    Returns a global `ProcessPoolExecutor` instance.

    NOTE: We use the ProcessPoolExecutor to offload CPU-intensive tasks to
    separate processes so that we don't block the main networking process.
    This pattern only works correctly when used from a single process.  If
    multiple processes use this executor API we end up with more workers
    than there are CPU cores, at which point the networking process will be
    competing with all the worker processes for CPU resources.  If we ever
    need this in more than one process, we will need to come up with a
    different solution.
    """
    global _executor

    if _executor is None:
        # Use CPU_COUNT - 1 processes to make sure we always leave one CPU idle
        # so that it can run asyncio's event loop.
        if cpu_count is None:
            os_cpu_count = os.cpu_count()
            if os_cpu_count in CPU_EMPTY_VALUES:
                # Need this because os.cpu_count() returns None when the # of
                # CPUs is indeterminable.
                logger = logging.getLogger('p2p')
                logger.warning(
                    "Could not determine number of CPUs, defaulting to 1 instead of %s",
                    os_cpu_count,
                )
                cpu_count = 1
            else:
                cpu_count = max(1, os_cpu_count - 1)
        # The following block of code allows us to gracefully handle
        # `KeyboardInterrupt` in the worker processes.  This is accomplished
        # via two "hacks".
        #
        # First, we set the signal handler for SIGINT to the special value
        # `SIG_IGN`, which instructs the process to ignore SIGINT, saving the
        # original handler so it can be restored later.  We do this because
        # child processes inherit the signal handlers of their parent process.
        #
        # Second, we have to force the executor to initialize the worker
        # processes, as they are not initialized on instantiation, but rather
        # lazily when the first work is submitted.  We do this by calling the
        # private method `_start_queue_management_thread`.
        #
        # Finally, we restore the original signal handler now that we know the
        # child processes have been initialized to ensure that
        # `KeyboardInterrupt` in the main process is still handled normally.
        original_handler = signal.signal(signal.SIGINT, signal.SIG_IGN)
        _executor = ProcessPoolExecutor(cpu_count)
        _executor._start_queue_management_thread()  # type: ignore
        signal.signal(signal.SIGINT, original_handler)
    return _executor
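
Usage sketch (not part of the original example): a minimal illustration of how the returned executor is typically driven from asyncio, offloading CPU-bound work so the event loop stays responsive. `fibonacci` is a hypothetical stand-in for real CPU-bound work and must be a module-level (picklable) function.

import asyncio


def fibonacci(n: int) -> int:
    # Hypothetical CPU-bound work; stands in for whatever needs offloading.
    return n if n < 2 else fibonacci(n - 1) + fibonacci(n - 2)


async def compute_in_background() -> int:
    loop = asyncio.get_event_loop()
    # The event loop keeps servicing network I/O while a worker process computes.
    return await loop.run_in_executor(get_asyncio_executor(), fibonacci, 30)


# asyncio.run(compute_in_background())  # -> 832040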
Example #2
import json
import logging
import math
import time
from asyncio import get_event_loop
from collections import defaultdict
from concurrent.futures import ProcessPoolExecutor
from gzip import compress  # assumed: records are gzip-compressed for Firehose/CloudWatch
from boto3.session import Session  # assumed AWS SDK session class

import constant  # assumed local module providing FLUSH_SIZE, FLUSH_TIME, TIMER_INTERVAL, MAX_RECORD_SIZE

logger = logging.getLogger(__name__)


class MessageSink(object):
    __slots__ = [
        'spool', 'loop', 'executor', 'size', 'count', 'messages',
        'flushed', 'message_class', 'account', 'raw',
    ]

    def __init__(self, spool, message_class, raw):
        self.spool = spool
        self.message_class = message_class
        self.raw = raw
        self.loop = get_event_loop()
        self.executor = ProcessPoolExecutor()
        self.executor._start_queue_management_thread()
        self._schedule_flush()
        self.clear()
        self.account = '000000000000'
        try:
            session = Session(profile_name=spool.profile_name)
            client = session.client('sts', config=spool.config)
            self.account = client.get_caller_identity()['Account']
        except Exception:
            logger.warning('Unable to determine AWS Account ID; using default value.')

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.flush()

    async def write(self, source, message, timestamp):
        self.messages[source].append((message, timestamp))
        self.size += len(message)
        self.count += 1
        if self.size > constant.FLUSH_SIZE:
            await self.flush_async()
        return len(message)

    def clear(self):
        self.size = 0
        self.count = 0
        self.messages = defaultdict(list)
        self.flushed = time.time()

    async def flush_async(self):
        # Hand the current batch to a worker process; clear immediately so new
        # messages can accumulate while the batch is being spooled.  The returned
        # future is not awaited, so spooling proceeds in the background.
        self.loop.run_in_executor(
            self.executor,
            self._spool_messages,
            self.spool, self.messages, self.size, self.message_class, self.account, self.raw,
        )
        self.clear()

    def flush(self):
        self._spool_messages(self.spool, self.messages, self.size, self.message_class, self.account, self.raw)
        self.clear()

    def _schedule_flush(self):
        self.loop.call_later(constant.TIMER_INTERVAL, self._flush_timer)

    def _flush_timer(self):
        logger.debug('flush timer: messages={0} size={1} age={2}'.format(self.count, self.size, time.time() - self.flushed))
        if self.messages and time.time() - self.flushed >= constant.FLUSH_TIME:
            self.loop.create_task(self.flush_async())
        self._schedule_flush()

    @classmethod
    def _spool_messages(cls, spool, messages, size, message_class, account, raw):
        for i_source, i_messages in messages.items():
            events = list(message_class.create_events(i_source, i_messages))
            record = cls._prepare_record(i_source, events, message_class.name, account)
            compressed_record = cls._compress_record(raw, record)
            logger.debug('Events for {0} compressed from {1} to {2} bytes (with JSON framing)'.format(i_source, size, len(compressed_record)))

            if len(compressed_record) * 1.2 > constant.MAX_RECORD_SIZE:  # 1.2 multiplier gives ~20% headroom below the hard limit
                # This approach naively hopes that splitting a record into even parts will put
                # each part below the max record size. Further tuning may be required.
                split_count = math.ceil(len(compressed_record) * 1.2 / constant.MAX_RECORD_SIZE)
                logger.warning('Compressed record size of {0} bytes exceeds maximum Firehose record size of {1} bytes; splitting into {2} records'.format(
                    len(compressed_record),
                    constant.MAX_RECORD_SIZE,
                    split_count
                ))
                start = 0
                # Use ceiling division so the chunk size is never zero (int() truncation
                # could stall the loop when split_count exceeds the number of events);
                # this also avoids shadowing the `size` argument.
                step = math.ceil(len(record['logEvents']) / split_count)
                while start < len(record['logEvents']):
                    record_part = cls._prepare_record(i_source, record['logEvents'][start:start + step], message_class.name, account)
                    compressed_record = cls._compress_record(raw, record_part)
                    spool.write(compressed_record)
                    start += step
            else:
                spool.write(compressed_record)

    @classmethod
    def _prepare_record(cls, source, events, class_name, account):
        return {
            'owner': account,
            'logGroup': class_name,
            'logStream': source,
            'subscriptionFilters': [class_name],
            'messageType': 'DATA_MESSAGE',
            'logEvents': events,
        }

    @classmethod
    def _compress_record(cls, raw, record):
        if raw:
            events = record['logEvents']
            messages = [m['message'].split(' ', 3)[3] + '\n' for m in events]  # keep only the message body (drop the first three space-delimited fields)
            return ''.join(messages).encode()
        return compress(MessageSink.serialize(record))

    @classmethod
    def serialize(cls, data):
        return json.dumps(data).encode()
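
Usage sketch (not from the original source): `FakeSpool` and `FakeMessage` below are hypothetical stand-ins for the real spool and message classes, included only to illustrate the write/flush flow. Note that `_start_queue_management_thread` is a private CPython detail and may not exist under this name on newer Python versions.

import asyncio


class FakeSpool:
    # Hypothetical spool that just remembers what was written.
    profile_name = None
    config = None

    def __init__(self):
        self.records = []

    def write(self, record):
        self.records.append(record)


class FakeMessage:
    # Hypothetical message class producing CloudWatch Logs-style events.
    name = 'fake-log-group'

    @staticmethod
    def create_events(source, messages):
        for message, timestamp in messages:
            yield {'id': '0', 'timestamp': timestamp, 'message': message}


async def demo():
    spool = FakeSpool()
    with MessageSink(spool, FakeMessage, raw=False) as sink:
        await sink.write('host-1', 'hello world', 1700000000000)
    # __exit__ called flush(), so the compressed record is now in spool.records.
    sink.executor.shutdown()
    return spool.records


# asyncio.run(demo())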