from copy import deepcopy
from time import time
from urllib import parse

from gevent import getcurrent, sleep, spawn
from gevent.queue import Empty, PriorityQueue

# LOGGER, idle, BackwardWorker, ForwardWorker, ResourceClient, PrioritizedItem,
# DEFAULT_RETRIEVERS_PARAMS and DEFAULT_FORWARD_HEARTBEAT are assumed to come
# from the surrounding package.


class SyncClient:
    idle = idle
    backward_class = BackwardWorker
    forward_class = ForwardWorker

    def __init__(self,
                 host_url,
                 resource,
                 auth=None,
                 params=None,
                 headers=None,
                 retrievers_params=DEFAULT_RETRIEVERS_PARAMS,
                 adaptive=False,
                 with_priority=False):
        LOGGER.info(f'Init SyncClient for resource {resource}')
        self.host = host_url
        self.auth = auth
        self.resource = resource
        self.adaptive = adaptive
        self.headers = headers

        self.params = params or {}
        self.retrievers_params = retrievers_params
        self.queue = PriorityQueue(maxsize=retrievers_params['queue_size'])

    def init_clients(self):
        self.backward_client = ResourceClient(self.host, self.resource,
                                              self.params, self.auth,
                                              self.headers)
        self.forward_client = ResourceClient(self.host, self.resource,
                                             self.params, self.auth,
                                             self.headers)

    def handle_response_data(self, data):
        for resource_item in data:
            self.queue.put(PrioritizedItem(1, resource_item))

    def worker_watcher(self):
        while True:
            if time() - self.heartbeat > DEFAULT_FORWARD_HEARTBEAT:
                LOGGER.warning(
                    'Restarting sync: last response from workers was more than 15 minutes ago.'
                )
                self.restart_sync()
                return  # start_sync() has spawned a replacement watcher
            sleep(300)

    def start_sync(self):
        LOGGER.info('Start sync...')

        data = self.backward_client.get_resource_items(self.params)

        self.handle_response_data(data[f'{self.resource}s'])

        forward_params = deepcopy(self.params)
        forward_params.update({
            k: v[0]
            for k, v in parse.parse_qs(parse.urlparse(
                data.links.prev).query).items()
        })
        backward_params = deepcopy(self.params)
        backward_params.update({
            k: v[0]
            for k, v in parse.parse_qs(parse.urlparse(
                data.links.next).query).items()
        })

        self.forward_worker = self.forward_class(sync_client=self,
                                                 client=self.forward_client,
                                                 params=forward_params)
        self.backward_worker = self.backward_class(sync_client=self,
                                                   client=self.backward_client,
                                                   params=backward_params)
        self.workers = [self.forward_worker, self.backward_worker]

        for worker in self.workers:
            worker.start()
        self.heartbeat = time()
        self.watcher = spawn(self.worker_watcher)

    def restart_sync(self):
        """
        Restart retrieving from OCDS API.
        """

        LOGGER.info('Restart workers')
        for worker in self.workers:
            worker.kill()
        if self.watcher is not getcurrent():
            # Killing the watcher from inside itself would abort the restart.
            self.watcher.kill()
        self.init_clients()
        self.start_sync()

    def get_resource_items(self):
        self.init_clients()
        self.start_sync()
        while True:
            if self.forward_worker.check() or self.backward_worker.check():
                self.restart_sync()
            while not self.queue.empty():
                LOGGER.debug(f'Sync queue size: {self.queue.qsize()}',
                             extra={'SYNC_QUEUE_SIZE': self.queue.qsize()})
                LOGGER.debug('Yield resource item',
                             extra={'MESSAGE_ID': 'sync_yield'})
                item = self.queue.get()
                yield item.data
            LOGGER.debug(f'Sync queue size: {self.queue.qsize()}',
                         extra={'SYNC_QUEUE_SIZE': self.queue.qsize()})
            try:
                self.queue.peek(block=True, timeout=0.1)
            except Empty:
                pass
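
The snippet never defines the PrioritizedItem wrapper it puts on the queue. A minimal stand-in, assuming items should order on priority only:

from dataclasses import dataclass, field
from typing import Any


@dataclass(order=True)
class PrioritizedItem:
    priority: int
    data: Any = field(compare=False)  # payload is excluded from ordering
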
Example #2
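
The class below depends on a PriorityEntry wrapper that the snippet does not include. A minimal sketch, together with the imports the class needs (entries compare on priority only):

import logging
import time
from dataclasses import dataclass, field
from queue import PriorityQueue
from typing import Any

logger = logging.getLogger(__name__)


@dataclass(order=True)
class PriorityEntry:
    priority: int
    data: Any = field(compare=False)  # payload is excluded from ordering
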
class PriorityBlockingQueue(object):
    """
    带优先级的阻塞队列。
    优先级数字越小,优先级越高。

    插入元素:
    * put: 向队列尾部插入一个元素,如果该队列已满,则一直阻塞。
    * offer: 向队列尾部插入一个元素,插入成功返回True。插入失败返回False。

    获取元素:
    * poll: 获取并移除队列的头元素,若队列为空,则返回null。
    * take: 获取并移除队列的头元素,若队列为空,则一直阻塞。
    * peek:获取但不移除队列的头元素,若队列为空,则返回null

    队列状态状态:
    * qsize:获取队列中当前元素数量
    * maxsize:获取队列的最大容量
    """
    def __init__(self, maxsize: int = 0):
        """
        :param maxsize: maximum capacity of the queue; 0 means unbounded,
            matching ``queue.PriorityQueue``
        """
        self.__queue = PriorityQueue(maxsize=maxsize)

    def put(self, item, priority: int = 200) -> None:
        """
        Insert an element, blocking while the queue is full.
        :param item: the element
        :param priority: its priority (lower numbers are served first)
        :return: None
        """
        while True:
            try:
                self.__queue.put(PriorityEntry(priority, item))
                break
            except Exception as e:
                logger.debug("put data failed error -> {0}".format(e))
            time.sleep(0.5)

    def offer(self, item, priority: int = 200) -> bool:
        """
        Insert an element without blocking.
        :param item: the element
        :param priority: its priority
        :return: True on success, False if the queue is full
        """
        try:
            self.__queue.put(PriorityEntry(priority, item), block=False)
            return True
        except Exception as e:
            logger.debug("offer data failed error -> {0}".format(e))
        return False

    def poll(self):
        """
        Remove and return the head of the queue, or None if the queue is empty.
        """
        try:
            return self.__queue.get(block=False).data
        except Exception as e:
            logger.debug("poll data failed error -> {0}".format(e))
        return None

    def take(self):
        """
        Remove and return the head of the queue, blocking while the queue is empty.
        """
        while True:
            try:
                return self.__queue.get().data
            except Exception as e:
                logger.debug("take data failed error -> {0}".format(e))
            time.sleep(0.5)

    def peek(self):
        """
        Return the head of the queue without removing it, or None if the queue is empty.
        """
        try:
            # queue.PriorityQueue has no peek(); read the top of the underlying
            # heap under the queue's own lock instead.
            with self.__queue.mutex:
                return self.__queue.queue[0].data
        except IndexError:
            return None

    def qsize(self) -> int:
        """
        Number of elements currently in the queue.
        """
        return self.__queue.qsize()

    def maxsize(self) -> int:
        """
        Maximum capacity of the queue.
        """
        return self.__queue.maxsize
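
A short usage sketch: lower numbers are served first, and the non-blocking accessors return None on an empty queue.

q = PriorityBlockingQueue(maxsize=10)
q.put('routine', priority=200)
q.put('urgent', priority=1)
assert q.peek() == 'urgent'   # head is the highest-priority element
assert q.poll() == 'urgent'
assert q.take() == 'routine'
assert q.poll() is None       # empty queue
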
Example #3
import re
import sys
from datetime import datetime, timedelta

import gevent
from gevent.pool import Pool
from gevent.queue import Queue, PriorityQueue, Empty
from dateutil.parser import parse
from termcolor import colored

# AWSConnection, the exceptions module and the NO_MORE_EVENTS sentinel are
# assumed to come from the surrounding awslogs package.


class AWSLogs(object):

    ACTIVE = 1
    EXHAUSTED = 2
    WATCH_SLEEP = 2

    def __init__(self, **kwargs):
        self.connection_cls = kwargs.get('connection_cls', AWSConnection)
        self.aws_region = kwargs.get('aws_region')
        self.aws_access_key_id = kwargs.get('aws_access_key_id')
        self.aws_secret_access_key = kwargs.get('aws_secret_access_key')
        self.log_group_name = kwargs.get('log_group_name')
        self.log_stream_name = kwargs.get('log_stream_name')
        self.watch = kwargs.get('watch')
        self.color_enabled = kwargs.get('color_enabled')
        self.output_stream_enabled = kwargs.get('output_stream_enabled')
        self.output_group_enabled = kwargs.get('output_group_enabled')
        self.start = self.parse_datetime(kwargs.get('start'))
        self.end = self.parse_datetime(kwargs.get('end'))
        self.pool_size = max(kwargs.get('pool_size', 0), 10)
        self.max_group_length = 0
        self.max_stream_length = 0
        self.publishers = []
        self.events_queue = Queue()
        self.raw_events_queue = PriorityQueue()
        self.publishers_queue = PriorityQueue()
        self.stream_status = {}
        self.stream_max_timestamp = {}
        self.connection = self.connection_cls(
            self.aws_region,
            aws_access_key_id=self.aws_access_key_id,
            aws_secret_access_key=self.aws_secret_access_key
        )

    def _get_streams_from_patterns(self, log_group_pattern, log_stream_pattern):
        """Returns pairs of group, stream matching ``log_group_pattern`` and
        ``log_stream_pattern``."""
        for group in self._get_groups_from_pattern(log_group_pattern):
            for stream in self._get_streams_from_pattern(group,
                                                         log_stream_pattern):
                yield group, stream

    def _get_groups_from_pattern(self, pattern):
        """Returns groups matching ``pattern``."""
        pattern = '.*' if pattern == 'ALL' else pattern
        reg = re.compile('^{0}'.format(pattern))
        for group in self.get_groups():
            if re.match(reg, group):
                yield group

    def _get_streams_from_pattern(self, group, pattern):
        """Returns streams in ``group`` matching ``pattern``."""
        pattern = '.*' if pattern == 'ALL' else pattern
        reg = re.compile('^{0}'.format(pattern))
        for stream in self.get_streams(group):
            if re.match(reg, stream):
                yield stream

    def _publisher_queue_consumer(self):
        """Consume ``publishers_queue`` api calls, run them and publish log
        events to ``raw_events_queue``. If ``nextForwardToken`` is present
        register a new api call into ``publishers_queue`` using as weight
        the timestamp of the latest event."""
        while True:
            try:
                _, (log_group_name, log_stream_name, next_token) = self.publishers_queue.get(block=False)
            except Empty:
                if self.watch:
                    gevent.sleep(self.WATCH_SLEEP)
                    continue
                else:
                    break

            response = self.connection.get_log_events(
                next_token=next_token,
                log_group_name=log_group_name,
                log_stream_name=log_stream_name,
                start_time=self.start,
                end_time=self.end,
                start_from_head=True
            )

            if not response['events']:
                self.stream_status[(log_group_name, log_stream_name)] = self.EXHAUSTED
                continue

            self.stream_status[(log_group_name, log_stream_name)] = self.ACTIVE

            for event in response['events']:
                event['group'] = log_group_name
                event['stream'] = log_stream_name
                self.raw_events_queue.put((event['timestamp'], event))
                self.stream_max_timestamp[(log_group_name, log_stream_name)] = event['timestamp']

            if 'nextForwardToken' in response:
                self.publishers_queue.put(
                    (response['events'][-1]['timestamp'],
                     (log_group_name, log_stream_name, response['nextForwardToken']))
                )

    def _get_min_timestamp(self):
        """Return the minimum timestamp available across all active streams."""
        pending = [self.stream_max_timestamp[k] for k, v in self.stream_status.items() if v != self.EXHAUSTED]
        return min(pending) if pending else None

    def _get_all_streams_exhausted(self):
        """Return True if all streams are exhausted."""
        return all(s == self.EXHAUSTED for s in self.stream_status.values())

    def _raw_events_queue_consumer(self):
        """Consume events from ``raw_events_queue`` if all active streams
        have already publish events up to the ``_get_min_timestamp`` and
        register them in order into ``events_queue``."""
        while True:
            if self._get_all_streams_exhausted() and self.raw_events_queue.empty():
                if self.watch:
                    gevent.sleep(self.WATCH_SLEEP)
                    continue
                self.events_queue.put(NO_MORE_EVENTS)
                break

            try:
                timestamp, line = self.raw_events_queue.peek(timeout=1)
            except Empty:
                continue

            min_timestamp = self._get_min_timestamp()
            if min_timestamp and min_timestamp < timestamp:
                gevent.sleep(0.3)
                continue

            timestamp, line = self.raw_events_queue.get()

            output = [line['message']]
            if self.output_stream_enabled:
                output.insert(
                    0,
                    self.color(
                        line['stream'].ljust(self.max_stream_length, ' '),
                        'cyan'
                    )
                )
            if self.output_group_enabled:
                output.insert(
                    0,
                    self.color(
                        line['group'].ljust(self.max_group_length, ' '),
                        'green'
                    )
                )
            self.events_queue.put("{0}\n".format(' '.join(output)))

    def _events_consumer(self):
        """Print events from ``events_queue`` as soon as they are available."""
        while True:
            event = self.events_queue.get(True)
            if event == NO_MORE_EVENTS:
                break
            sys.stdout.write(event)
            sys.stdout.flush()

    def list_logs(self):
        self.register_publishers()

        pool = Pool(size=self.pool_size)
        pool.spawn(self._raw_events_queue_consumer)
        pool.spawn(self._events_consumer)

        if self.watch:
            pool.spawn(self.register_publishers_periodically)

        for i in range(self.pool_size):
            pool.spawn(self._publisher_queue_consumer)
        pool.join()

    def register_publishers(self):
        """Register publishers into ``publishers_queue``."""
        for group, stream in self._get_streams_from_patterns(self.log_group_name, self.log_stream_name):
            if (group, stream) in self.publishers:
                continue
            self.publishers.append((group, stream))
            self.max_group_length = max(self.max_group_length, len(group))
            self.max_stream_length = max(self.max_stream_length, len(stream))
            self.publishers_queue.put((0, (group, stream, None)))
            self.stream_status[(group, stream)] = self.ACTIVE
            self.stream_max_timestamp[(group, stream)] = -1

    def register_publishers_periodically(self):
        while True:
            self.register_publishers()
            gevent.sleep(2)

    def list_groups(self):
        """Lists available CloudWatch logs groups"""
        for group in self.get_groups():
            print(group)

    def list_streams(self, *args, **kwargs):
        """Lists available CloudWatch logs streams in ``log_group_name``."""
        for stream in self.get_streams(*args, **kwargs):
            print(stream)

    def get_groups(self):
        """Returns available CloudWatch logs groups"""
        next_token = None
        while True:
            response = self.connection.describe_log_groups(next_token=next_token)

            for group in response.get('logGroups', []):
                yield group['logGroupName']

            if 'nextToken' in response:
                next_token = response['nextToken']
            else:
                break

    def get_streams(self, log_group_name=None):
        """Returns available CloudWatch logs streams in ``log_group_name``."""
        log_group_name = log_group_name or self.log_group_name
        next_token = None

        while True:
            response = self.connection.describe_log_streams(
                log_group_name=log_group_name,
                next_token=next_token
            )

            for stream in response.get('logStreams', []):
                yield stream['logStreamName']

            if 'nextToken' in response:
                next_token = response['nextToken']
            else:
                break

    def color(self, text, color):
        """Returns coloured version of ``text`` if ``color_enabled``."""
        if self.color_enabled:
            return colored(text, color)
        return text

    def parse_datetime(self, datetime_text):
        """Parse ``datetime_text`` into a ``datetime``."""
        if not datetime_text:
            return None

        ago_match = re.match(r'(\d+)\s?(m|minute|minutes|h|hour|hours|d|day|days|w|week|weeks)(?: ago)?', datetime_text)
        if ago_match:
            amount, unit = ago_match.groups()
            amount = int(amount)
            unit = {'m': 60, 'h': 3600, 'd': 86400, 'w': 604800}[unit[0]]
            date = datetime.now() + timedelta(seconds=unit * amount * -1)
        else:
            try:
                date = parse(datetime_text)
            except ValueError:
                raise exceptions.UnknownDateError(datetime_text)

        # datetime.timestamp() is portable, unlike strftime("%s")
        return int(date.timestamp()) * 1000
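
The raw-events consumer above is effectively a k-way merge: events from many streams come out in timestamp order because the PriorityQueue sorts on the first element of each tuple. A minimal self-contained sketch of that idea:

from queue import PriorityQueue

merged = PriorityQueue()
for ts, msg in [(30, 'c'), (10, 'a'), (20, 'b')]:  # events from different streams
    merged.put((ts, msg))
while not merged.empty():
    print(merged.get())  # (10, 'a'), (20, 'b'), (30, 'c')

Note that ties on the timestamp fall through to comparing the payloads, which raises TypeError for dicts, so production code usually inserts a unique tiebreaker between the two.
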
Example #4
import re
import sys
from datetime import datetime, timedelta

import gevent
from gevent.pool import Pool
from gevent.queue import Queue, PriorityQueue, Empty
from dateutil.parser import parse
from termcolor import colored

# AWSConnection, the exceptions module and the NO_MORE_EVENTS sentinel are
# assumed to come from the surrounding awslogs package.


class AWSLogs(object):

    ACTIVE = 1
    EXHAUSTED = 2
    WATCH_SLEEP = 2

    def __init__(self, **kwargs):
        self.connection_cls = kwargs.get('connection_cls', AWSConnection)
        self.aws_region = kwargs.get('aws_region')
        self.aws_access_key_id = kwargs.get('aws_access_key_id')
        self.aws_secret_access_key = kwargs.get('aws_secret_access_key')
        self.log_group_name = kwargs.get('log_group_name')
        self.log_stream_name = kwargs.get('log_stream_name')
        self.watch = kwargs.get('watch')
        self.color_enabled = kwargs.get('color_enabled')
        self.output_stream_enabled = kwargs.get('output_stream_enabled')
        self.output_group_enabled = kwargs.get('output_group_enabled')
        self.start = self.parse_datetime(kwargs.get('start'))
        self.end = self.parse_datetime(kwargs.get('end'))
        self.pool_size = max(kwargs.get('pool_size', 0), 10)
        self.max_group_length = 0
        self.max_stream_length = 0
        self.publishers = []
        self.events_queue = Queue()
        self.raw_events_queue = PriorityQueue()
        self.publishers_queue = PriorityQueue()
        self.stream_status = {}
        self.stream_max_timestamp = {}

        self.connection = self.connection_cls(
            self.aws_region,
            aws_access_key_id=self.aws_access_key_id,
            aws_secret_access_key=self.aws_secret_access_key)

    def _get_streams_from_patterns(self, log_group_pattern,
                                   log_stream_pattern):
        """Returns pairs of group, stream matching ``log_group_pattern`` and
        ``log_stream_pattern``."""
        for group in self._get_groups_from_pattern(log_group_pattern):
            for stream in self._get_streams_from_pattern(
                    group, log_stream_pattern):
                yield group, stream

    def _get_groups_from_pattern(self, pattern):
        """Returns groups matching ``pattern``."""
        pattern = '.*' if pattern == 'ALL' else pattern
        reg = re.compile('^{0}'.format(pattern))
        for group in self.get_groups():
            if re.match(reg, group):
                yield group

    def _get_streams_from_pattern(self, group, pattern):
        """Returns streams in ``group`` matching ``pattern``."""
        pattern = '.*' if pattern == 'ALL' else pattern
        reg = re.compile('^{0}'.format(pattern))
        for stream in self.get_streams(group):
            if re.match(reg, stream):
                yield stream

    def _publisher_queue_consumer(self):
        """Consume ``publishers_queue`` api calls, run them and publish log
        events to ``raw_events_queue``. If ``nextForwardToken`` is present
        register a new api call into ``publishers_queue`` using as weight
        the timestamp of the latest event."""
        while True:
            try:
                _, (log_group_name, log_stream_name,
                    next_token) = self.publishers_queue.get(block=False)
            except Empty:
                if self.watch:
                    gevent.sleep(self.WATCH_SLEEP)
                    continue
                else:
                    break

            response = self.connection.get_log_events(
                next_token=next_token,
                log_group_name=log_group_name,
                log_stream_name=log_stream_name,
                start_time=self.start,
                end_time=self.end,
                start_from_head=True)

            if not response['events']:
                self.stream_status[(log_group_name,
                                    log_stream_name)] = self.EXHAUSTED
                continue

            self.stream_status[(log_group_name, log_stream_name)] = self.ACTIVE

            for event in response['events']:
                event['group'] = log_group_name
                event['stream'] = log_stream_name
                self.raw_events_queue.put((event['timestamp'], event))
                self.stream_max_timestamp[(
                    log_group_name, log_stream_name)] = event['timestamp']

            if 'nextForwardToken' in response:
                self.publishers_queue.put((response['events'][-1]['timestamp'],
                                           (log_group_name, log_stream_name,
                                            response['nextForwardToken'])))

    def _get_min_timestamp(self):
        """Return the minimum timestamp available across all active streams."""
        pending = [
            self.stream_max_timestamp[k]
            for k, v in self.stream_status.items() if v != self.EXHAUSTED
        ]
        return min(pending) if pending else None

    def _get_all_streams_exhausted(self):
        """Return True if all streams are exhausted."""
        return all(s == self.EXHAUSTED for s in self.stream_status.values())

    def _raw_events_queue_consumer(self):
        """Consume events from ``raw_events_queue`` if all active streams
        have already publish events up to the ``_get_min_timestamp`` and
        register them in order into ``events_queue``."""
        while True:
            if (self._get_all_streams_exhausted()
                    and self.raw_events_queue.empty()):
                if self.watch:
                    gevent.sleep(self.WATCH_SLEEP)
                    continue
                self.events_queue.put(NO_MORE_EVENTS)
                break

            try:
                timestamp, line = self.raw_events_queue.peek(timeout=1)
            except Empty:
                continue

            min_timestamp = self._get_min_timestamp()
            if min_timestamp and min_timestamp < timestamp:
                gevent.sleep(0.3)
                continue

            timestamp, line = self.raw_events_queue.get()

            output = [line['message']]
            if self.output_stream_enabled:
                output.insert(
                    0,
                    self.color(
                        line['stream'].ljust(self.max_stream_length, ' '),
                        'cyan'))
            if self.output_group_enabled:
                output.insert(
                    0,
                    self.color(line['group'].ljust(self.max_group_length, ' '),
                               'green'))
            self.events_queue.put("{0}\n".format(' '.join(output)))

    def _events_consumer(self):
        """Print events from ``events_queue`` as soon as they are available."""
        while True:
            event = self.events_queue.get(True)
            if event == NO_MORE_EVENTS:
                break
            sys.stdout.write(event)
            sys.stdout.flush()

    def list_logs(self):
        self.register_publishers()

        pool = Pool(size=self.pool_size)
        pool.spawn(self._raw_events_queue_consumer)
        pool.spawn(self._events_consumer)

        if self.watch:
            pool.spawn(self.register_publishers_periodically)

        for i in range(self.pool_size):
            pool.spawn(self._publisher_queue_consumer)
        pool.join()

    def register_publishers(self):
        """Register publishers into ``publishers_queue``."""
        for group, stream in self._get_streams_from_patterns(
                self.log_group_name, self.log_stream_name):
            if (group, stream) in self.publishers:
                continue
            self.publishers.append((group, stream))
            self.max_group_length = max(self.max_group_length, len(group))
            self.max_stream_length = max(self.max_stream_length, len(stream))
            self.publishers_queue.put((0, (group, stream, None)))
            self.stream_status[(group, stream)] = self.ACTIVE
            self.stream_max_timestamp[(group, stream)] = -1

    def register_publishers_periodically(self):
        while True:
            self.register_publishers()
            gevent.sleep(2)

    def list_groups(self):
        """Lists available CloudWatch logs groups"""
        for group in self.get_groups():
            print(group)

    def list_streams(self, *args, **kwargs):
        """Lists available CloudWatch logs streams in ``log_group_name``."""
        for stream in self.get_streams(*args, **kwargs):
            print(stream)

    def get_groups(self):
        """Returns available CloudWatch logs groups"""
        next_token = None
        while True:
            response = self.connection.describe_log_groups(
                next_token=next_token)

            for group in response.get('logGroups', []):
                yield group['logGroupName']

            if 'nextToken' in response:
                next_token = response['nextToken']
            else:
                break

    def get_streams(self, log_group_name=None):
        """Returns available CloudWatch logs streams in ``log_group_name``."""
        log_group_name = log_group_name or self.log_group_name
        next_token = None
        window_start = self.start or 0
        window_end = self.end or sys.maxsize

        while True:
            response = self.connection.describe_log_streams(
                log_group_name=log_group_name, next_token=next_token)

            for stream in response.get('logStreams', []):
                if max(stream['firstEventTimestamp'], window_start) <= \
                   min(stream['lastEventTimestamp'], window_end):
                    yield stream['logStreamName']

            if 'nextToken' in response:
                next_token = response['nextToken']
            else:
                break

    def color(self, text, color):
        """Returns coloured version of ``text`` if ``color_enabled``."""
        if self.color_enabled:
            return colored(text, color)
        return text

    def parse_datetime(self, datetime_text):
        """Parse ``datetime_text`` into a ``datetime``."""
        if not datetime_text:
            return None

        ago_match = re.match(
            r'(\d+)\s?(m|minute|minutes|h|hour|hours|d|day|days|w|week|weeks)(?: ago)?',
            datetime_text)
        if ago_match:
            amount, unit = ago_match.groups()
            amount = int(amount)
            unit = {'m': 60, 'h': 3600, 'd': 86400, 'w': 604800}[unit[0]]
            date = datetime.now() + timedelta(seconds=unit * amount * -1)
        else:
            try:
                date = parse(datetime_text)
            except ValueError:
                raise exceptions.UnknownDateError(datetime_text)

        # datetime.timestamp() is portable, unlike strftime("%s")
        return int(date.timestamp()) * 1000
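
A usage sketch, assuming the surrounding awslogs package provides AWSConnection; the group and stream names are hypothetical:

logs = AWSLogs(
    aws_region='us-east-1',
    log_group_name='my-group',    # hypothetical group name
    log_stream_name='ALL',        # 'ALL' matches every stream
    start='1 hour ago',           # relative dates go through parse_datetime
    end=None,
    watch=False,
    color_enabled=True,
    output_group_enabled=True,
    output_stream_enabled=True,
)
logs.list_logs()  # prints matching events to stdout in timestamp order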