Example #1
0
class UIDStack(object):
    """Stack of ``(uid, metadata)`` pairs built on gevent.queue.LifoQueue.

    The ``metadata`` half of a pair is allowed to be None.
    """

    def __init__(self):
        """Create the backing LifoQueue."""
        self._lifoqueue = LifoQueue()

    def empty(self):
        """Return whatever the backing queue's ``empty()`` reports."""
        backing = self._lifoqueue
        return backing.empty()

    def get(self):
        """Remove and return an entry via the queue's ``get_nowait()``."""
        backing = self._lifoqueue
        return backing.get_nowait()

    def peek(self):
        """Return the last entry of the backing list without removing it.

        Stand-in for LifoQueue.peek_nowait(), which is buggy in the gevent
        release in use; can be switched over with gevent version 1.0.2.
        """
        entries = self._lifoqueue.queue
        return entries[-1]

    def put(self, uid, metadata):
        """Push the pair ``(uid, metadata)`` onto the stack."""
        pair = (uid, metadata)
        self._lifoqueue.put(pair)

    def discard(self, objects):
        """Drop every stored entry that is contained in ``objects``.

        The backing list is replaced by a new list holding the survivors in
        their original order.
        """
        survivors = []
        for entry in self._lifoqueue.queue:
            if entry in objects:
                continue
            survivors.append(entry)
        self._lifoqueue.queue = survivors

    def qsize(self):
        """Return the backing queue's ``qsize()``."""
        backing = self._lifoqueue
        return backing.qsize()

    def __iter__(self):
        """Yield the stored entries in backing-list order."""
        entries = self._lifoqueue.queue
        for entry in entries:
            yield entry
Example #2
0
class UIDStack(object):
    """Convenience layer over gevent.queue.LifoQueue.

    Entries are ``(uid, metadata)`` tuples; ``metadata`` may be None.
    """

    def __init__(self):
        """Set up an empty backing LifoQueue."""
        self._lifoqueue = LifoQueue()

    def empty(self):
        """Delegate to the backing queue's ``empty()``."""
        is_empty = self._lifoqueue.empty()
        return is_empty

    def get(self):
        """Delegate to the backing queue's ``get_nowait()``."""
        entry = self._lifoqueue.get_nowait()
        return entry

    def peek(self):
        """Return the entry at index -1 of the backing list, leaving it there.

        LifoQueue.peek_nowait() would be the natural call, but it is buggy
        in the current gevent; revisit once on gevent version 1.0.2.
        """
        backing_list = self._lifoqueue.queue
        last = backing_list[-1]
        return last

    def put(self, uid, metadata):
        """Add ``(uid, metadata)`` to the backing queue."""
        entry = (uid, metadata)
        self._lifoqueue.put(entry)

    def discard(self, objects):
        """Remove all entries that are members of ``objects``.

        A fresh list of the remaining entries (order preserved) replaces the
        backing list.
        """
        remaining = []
        for candidate in self._lifoqueue.queue:
            if candidate not in objects:
                remaining.append(candidate)
        self._lifoqueue.queue = remaining

    def qsize(self):
        """Delegate to the backing queue's ``qsize()``."""
        size = self._lifoqueue.qsize()
        return size

    def __iter__(self):
        """Iterate over the entries in backing-list order."""
        for candidate in self._lifoqueue.queue:
            yield candidate
Example #3
0
class Stack(object):
    """Convenience layer over gevent.queue.LifoQueue with a sort key.

    ``key`` is the sort-key callable used whenever several elements are
    loaded at once (at construction and in ``update_from``).
    """

    def __init__(self, key, initial_elements=None):
        """Store ``key`` and optionally preload the stack.

        :param key: callable passed as the ``key`` argument when sorting.
        :param initial_elements: optional iterable; when given, its elements
            are sorted by ``key`` and become the backing list.
        """
        self.key = key
        self._lifoqueue = LifoQueue()
        if initial_elements is None:
            return
        preload = list(initial_elements)
        preload.sort(key=self.key)
        self._lifoqueue.queue = preload

    def empty(self):
        """Delegate to the backing queue's ``empty()``."""
        backing = self._lifoqueue
        return backing.empty()

    def get(self):
        """Delegate to the backing queue's ``get_nowait()``."""
        backing = self._lifoqueue
        return backing.get_nowait()

    def peek(self):
        """Return the last element of the backing list without removing it.

        Substitute for LifoQueue.peek_nowait(), which is buggy in the gevent
        release in use; can be replaced with gevent version 1.0.2.
        """
        elements = self._lifoqueue.queue
        return elements[-1]

    def put(self, obj):
        """Push a single object."""
        backing = self._lifoqueue
        backing.put(obj)

    def update_from(self, objects):
        """Push every element of ``objects``, in ascending ``key`` order."""
        ordered = sorted(objects, key=self.key)
        for element in ordered:
            self._lifoqueue.put(element)

    def discard(self, objects):
        """Remove every stored element that is a member of ``objects``."""
        kept = []
        for element in self._lifoqueue.queue:
            if element in objects:
                continue
            kept.append(element)
        self._lifoqueue.queue = kept

    def qsize(self):
        """Delegate to the backing queue's ``qsize()``."""
        backing = self._lifoqueue
        return backing.qsize()

    def __iter__(self):
        """Yield the stored elements in backing-list order."""
        elements = self._lifoqueue.queue
        for element in elements:
            yield element
Example #4
0
class ClientPool(object):
    """Fixed-size pool of lazily created clients stored in a LifoQueue.

    The pool is pre-filled with ``pool_size`` empty ``ClientHolder`` objects.
    A real client is only created (via ``client_class(*client_args,
    **client_kwargs)``) the first time a holder without a client is handed
    out. A scheduled task periodically closes clients whose last access is
    older than the expire time.
    """
    # Seconds since last access after which an idle pooled client is closed.
    DEFAULT_CLIENT_EXPIRE_TIME = 300
    # Seconds between two runs of the expire-check task.
    DEFAULT_CLOSE_EXPIRE_CLIENT_INTERVAL = 60

    def __init__(self, pool_name, pool_size, client_class,
                 close_client_handler, *client_args, **client_kwargs):
        """Build the pool and start its expire-check task.

        :param pool_name: name used in log messages and the task name.
        :param pool_size: number of holders in the pool; must be > 0.
        :param client_class: callable that creates a client.
        :param close_client_handler: callable taking a client and closing
            it, or None when clients need no explicit close.
        :param client_args: positional arguments for ``client_class``.
        :param client_kwargs: keyword arguments for ``client_class``.
        """
        assert pool_size > 0
        assert client_class is not None and hasattr(client_class, '__call__')
        assert close_client_handler is None or hasattr(close_client_handler,
                                                       '__call__')
        self._pool_name = pool_name
        self._pool_size = pool_size
        self._client_class = client_class
        self._close_client_handler = close_client_handler
        self._client_args = client_args
        self._client_kwargs = client_kwargs
        self._queue = LifoQueue(maxsize=pool_size)
        for _ in range(pool_size):
            self._queue.put(ClientHolder())
        self._client_expire_time = self.DEFAULT_CLIENT_EXPIRE_TIME
        self._gc_task = ScheduleTask(
            name='ClientPool-GC-%s' % pool_name,
            start_after_seconds=0,
            interval_seconds=self.DEFAULT_CLOSE_EXPIRE_CLIENT_INTERVAL,
            handler=self._close_expire_client)
        self._gc_task.run()

    def __del__(self):
        """Stop the expire-check task, if it was ever created."""
        # __init__ can fail (e.g. on an assertion) before _gc_task exists;
        # __del__ still runs on the half-built object in that case.
        gc_task = getattr(self, '_gc_task', None)
        if gc_task is not None:
            gc_task.stop()

    @contextmanager
    def get_client(self,
                   block=True,
                   pool_acquire_client_timeout=1,
                   req_timeout=5):
        """Context manager that lends out one client from the pool.

        :param block: passed to the queue ``get`` when acquiring a holder.
        :param pool_acquire_client_timeout: timeout for acquiring a holder
            (and for creating the client if the holder is empty).
        :param req_timeout: seconds for the gevent.Timeout started around
            the ``with`` body.

        Yields the holder's client. If the body raises anything (including
        the timeout), the error is logged, the client is closed and the
        exception is re-raised. The holder is always pushed back.
        """
        client_holder = self._get_client(block, pool_acquire_client_timeout)
        tm = None
        try:
            tm = gevent.Timeout.start_new(req_timeout)
            yield client_holder.get_client()
        except BaseException as e:
            # NOTE: this message is emitted for every exception raised in
            # the with-body, not only for the request timeout.
            logger.error(
                'Client is out pool for too long %s seconds, raise exception: %s',
                req_timeout, e)
            self._close_client(client_holder)
            raise
        finally:
            if tm:
                tm.cancel()
            self.push(client_holder)

    def _get_client(self, block=True, timeout=1):
        """Take a holder from the queue, creating its client if missing.

        :param block: passed to the queue ``get``.
        :param timeout: timeout for the queue ``get`` and, separately, for
            the gevent.Timeout guarding client creation.
        :return: a holder with a client set and its access time refreshed.

        If client creation raises, the holder is emptied, pushed back into
        the pool and the exception is re-raised.
        """
        if self.is_empty():
            logger.info('ClientPool: %s is empty.', self._pool_name)
        client_holder = self._queue.get(block=block, timeout=timeout)
        if client_holder.get_client() is None:
            tm = None
            try:
                tm = gevent.Timeout.start_new(timeout)
                client_holder.set_client(self._create_client())
            except BaseException:
                client_holder.set_client(None)
                self.push(client_holder)
                raise
            finally:
                if tm:
                    tm.cancel()
        client_holder.set_access_time(time.time())
        return client_holder

    def push(self, client_holder):
        """Return a holder to the pool; silently dropped if already full."""
        if not self.is_full():
            self._queue.put_nowait(client_holder)

    def is_full(self):
        """Return True when the queue holds at least ``pool_size`` holders."""
        return self._queue.qsize() >= self._pool_size

    def is_empty(self):
        """Return True when no holder is currently in the queue."""
        return self._queue.qsize() <= 0

    def _create_client(self):
        """Instantiate a client from the stored class and arguments."""
        return self._client_class(*self._client_args, **self._client_kwargs)

    def _close_client(self, client_holder):
        """Close the holder's client (best effort) and empty the holder.

        The close handler is only called when one is configured and the
        holder actually has a client; handler errors are logged, not raised.
        """
        if self._close_client_handler and client_holder.get_client():
            try:
                self._close_client_handler(client_holder.get_client())
            except Exception as e:
                logger.error('Close client raise exception: %s', e)
        client_holder.set_client(None)

    def _close_expire_client(self):
        """Close every pooled client not accessed within the expire time.

        Runs as the handler of the scheduled task. Expired holders are
        collected first and then closed through ``_close_client`` so that
        the close handler receives the client object itself, a missing
        handler is tolerated, and the holder is emptied (a new client is
        created the next time it is handed out).
        """
        cur_time = time.time()
        expired_holders = [
            client_holder for client_holder in self._queue.queue
            if client_holder.get_client()
            and cur_time - client_holder.get_access_time() >
            self._client_expire_time
        ]

        for client_holder in expired_holders:
            self._close_client(client_holder)
Example #5
0
class BaseProcessor(LoggerMixin):
    """Base class that feeds callables to a set of workers through a queue.

    ``run()`` starts the workers and a progress-reporting greenlet, asks the
    subclass to enqueue work through ``populate_tasks()``, then waits until
    the queue is empty and every worker reports idle.
    """
    name = 'base-processor'

    @classmethod
    def from_engine(cls, engine, *args, **kwargs):
        """Alternate constructor: build an instance of ``cls`` for ``engine``."""
        processor = cls(engine, *args, **kwargs)
        return processor

    def _request(self):
        """Getter behind the ``request`` property: the engine's ``request``."""
        engine = self.engine
        return engine.request

    request = property(_request)

    def __init__(self, engine, *args, **kwargs):
        """Initialise counters, the task queue and the concurrency setting.

        :param engine: object stored as ``self.engine``.
        :param args: accepted but not used here.
        :param kwargs: accepted but not used here.

        Reads an optional ``--concur`` integer from the command line; when
        given, it overwrites ``dhaulagiri_settings['core']['concur']``.
        """
        from time import time
        from hashlib import md5
        from threading import Lock
        from gevent.queue import LifoQueue

        # Class-level name plus six hex digits derived from the current time.
        stamp_digest = md5(str(time())).hexdigest()
        self.processor_name = '%s:%s' % (self.name, stamp_digest[:6])

        LoggerMixin.__init__(self)

        self.engine = engine

        self.__redis = None
        self.redis_lock = Lock()

        # Items processed so far / total item count, used in progress logs.
        self.progress = 0
        self.total = 0
        # Number of bypassed items, reported at the end of run().
        self.bypassed_cnt = 0

        # Once the queue holds more than this many tasks, add_task waits
        # before enqueueing more.
        self.maxsize = 1000
        self.tasks = LifoQueue()
        self.workers = []

        # Polling interval in seconds (default: 1).
        self.polling_interval = 1

        import argparse

        cli = argparse.ArgumentParser()
        # Number of concurrent workers.
        cli.add_argument('--concur', type=int)
        options, _leftover = cli.parse_known_args()

        from core import dhaulagiri_settings

        if options.concur:
            dhaulagiri_settings['core']['concur'] = options.concur
        self.concur = dhaulagiri_settings['core']['concur']

        # Time and progress value captured by the previous progress report.
        self.checkpoint_ts = None
        self.checkpoint_prog = None
        self.init_ts = time()

        # Progress-reporting greenlet, created in _start_workers().
        self.heart_beat = None

        # Maps worker_name -> time of that worker's last status update
        # (see update_worker_status).
        self.worker_monitor = {}

    def update_worker_status(self, worker):
        """Record the current time as ``worker``'s latest status update.

        :param worker: object exposing ``worker_name``.
        :return: None
        """
        from time import time

        self.worker_monitor[worker.worker_name] = time()

    def get_worker_stat(self):
        """Split the monitored workers into active and zombie ones.

        A worker whose last update is older than 90 seconds counts as a
        zombie; all others are active.

        :return: ``{'zombie': {...}, 'active': {...}}``, each mapping worker
            name to its last update time.
        """
        from time import time

        stale_after = 90
        threshold = time() - stale_after

        active = {
            worker_name: seen
            for worker_name, seen in self.worker_monitor.items()
            if seen >= threshold
        }
        zombie = {
            worker_name: seen
            for worker_name, seen in self.worker_monitor.items()
            if seen < threshold
        }

        return {'zombie': zombie, 'active': active}

    def incr_progress(self):
        """Increase the progress counter by one."""
        self.progress = self.progress + 1

    def _start_workers(self):
        """Spawn the progress reporter, register signals, create workers."""
        def timer():
            """Log progress, rate and worker counts, then sleep 30 s; forever."""
            import time

            while True:
                report = 'Progress: %d / %d.' % (self.progress, self.total)
                now = time.time()

                have_checkpoint = not (self.checkpoint_prog is None or
                                       self.checkpoint_ts is None)
                if have_checkpoint:
                    done = self.progress - self.checkpoint_prog
                    elapsed = now - self.checkpoint_ts
                    rate = done / elapsed * 60
                    rate_text = 'Processing rate: %d items/min' % int(rate)
                    report = '%s %s' % (report, rate_text)

                self.checkpoint_ts = now
                self.checkpoint_prog = self.progress

                # Append the worker-monitor statistics.
                stat = self.get_worker_stat()
                worker_counts = (len(stat['active']), len(stat['zombie']))
                report += (', active workers: %d, zombie workers: %d' %
                           worker_counts)

                self.log(report)
                gevent.sleep(30)

        self.heart_beat = gevent.spawn(timer)

        # NOTE(review): SIGKILL normally cannot be caught by a process, and
        # gevent.kill is registered here without a greenlet argument;
        # confirm these registrations do what is intended.
        gevent.signal(signal.SIGKILL, gevent.kill)
        gevent.signal(signal.SIGQUIT, gevent.kill)

        for _ in xrange(self.concur):
            self.workers.append(Worker.from_processor(self, self.tasks))

    def add_task(self, task, *args, **kwargs):
        """Enqueue ``task(*args, **kwargs)`` as a zero-argument callable.

        Flow control: while the queue holds more than ``self.maxsize``
        tasks, sleep ``self.polling_interval`` seconds and check again. A
        truthy ``task_key`` attribute on ``task`` is copied onto the wrapper.
        """
        while self.tasks.qsize() > self.maxsize:
            gevent.sleep(self.polling_interval)

        func = lambda: task(*args, **kwargs)
        task_key = getattr(task, 'task_key', None)
        if task_key:
            func.task_key = task_key
        self.tasks.put(func, timeout=120)

        if task_key:
            key_label = '(%s)' % task_key
        else:
            key_label = ''
        self.logger.debug(
            'New task%s added to the queue. Remaining: %d' %
            (key_label, self.tasks.qsize()))
        # Yield to other greenlets.
        gevent.sleep(0)

    def _wait_for_workers(self):
        """Block until all work is done, then kill workers and reporter.

        Done means: the task queue is empty and every worker's ``idle`` is
        truthy. Until then, poll every ``self.polling_interval`` seconds.

        :return: None
        """
        while True:
            if self.tasks.empty() and all(w.idle for w in self.workers):
                break
            gevent.sleep(self.polling_interval)

        worker_greenlets = [w.gevent for w in self.workers]
        gevent.killall(worker_greenlets)
        gevent.kill(self.heart_beat)

    def run(self):
        """Start workers, populate tasks, wait for completion, log a summary."""
        self._start_workers()
        self.populate_tasks()
        self._wait_for_workers()

        import time

        summary = (
            'Processor ended: %d items processed(%d bypassed) in %d minutes' %
            (self.progress, self.bypassed_cnt,
             int((time.time() - self.init_ts) / 60.0)))
        self.log(summary)

    def populate_tasks(self):
        """Enqueue the work to process; must be provided by subclasses."""
        raise NotImplementedError