Example #1
def gmail_highestmodseq_update(crispin_client, db_session, log, folder_name,
                               uids, local_uids, status_cb, syncmanager_lock):
    g_metadata = crispin_client.g_metadata(uids)
    to_download = deduplicate_message_download(
        crispin_client, db_session, log, syncmanager_lock, g_metadata, uids)
    if folder_name != crispin_client.folder_names()['all']:
        flags = crispin_client.flags(to_download)
        message_download_stack = LifoQueue()
        for uid in to_download:
            if uid in flags and uid in g_metadata:
                # IMAP will just return no data for a UID if it's disappeared
                # from the folder in the meantime.
                message_download_stack.put(GMessage(
                    uid, g_metadata[uid], flags[uid].flags, flags[uid].labels))
        download_queued_threads(crispin_client, db_session, log, folder_name,
                                message_download_stack, status_cb,
                                syncmanager_lock)
    else:
        uid_download_stack = uid_list_to_stack(to_download)

        download_queued_uids(crispin_client, db_session, log, folder_name,
                             uid_download_stack, 0, uid_download_stack.qsize(),
                             status_cb, syncmanager_lock,
                             gmail_download_and_commit_uids,
                             create_gmail_message)
Example #2
    def __init__(self,
                 size,
                 host,
                 port,
                 db=0,
                 passwd=None,
                 socket_timeout=None):
        """
    Args:
      size - Number of connections to maintain in the pool.
      host - The hostname to use for making connections.
      port - The port to use for making connections.
      db - The database number to connect to.
      passwd - The password to use for accessing the database.
      socket_timeout - The socket timeout value for connections.
    """
        self.size = size

        self.all = set()
        self.pool = LifoQueue(maxsize=self.size)

        for _ in xrange(self.size):
            connection = redis.Connection(host,
                                          port,
                                          db,
                                          passwd,
                                          socket_timeout,
                                          encoding='utf-8',
                                          encoding_errors='strict',
                                          parser_class=DefaultParser)
            self.all.add(connection)
            self.pool.put(connection)
Example #3
class UIDStack(object):
    """Thin convenience wrapper around gevent.queue.LifoQueue.
    Each entry in the stack is a pair (uid, metadata), where the metadata may
    be None."""
    def __init__(self):
        self._lifoqueue = LifoQueue()

    def empty(self):
        return self._lifoqueue.empty()

    def get(self):
        return self._lifoqueue.get_nowait()

    def peek(self):
        # This should be LifoQueue.peek_nowait(), which is currently buggy in
        # gevent. Can update with gevent version 1.0.2.
        return self._lifoqueue.queue[-1]

    def put(self, uid, metadata):
        self._lifoqueue.put((uid, metadata))

    def discard(self, objects):
        self._lifoqueue.queue = [item for item in self._lifoqueue.queue if item
                                 not in objects]

    def qsize(self):
        return self._lifoqueue.qsize()

    def __iter__(self):
        for item in self._lifoqueue.queue:
            yield item
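
A quick usage sketch for the wrapper above; the UIDs and metadata are made-up values, and it assumes the UIDStack class as defined here:

stack = UIDStack()
stack.put(101, None)
stack.put(102, {'flags': ('\\Seen',)})

assert stack.qsize() == 2
assert stack.peek() == (102, {'flags': ('\\Seen',)})  # newest entry stays on top
assert stack.get() == (102, {'flags': ('\\Seen',)})   # LIFO: most recently put pair first
stack.discard([(101, None)])                          # drop entries without popping them
assert stack.empty()
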
Example #4
def gmail_highestmodseq_update(crispin_client, log, folder_name, new_uids,
                               updated_uids, syncmanager_lock):
    uids = new_uids + updated_uids
    g_metadata = crispin_client.g_metadata(uids)
    to_download = deduplicate_message_download(
        crispin_client, log, syncmanager_lock, g_metadata, uids)

    if folder_name == crispin_client.folder_names()['inbox']:
        flags = crispin_client.flags(to_download)
        message_download_stack = LifoQueue()
        for uid in to_download:
            if uid in flags and uid in g_metadata:
                # IMAP will just return no data for a UID if it's disappeared
                # from the folder in the meantime.
                message_download_stack.put(GMessage(
                    uid, g_metadata[uid], flags[uid].flags, flags[uid].labels))
        download_queued_threads(crispin_client, log, folder_name,
                                message_download_stack, syncmanager_lock)
    elif folder_name in uid_download_folders(crispin_client):
        uid_download_stack = uid_list_to_stack(to_download)
        download_queued_uids(crispin_client, log, folder_name,
                             uid_download_stack, 0, uid_download_stack.qsize(),
                             syncmanager_lock, gmail_download_and_commit_uids,
                             create_gmail_message)
    else:
        raise MailsyncError(
            'Unknown Gmail sync folder: {}'.format(folder_name))
Example #5
class UIDStack(object):
    """Thin convenience wrapper around gevent.queue.LifoQueue.
    Each entry in the stack is a pair (uid, metadata), where the metadata may
    be None."""
    def __init__(self):
        self._lifoqueue = LifoQueue()

    def empty(self):
        return self._lifoqueue.empty()

    def get(self):
        return self._lifoqueue.get_nowait()

    def peek(self):
        # This should be LifoQueue.peek_nowait(), which is currently buggy in
        # gevent. Can update with gevent version 1.0.2.
        return self._lifoqueue.queue[-1]

    def put(self, uid, metadata):
        self._lifoqueue.put((uid, metadata))

    def discard(self, objects):
        self._lifoqueue.queue = [
            item for item in self._lifoqueue.queue if item not in objects
        ]

    def qsize(self):
        return self._lifoqueue.qsize()

    def __iter__(self):
        for item in self._lifoqueue.queue:
            yield item
Example #6
class RedisConnectionPool(object):
    """Pool of Redis Connections that uses a gevent LifoQueue to block when a
  resource is not available.
  """
    def __init__(self,
                 size,
                 host,
                 port,
                 db=0,
                 passwd=None,
                 socket_timeout=None):
        """
    Args:
      size - Number of connections to maintain in the pool.
      host - The hostname to use for making connections.
      port - The port to use for making connections.
      db - The database number to connect to.
      passwd - The password to use for accessing the database.
      socket_timeout - The socket timeout value for connections.
    """
        self.size = size

        self.all = set()
        self.pool = LifoQueue(maxsize=self.size)

        for _ in xrange(self.size):
            connection = redis.Connection(host,
                                          port,
                                          db,
                                          passwd,
                                          socket_timeout,
                                          encoding='utf-8',
                                          encoding_errors='strict',
                                          parser_class=DefaultParser)
            self.all.add(connection)
            self.pool.put(connection)

    def get_connection(self, command_name, *keys, **options):
        """Get a connection from the pool. If no connection is available, this call
    will block.
    """
        return self.pool.get(timeout=60)

    def release(self, connection):
        """Return a connection to the pool.
    """
        if connection not in self.all:
            raise ValueError()

        self.pool.put(connection)

    def disconnect(self):
        """Close all the connections managed by this pool.
    """
        for connection in self.all:
            connection.disconnect()
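
A hedged usage sketch for the pool above. The host, port, and size are placeholder values, and send_command/read_response are redis-py's low-level Connection API:

pool = RedisConnectionPool(size=4, host='localhost', port=6379)

connection = pool.get_connection('PING')  # blocks (up to 60s) until a connection is free
try:
    connection.send_command('PING')
    print(connection.read_response())     # expected: PONG
finally:
    pool.release(connection)              # always hand the connection back

pool.disconnect()                         # close every connection when shutting down
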
Example #7
    def __init__(self, engine, *args, **kwargs):
        from time import time
        from hashlib import md5
        from threading import Lock
        from gevent.queue import LifoQueue

        self.processor_name = '%s:%s' % (self.name, md5(str(
            time())).hexdigest()[:6])

        LoggerMixin.__init__(self)

        self.engine = engine

        self.__redis = None
        self.redis_lock = Lock()

        self.progress = 0

        self.total = 0
        # Count of bypassed (ignored) items
        self.bypassed_cnt = 0

        # When the queue exceeds this limit, add_task pauses before adding more tasks
        self.maxsize = 1000
        self.tasks = LifoQueue()
        self.workers = []

        # Default polling interval is 1 second
        self.polling_interval = 1

        import argparse

        arg_parser = argparse.ArgumentParser()
        # Concurrency level (number of workers)
        arg_parser.add_argument('--concur', type=int)
        args, leftover = arg_parser.parse_known_args()

        from core import dhaulagiri_settings

        if args.concur:
            dhaulagiri_settings['core']['concur'] = args.concur
        self.concur = dhaulagiri_settings['core']['concur']

        self.checkpoint_ts = None
        self.checkpoint_prog = None
        self.init_ts = time()

        # Heartbeat task
        self.heart_beat = None

        # Worker monitor: each worker updates its status here at the start of every loop iteration
        self.worker_monitor = {}
Example #8
class RedisConnectionPool(object):
  """Pool of Redis Connections that uses a gevent LifoQueue to block when a
  resource is not available.
  """

  def __init__(self, size, host, port, db=0, passwd=None, socket_timeout=None):
    """
    Args:
      size - Number of connections to maintain in the pool.
      host - The hostname to use for making connections.
      port - The port to use for making connections.
      db - The database number to connect to.
      passwd - The password to use for accessing the database.
      socket_timeout - The socket timeout value for connections.
    """
    self.size = size

    self.all = set()
    self.pool = LifoQueue(maxsize=self.size)

    for _ in xrange(self.size):
      connection = redis.Connection(
          host, port, db, passwd,
          socket_timeout,
          encoding='utf-8',
          encoding_errors='strict',
          parser_class=DefaultParser)
      self.all.add(connection)
      self.pool.put(connection)

  def get_connection(self, command_name, *keys, **options):
    """Get a connection from the pool. If no connection is available, this call
    will block.
    """
    return self.pool.get(timeout=60)

  def release(self, connection):
    """Return a connection to the pool.
    """
    if connection not in self.all:
      raise ValueError()

    self.pool.put(connection)

  def disconnect(self):
    """Close all the connections managed by this pool.
    """
    for connection in self.all:
      connection.disconnect()
Example #9
def base_initial_sync(crispin_client, log, folder_name, shared_state,
                      initial_sync_fn, msg_create_fn):
    """ Downloads entire messages.

    This function may be retried as many times as you like; it will pick up
    where it left off, delete removed messages if things disappear between
    restarts, and only complete once we have all the UIDs in the given folder
    locally.

    This function also starts up a secondary greenlet that checks for new
    messages periodically, to deal with the case of very large folders---it's
    a bad experience for the user to keep receiving old mail but not receive
    new mail! We use a LIFO queue to make sure we're downloading newest mail
    first.
    """
    log.info('starting initial sync')

    uid_download_stack = LifoQueue()

    crispin_client.select_folder(folder_name,
                                 uidvalidity_cb(crispin_client.account_id))

    with session_scope(ignore_soft_deletes=False) as db_session:
        local_uids = account.all_uids(crispin_client.account_id, db_session,
                                      folder_name)

    initial_sync_fn(crispin_client, log, folder_name, shared_state, local_uids,
                    uid_download_stack, msg_create_fn)

    return 'poll'
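
A toy sketch of the pattern the docstring describes: a poller greenlet feeds newly discovered UIDs onto a shared LIFO stack while a downloader drains it, so the newest UIDs come off first. The names and UIDs are illustrative, not the real sync code:

import gevent
from gevent.queue import LifoQueue

uid_download_stack = LifoQueue()

def poller():
    for uid in (101, 102, 103):           # pretend these UIDs are discovered over time
        uid_download_stack.put(uid)
    gevent.sleep(0)                       # hand control to the downloader

def downloader():
    for _ in range(3):
        uid = uid_download_stack.get()    # blocks until a UID is available
        print('downloading UID', uid)     # prints 103, 102, 101: newest first

gevent.joinall([gevent.spawn(poller), gevent.spawn(downloader)])
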
Example #10
 def __init__(self, pool_name, pool_size, client_class,
              close_client_handler, *client_args, **client_kwargs):
     assert pool_size > 0
     assert client_class is not None and hasattr(client_class, '__call__')
     assert close_client_handler is None or hasattr(close_client_handler,
                                                    '__call__')
     self._pool_name = pool_name
     self._pool_size = pool_size
     self._client_class = client_class
     self._close_client_handler = close_client_handler
     self._client_args = client_args
     self._client_kwargs = client_kwargs
     self._queue = LifoQueue(maxsize=pool_size)
     for i in range(pool_size):
         self._queue.put(ClientHolder())
     self._client_expire_time = self.DEFAULT_CLIENT_EXPIRE_TIME
     self._gc_task = ScheduleTask(
         name='ClientPool-GC-%s' % pool_name,
         start_after_seconds=0,
         interval_seconds=self.DEFAULT_CLOSE_EXPIRE_CLIENT_INTERVAL,
         handler=self._close_expire_client)
     self._gc_task.run()
Example #11
    def initial_sync(self):
        with self.conn_pool.get() as crispin_client:
            uid_download_stack = LifoQueue()
            crispin_client.select_folder(
                self.folder_name, uidvalidity_cb(crispin_client.account_id))

            with mailsync_session_scope() as db_session:
                local_uids = common.all_uids(crispin_client.account_id,
                                             db_session, self.folder_name)

            self.initial_sync_impl(crispin_client, local_uids,
                                   uid_download_stack)
        return 'poll'
Example #12
    def highestmodseq_callback(self, crispin_client, new_uids, updated_uids):
        uids = new_uids + updated_uids
        g_metadata = crispin_client.g_metadata(uids)
        to_download = self.__deduplicate_message_download(
            crispin_client, g_metadata, uids)

        if self.folder_name == crispin_client.folder_names()['inbox']:
            flags = crispin_client.flags(to_download)
            message_download_stack = LifoQueue()
            for uid in to_download:
                if uid in flags and uid in g_metadata:
                    # IMAP will just return no data for a UID if it's
                    # disappeared from the folder in the meantime.
                    message_download_stack.put(
                        GMessage(uid, g_metadata[uid], flags[uid].flags,
                                 flags[uid].labels))
            self.__download_queued_threads(crispin_client,
                                           message_download_stack)
        elif self.folder_name in uid_download_folders(crispin_client):
            uid_download_stack = uid_list_to_stack(to_download)
            self.download_uids(crispin_client, uid_download_stack)
        else:
            raise MailsyncError('Unknown Gmail sync folder: {}'.format(
                self.folder_name))
Example #13
    def __init__(self, maxsize=100, maxwait=1.0, expires=None, cleanup=None):
        """
        The pool manages open connections to the database. The main strategy is to keep the smallest
        number of live connections required for good web service performance.
        In most cases connections are taken from the pool. During peaks in traffic, the pool creates
        some extra connections to keep the service available. During periods of low traffic (e.g. at
        night), unnecessary connections are released.

        Parameters
        ----------
        maxsize : int
                  Soft limit on the number of created connections. After reaching this limit,
                  taking the next connection first waits `maxwait` seconds for a returned slot.
        maxwait : float
                  The time in seconds to wait for a returned slot before creating a new
                  connection once the pool becomes empty. It may be 0, in which case new
                  connections are created immediately until `maxoverflow` is reached.
        expires : float
                  The time in seconds indicating how long a connection should stay alive.
                  It is also used to close unneeded slots.
        """
        if not isinstance(maxsize, integer_types):
            raise TypeError('Expected integer, got %r' % (maxsize, ))

        self._maxsize = maxsize
        self._maxwait = maxwait
        self._expires = expires
        self._cleanup = cleanup
        self._created_at = {}
        self._latest_use = {}
        self._pool = LifoQueue()
        self._size = 0
        self._latest_cleanup = 0 if self._expires or self._cleanup else 0xffffffffffffffff
        self._interval_cleanup = min(
            self._expires or self._cleanup, self._cleanup
            or self._expires) if self._expires or self._cleanup else 0
        self._cleanup_lock = Semaphore(value=1)
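
The constructor above only sets up the bookkeeping. A minimal sketch of the checkout strategy the docstring describes (wait up to `maxwait` for a returned slot, otherwise open a new connection) might look like the following; this is not the project's actual method, and `connect` is a caller-supplied factory:

from gevent.queue import LifoQueue, Empty

def checkout(pool_queue, maxwait, connect):
    """Reuse a pooled connection if one returns within `maxwait` seconds,
    otherwise fall back to opening a fresh one via connect()."""
    try:
        return pool_queue.get(timeout=maxwait)
    except Empty:
        return connect()

pool_queue = LifoQueue()
pool_queue.put('conn-1')                                  # a stand-in for a DB connection
print(checkout(pool_queue, 0.1, lambda: 'fresh-conn'))    # -> conn-1 (reused)
print(checkout(pool_queue, 0.1, lambda: 'fresh-conn'))    # -> fresh-conn (pool was empty)
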
Example #14
    def highestmodseq_callback(self, crispin_client, new_uids, updated_uids):
        uids = new_uids + updated_uids
        g_metadata = crispin_client.g_metadata(uids)
        to_download = self.__deduplicate_message_download(
            crispin_client, g_metadata, uids)

        if self.folder_name == crispin_client.folder_names()['inbox']:
            flags = crispin_client.flags(to_download)
            message_download_stack = LifoQueue()
            for uid in to_download:
                if uid in flags and uid in g_metadata:
                    # IMAP will just return no data for a UID if it's
                    # disappeared from the folder in the meantime.
                    message_download_stack.put(GMessage(
                        uid, g_metadata[uid], flags[uid].flags,
                        flags[uid].labels))
            self.__download_queued_threads(crispin_client,
                                           message_download_stack)
        elif self.folder_name in uid_download_folders(crispin_client):
            uid_download_stack = uid_list_to_stack(to_download)
            self.download_uids(crispin_client, uid_download_stack)
        else:
            raise MailsyncError(
                'Unknown Gmail sync folder: {}'.format(self.folder_name))
Example #15
  def __init__(self,
      service_name,
      sentinel_manager,
      pool_size=8,
      tab_prefix='redis_bsmg_pool',
      connection_class=SentinelManagedConnection,
      connection_kwargs={},
      sentinel_check_connections=False,
      is_master=True):
    """
    Args:
      service_name - Name of the Sentinel service name to connect to.
      sentinel_manager - Sentinel manager object.
      pool_size - Number of connections to maintain in the pool.
      tab_prefix - Tab name prefix for Tabs recorded by this class.
      connection_class - Class to use for creating connections. Must be a
          sub-class of (or be API compatible with) SentinelManagedConnection.
      connection_kwargs - Keyword arguments to pass through to connection
          constructor.
      sentinel_check_connections - Whether to enable Sentinel connection
          checking on establishing each connection.
      is_master - Always True. Included to match SentinelConnectionPool API.
    """
    self.service_name = service_name
    self.sentinel_manager = sentinel_manager
    self.pool_size = pool_size
    self.tab_prefix = tab_prefix
    self.conn_class = connection_class
    self.conn_kwargs = connection_kwargs

    # Sentinel connection pool API member variables.
    self.is_master = is_master
    self.check_connection = sentinel_check_connections
    self.master_address = None

    # Actual pool containers.
    self.closed = False
    self.all = set()
    self.pool = LifoQueue(maxsize=self.pool_size)

    # Initialize the pool.
    for _ in xrange(self.pool_size):
      conn = self.conn_class(
          connection_pool=weakref.proxy(self),
          **connection_kwargs)
      self.all.add(conn)
      self.pool.put(conn)
Example #16
  def __init__(self, size, host, port, db=0, passwd=None, socket_timeout=None):
    """
    Args:
      size - Number of connections to maintain in the pool.
      host - The hostname to use for making connections.
      port - The port to use for making connections.
      db - The database number to connect to.
      passwd - The password to use for accessing the database.
      socket_timeout - The socket timeout value for connections.
    """
    self.size = size

    self.all = set()
    self.pool = LifoQueue(maxsize=self.size)

    for _ in xrange(self.size):
      connection = redis.Connection(
          host, port, db, passwd,
          socket_timeout,
          encoding='utf-8',
          encoding_errors='strict',
          parser_class=DefaultParser)
      self.all.add(connection)
      self.pool.put(connection)
Example #17
class Stack(object):
    """Thin convenience wrapper around gevent.queue.LifoQueue."""
    def __init__(self, key, initial_elements=None):
        self.key = key
        self._lifoqueue = LifoQueue()
        if initial_elements is not None:
            self._lifoqueue.queue = sorted(list(initial_elements),
                                           key=self.key)

    def empty(self):
        return self._lifoqueue.empty()

    def get(self):
        return self._lifoqueue.get_nowait()

    def peek(self):
        # This should be LifoQueue.peek_nowait(), which is currently buggy in
        # gevent. Can update with gevent version 1.0.2.
        return self._lifoqueue.queue[-1]

    def put(self, obj):
        self._lifoqueue.put(obj)

    def update_from(self, objects):
        for obj in sorted(list(objects), key=self.key):
            self._lifoqueue.put(obj)

    def discard(self, objects):
        self._lifoqueue.queue = [item for item in self._lifoqueue.queue if item
                                 not in objects]

    def qsize(self):
        return self._lifoqueue.qsize()

    def __iter__(self):
        for item in self._lifoqueue.queue:
            yield item
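
An illustrative use of the key-sorted Stack above (the pairs are made up): because the initial elements and each update_from() batch are sorted by `key` before being pushed, the highest-keyed item of the most recent batch is always popped first:

stack = Stack(key=lambda pair: pair[0],
              initial_elements=[(5, 'e'), (2, 'b'), (9, 'j')])

assert stack.peek() == (9, 'j')      # highest key of the initial batch sits on top
stack.update_from([(7, 'g'), (1, 'a')])
assert stack.get() == (7, 'g')       # the new batch is stacked on top, highest key first
assert stack.get() == (1, 'a')
assert stack.get() == (9, 'j')       # then we fall back to the original items
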
Example #18
 def __init__(self, key, initial_elements=None):
     self.key = key
     self._lifoqueue = LifoQueue()
     if initial_elements is not None:
         self._lifoqueue.queue = sorted(list(initial_elements),
                                        key=self.key)
Example #19
class BlockingSentinelMasterGeventConnectionPool(object):
  """Blocking, Sentinel enabled Redis connection pool.

  We use this instead of the built-in connection pool in redis-py, because the
  built-in one for sentinel does not support a blocking implementation, which
  Taba relies on. (There is a blocking connection pool in redis-py, but it is
  incompatible with sentinel connections).
  """

  # Timeout, in seconds, when trying to retrieve a connection from the
  # redis connection pool. This is set to infinite (i.e. a worker will wait
  # indefinitely for a connection to become available). Any actual remote
  # failure should be caught and surfaced by the socket timeout.
  GET_CONNECTION_TIMEOUT = None

  def __init__(self,
      service_name,
      sentinel_manager,
      pool_size=8,
      tab_prefix='redis_bsmg_pool',
      connection_class=SentinelManagedConnection,
      connection_kwargs={},
      sentinel_check_connections=False,
      is_master=True):
    """
    Args:
      service_name - Name of the Sentinel service name to connect to.
      sentinel_manager - Sentinel manager object.
      pool_size - Number of connections to maintain in the pool.
      tab_prefix - Tab name prefix for Tabs recorded by this class.
      connection_class - Class to use for creating connections. Must be a
          sub-class of (or be API compatible with) SentinelManagedConnection.
      connection_kwargs - Keyword arguments to pass through to connection
          constructor.
      sentinel_check_connections - Whether to enable Sentinel connection
          checking on establishing each connection.
      is_master - Always True. Included to match SentinelConnectionPool API.
    """
    self.service_name = service_name
    self.sentinel_manager = sentinel_manager
    self.pool_size = pool_size
    self.tab_prefix = tab_prefix
    self.conn_class = connection_class
    self.conn_kwargs = connection_kwargs

    # Sentinel connection pool API member variables.
    self.is_master = is_master
    self.check_connection = sentinel_check_connections
    self.master_address = None

    # Actual pool containers.
    self.closed = False
    self.all = set()
    self.pool = LifoQueue(maxsize=self.pool_size)

    # Initialize the pool.
    for _ in xrange(self.pool_size):
      conn = self.conn_class(
          connection_pool=weakref.proxy(self),
          **connection_kwargs)
      self.all.add(conn)
      self.pool.put(conn)

  def __repr__(self):
    return "%s<%s|%s>" % (
        type(self).__name__,
        self.conn_class.__name__,
        self.conn_kwargs)

  #########################################################
  # Connection Pool API Methods
  #########################################################

  def get_connection(self, command_name, *keys, **options):
    """Get a connection from the pool.

    Args: Ignored. Included to match ConnectionPool API.
    """
    if self.closed:
      raise Empty()

    try:
      return self.pool.get(timeout=self.GET_CONNECTION_TIMEOUT)
    except Empty as e:
      client.Counter(self.tab_prefix + '_redis_conn_pool_get_conn_timeout')
      LOG.error('Cannot get connection for service %s' % self.service_name)
      raise e

  def release(self, connection):
    """Releases the connection back to the pool

    Args:
      connection - Connection to put back in the pool. Must have been initially
          taken from this pool.
    """
    if connection not in self.all:
      raise ValueError()
    self.pool.put(connection)

  def disconnect(self):
    """Disconnects all connections in the pool."""
    for conn in self.all:
      conn.disconnect()

  def shutdown(self):
    """Close the pool and disconnect all connections.
    """
    self.closed = True
    try:
      # Wait for all the connections to finish and get returned to the pool.
      def _wait_ready():
        while not self.pool.full():
          time.sleep(0.5)
      thread_util.PerformOperationWithTimeout(30, _wait_ready)

    except Exception as e:
      LOG.error(e)
    finally:
      # Disconnect anyway.
      self.disconnect()

  #########################################################
  # Sentinel Pool API Methods
  #########################################################

  def get_master_address(self):
    """SentinelConnectionPool API compatibility. Get the connection information
    to the service master.

    Returns:
      Tuple of (Master Hostname, Master Port)
    """
    master_address = self.sentinel_manager.discover_master(self.service_name)

    if self.master_address is None:
      self.master_address = master_address
    elif master_address != self.master_address:
      # Master address changed. Reset all connections.
      self.disconnect()

    return master_address

  def rotate_slaves(self):
    """SentinelConnectionPool API compatibility. Not implemented.
    """
    pass
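
A hedged sketch of wiring this pool into redis-py. The sentinel address and the service name 'mymaster' are placeholders, and it assumes redis-py's Sentinel class plus the SentinelManagedConnection default used above:

import redis
from redis.sentinel import Sentinel

sentinel = Sentinel([('localhost', 26379)], socket_timeout=1.0)
pool = BlockingSentinelMasterGeventConnectionPool(
    service_name='mymaster',
    sentinel_manager=sentinel,
    pool_size=4,
    connection_kwargs={'db': 0, 'socket_timeout': 1.0})

client = redis.StrictRedis(connection_pool=pool)  # redis-py calls get_connection()/release()
client.set('taba:example', '1')
pool.shutdown()                                   # wait for outstanding connections, then close
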
Example #20
def _migrator_with_worker_pool(migrator, reindexer, iterable, max_retry,
                               num_workers):
    """Migrate in parallel with worker pool

    When running in steady state, failed doc will be retried up to the
    max retry limit. Documents awaiting retry and all documents that
    started the migration process but did not finish will be saved and
    retried on the next run if the migration is stopped before it
    completes.
    """
    def work_on(doc, key, retry_count):
        try:
            ok = migrator.migrate(doc)
            assert ok, "run_with_worker_pool expects success!"
        except Exception:
            err = traceback.format_exc().strip()
            print("Error processing blob:\n{}".format(err))
            if retry_count < max_retry:
                print("will retry {}".format(key))
                retry_blobs[key] += 1
                queue.put(doc)
                return
            migrator.save_backup(doc, "too many retries")
            print("too many retries {}".format(key))
        retry_blobs.pop(key, None)

    def retry_loop():
        for doc in queue:
            enqueue_doc(doc)

    def enqueue_doc(doc):
        key = reindexer.get_key(doc)
        retry_count = retry_blobs.setdefault(key, 0)
        # pool.spawn will block until a worker is available
        pool.spawn(work_on, doc, key, retry_count)
        # Returning True here means the underlying iterator will think
        # this doc has been processed successfully. Therefore we must
        # process this doc before the process exits or save it to be
        # processed on the next run.
        return True

    queue = LifoQueue()
    loop = gevent.spawn(retry_loop)
    pool = Pool(size=num_workers)

    class gmigrator:
        migrate = staticmethod(enqueue_doc)

    with migrator:
        retry_blobs = iterable.get_iterator_detail("retry_blobs") or {}
        for key in list(retry_blobs):
            queue.put(reindexer.load(key))
        try:
            yield gmigrator
        finally:
            try:
                print("waiting for workers to stop... (Ctrl+C to abort)")
                queue.put(StopIteration)
                loop.join()
                while not pool.join(timeout=10):
                    print("waiting for {} workers to stop...".format(
                        len(pool)))
            finally:
                iterable.set_iterator_detail("retry_blobs", retry_blobs)
                print("done.")
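
The retry loop above leans on a gevent queue behaviour worth calling out: iterating a queue blocks on get(), and putting the StopIteration class in acts as a sentinel that ends the iteration. A tiny standalone demonstration with made-up items follows; note that with a LIFO queue the sentinel must only be pushed once the real items have been drained, or it would be popped first:

import gevent
from gevent.queue import LifoQueue

queue = LifoQueue()

def consumer():
    for item in queue:          # ends once StopIteration is pulled off the queue
        print('got', item)      # LIFO: doc-2 is handled before doc-1

worker = gevent.spawn(consumer)
queue.put('doc-1')
queue.put('doc-2')
gevent.sleep(0)                 # let the consumer drain before the sentinel lands on top
queue.put(StopIteration)
worker.join()
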
Example #21
def gmail_initial_sync(crispin_client, log, folder_name, shared_state,
                       local_uids, uid_download_stack, msg_create_fn):
    remote_uid_count = len(set(crispin_client.all_uids()))
    remote_g_metadata, update_uid_count = get_g_metadata(
        crispin_client, log, folder_name, local_uids,
        shared_state['syncmanager_lock'])
    remote_uids = sorted(remote_g_metadata.keys(), key=int)
    log.info(remote_uid_count=len(remote_uids))
    if folder_name == crispin_client.folder_names()['all']:
        log.info(local_uid_count=len(local_uids))

    with shared_state['syncmanager_lock']:
        log.debug('gmail_initial_sync grabbed syncmanager_lock')
        with session_scope(ignore_soft_deletes=False) as db_session:
            deleted_uids = remove_deleted_uids(
                crispin_client.account_id, db_session, log, folder_name,
                local_uids, remote_uids)
            delete_uid_count = len(deleted_uids)

            local_uids = set(local_uids) - deleted_uids
            unknown_uids = set(remote_uids) - local_uids

            # Persist the num(messages) to sync (any type of sync: download,
            # update or delete) before we start.
            # Note that num_local_deleted, num_local_updated ARE the numbers to
            # delete/update too since we make those changes right away before we
            # start downloading messages.
            update_uid_counts(db_session, log, crispin_client.account_id,
                              folder_name, remote_uid_count=remote_uid_count,
                              download_uid_count=len(unknown_uids),
                              update_uid_count=update_uid_count,
                              delete_uid_count=delete_uid_count)

    if folder_name == crispin_client.folder_names()['inbox']:
        # We don't do an initial dedupe for Inbox because we do thread
        # expansion, which means even if we have a given msgid downloaded, we
        # miiight not have the whole thread. This means that restarts cause
        # duplicate work, but hopefully these folders aren't too huge.
        message_download_stack = LifoQueue()
        flags = crispin_client.flags(unknown_uids)
        for uid in unknown_uids:
            if uid in flags:
                message_download_stack.put(
                    GMessage(uid, remote_g_metadata[uid], flags[uid].flags,
                             flags[uid].labels))
        new_uid_poller = spawn(check_new_g_thrids, crispin_client.account_id,
                               crispin_client.PROVIDER, folder_name, log,
                               message_download_stack,
                               shared_state['poll_frequency'],
                               shared_state['syncmanager_lock'])
        download_queued_threads(crispin_client, log, folder_name,
                                message_download_stack,
                                shared_state['syncmanager_lock'])
    elif folder_name in uid_download_folders(crispin_client):
        full_download = deduplicate_message_download(
            crispin_client, log, shared_state['syncmanager_lock'],
            remote_g_metadata, unknown_uids)
        add_uids_to_stack(full_download, uid_download_stack)
        new_uid_poller = spawn(check_new_uids, crispin_client.account_id,
                               folder_name,
                               log, uid_download_stack,
                               shared_state['poll_frequency'],
                               shared_state['syncmanager_lock'])
        download_queued_uids(crispin_client, log, folder_name,
                             uid_download_stack, len(local_uids),
                             len(unknown_uids),
                             shared_state['syncmanager_lock'],
                             gmail_download_and_commit_uids, msg_create_fn)
    else:
        raise MailsyncError(
            'Unknown Gmail sync folder: {}'.format(folder_name))

    # Complete X-GM-MSGID mapping is no longer needed after initial sync.
    rm_cache(remote_g_metadata_cache_file(crispin_client.account_id,
                                          folder_name))

    new_uid_poller.kill()
Example #22
class BaseProcessor(LoggerMixin):
    name = 'base-processor'

    @classmethod
    def from_engine(cls, engine, *args, **kwargs):
        return cls(engine, *args, **kwargs)

    def _request(self):
        return self.engine.request

    request = property(_request)

    def __init__(self, engine, *args, **kwargs):
        from time import time
        from hashlib import md5
        from threading import Lock
        from gevent.queue import LifoQueue

        self.processor_name = '%s:%s' % (self.name, md5(str(
            time())).hexdigest()[:6])

        LoggerMixin.__init__(self)

        self.engine = engine

        self.__redis = None
        self.redis_lock = Lock()

        self.progress = 0

        self.total = 0
        # Count of bypassed (ignored) items
        self.bypassed_cnt = 0

        # When the queue exceeds this limit, add_task pauses before adding more tasks
        self.maxsize = 1000
        self.tasks = LifoQueue()
        self.workers = []

        # Default polling interval is 1 second
        self.polling_interval = 1

        import argparse

        arg_parser = argparse.ArgumentParser()
        # Concurrency level (number of workers)
        arg_parser.add_argument('--concur', type=int)
        args, leftover = arg_parser.parse_known_args()

        from core import dhaulagiri_settings

        if args.concur:
            dhaulagiri_settings['core']['concur'] = args.concur
        self.concur = dhaulagiri_settings['core']['concur']

        self.checkpoint_ts = None
        self.checkpoint_prog = None
        self.init_ts = time()

        # Heartbeat task
        self.heart_beat = None

        # Worker monitor: each worker updates its status here at the start of every loop iteration
        self.worker_monitor = {}

    def update_worker_status(self, worker):
        """
        Update the worker's status.
        :param worker:
        :return:
        """
        from time import time

        name = worker.worker_name
        self.worker_monitor[name] = time()

    def get_worker_stat(self):
        """
        Get the status of the worker queue.
        :return:
        """
        from time import time

        # A worker that has not reported status within the time window is considered a zombie
        time_window = 90

        cur = time()
        active = dict(
            filter(lambda item: item[1] >= cur - time_window,
                   self.worker_monitor.items()))
        zombie = dict(
            filter(lambda item: item[1] < cur - time_window,
                   self.worker_monitor.items()))

        return {'zombie': zombie, 'active': active}

    def incr_progress(self):
        self.progress += 1

    def _start_workers(self):
        def timer():
            """
            Runs every 30 seconds and logs the current progress.
            """
            import time

            while True:
                msg = 'Progress: %d / %d.' % (self.progress, self.total)
                cts = time.time()

                if self.checkpoint_prog is not None and self.checkpoint_ts is not None:
                    rate = (self.progress - self.checkpoint_prog) / (
                        cts - self.checkpoint_ts) * 60
                    msg = '%s %s' % (msg, 'Processing rate: %d items/min' %
                                     int(rate))

                self.checkpoint_ts = cts
                self.checkpoint_prog = self.progress

                # Collect worker-monitor statistics
                stat = self.get_worker_stat()
                msg += ', active workers: %d, zombie workers: %d' % (len(
                    stat['active']), len(stat['zombie']))

                self.log(msg)
                gevent.sleep(30)

        self.heart_beat = gevent.spawn(timer)

        # SIGKILL cannot be trapped; register handlers for catchable signals instead
        gevent.signal(signal.SIGTERM, gevent.kill)
        gevent.signal(signal.SIGQUIT, gevent.kill)

        for i in xrange(self.concur):
            worker = Worker.from_processor(self, self.tasks)
            self.workers.append(worker)

    def add_task(self, task, *args, **kwargs):
        # Whether flow control is enabled
        flow_control = True
        while flow_control:
            # If self.tasks already holds too many items, pause before adding more
            if self.tasks.qsize() > self.maxsize:
                gevent.sleep(self.polling_interval)
            else:
                break

        func = lambda: task(*args, **kwargs)
        task_key = getattr(task, 'task_key', None)
        if task_key:
            setattr(func, 'task_key', task_key)
        self.tasks.put(func, timeout=120)
        self.logger.debug(
            'New task%s added to the queue. Remaining: %d' %
            ('(%s)' % task_key if task_key else '', self.tasks.qsize()))
        gevent.sleep(0)

    def _wait_for_workers(self):
        """
        Wait for all workers to finish. Criterion: every worker is idle and the tasks queue is empty.
        :return:
        """
        while True:
            if not self.tasks.empty():
                gevent.sleep(self.polling_interval)
                continue

            completed = True
            for w in self.workers:
                if not w.idle:
                    gevent.sleep(self.polling_interval)
                    completed = False
                    break

            if completed:
                break

        gevent.killall([w.gevent for w in self.workers])
        gevent.kill(self.heart_beat)

    def run(self):
        self._start_workers()
        self.populate_tasks()
        self._wait_for_workers()

        import time

        self.log(
            'Processor ended: %d items processed(%d bypassed) in %d minutes' %
            (self.progress, self.bypassed_cnt,
             int((time.time() - self.init_ts) / 60.0)))

    def populate_tasks(self):
        raise NotImplementedError
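
A hedged sketch of how a concrete processor might plug into the machinery above: subclass BaseProcessor, implement populate_tasks() to enqueue work through add_task(), and call run(). The engine object, task payloads, and handler below are invented for illustration:

class EchoProcessor(BaseProcessor):
    name = 'echo-processor'

    def populate_tasks(self):
        def handle(item):
            self.log('processing %s' % item)
            self.incr_progress()

        for item in ('alpha', 'beta', 'gamma'):
            self.total += 1
            self.add_task(handle, item)

# processor = EchoProcessor.from_engine(engine)  # `engine` is supplied by the host application
# processor.run()
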
Example #23
def gmail_initial_sync(crispin_client, db_session, log, folder_name,
                       shared_state, local_uids, uid_download_stack,
                       msg_create_fn):
    remote_uid_count = len(set(crispin_client.all_uids()))
    remote_g_metadata, sync_info = get_g_metadata(
        crispin_client, db_session, log, folder_name, local_uids,
        shared_state['syncmanager_lock'])
    sync_type, update_uid_count = sync_info
    remote_uids = sorted(remote_g_metadata.keys(), key=int)
    log.info(remote_uid_count=len(remote_uids))
    if folder_name == crispin_client.folder_names()['all']:
        log.info(local_uid_count=len(local_uids))

    with shared_state['syncmanager_lock']:
        log.debug('gmail_initial_sync grabbed syncmanager_lock')
        deleted_uids = remove_deleted_uids(
            crispin_client.account_id, db_session, log, folder_name,
            local_uids, remote_uids)
    delete_uid_count = len(deleted_uids)

    local_uids = set(local_uids) - deleted_uids
    unknown_uids = set(remote_uids) - local_uids

    # Persist the num(messages) to sync (any type of sync: download,
    # update or delete) before we start.
    # Note that num_local_deleted, num_local_updated ARE the numbers to
    # delete/update too since we make those changes right away before we start
    # downloading messages.
    update_uid_counts(db_session, log, crispin_client.account_id, folder_name,
                      remote_uid_count=remote_uid_count,
                      download_uid_count=len(unknown_uids),
                      update_uid_count=update_uid_count,
                      delete_uid_count=delete_uid_count,
                      sync_type=sync_type)

    if folder_name == crispin_client.folder_names()['inbox']:
        # We don't do an initial dedupe for Inbox because we do thread
        # expansion, which means even if we have a given msgid downloaded, we
        # miiight not have the whole thread. This means that restarts cause
        # duplicate work, but hopefully these folders aren't too huge.
        message_download_stack = LifoQueue()
        flags = crispin_client.flags(unknown_uids)
        for uid in unknown_uids:
            if uid in flags:
                message_download_stack.put(
                    GMessage(uid, remote_g_metadata[uid], flags[uid].flags,
                             flags[uid].labels))
        new_uid_poller = spawn(check_new_g_thrids, crispin_client.account_id,
                               crispin_client.PROVIDER, folder_name, log,
                               message_download_stack,
                               shared_state['poll_frequency'],
                               shared_state['syncmanager_lock'])
        download_queued_threads(crispin_client, db_session, log, folder_name,
                                message_download_stack,
                                shared_state['syncmanager_lock'])
    elif folder_name in uid_download_folders(crispin_client):
        full_download = deduplicate_message_download(
            crispin_client, db_session, log, shared_state['syncmanager_lock'],
            remote_g_metadata, unknown_uids)
        add_uids_to_stack(full_download, uid_download_stack)
        new_uid_poller = spawn(check_new_uids, crispin_client.account_id,
                               crispin_client.PROVIDER, folder_name,
                               log, uid_download_stack,
                               shared_state['poll_frequency'],
                               shared_state['syncmanager_lock'])
        download_queued_uids(crispin_client, db_session, log, folder_name,
                             uid_download_stack, len(local_uids),
                             len(unknown_uids),
                             shared_state['syncmanager_lock'],
                             gmail_download_and_commit_uids, msg_create_fn)
    else:
        raise MailsyncError(
            'Unknown Gmail sync folder: {}'.format(folder_name))

    # Complete X-GM-MSGID mapping is no longer needed after initial sync.
    rm_cache(remote_g_metadata_cache_file(crispin_client.account_id,
                                          folder_name))

    new_uid_poller.kill()
Example #24
    def initial_sync_impl(self, crispin_client, local_uids,
                          uid_download_stack):
        # We wrap the block in a try/finally because the greenlets like
        # new_uid_poller need to be killed when this greenlet is interrupted
        try:
            remote_uid_count = len(set(crispin_client.all_uids()))
            remote_g_metadata, update_uid_count = self.__fetch_g_metadata(
                crispin_client, local_uids)
            remote_uids = sorted(remote_g_metadata.keys(), key=int)
            log.info(remote_uid_count=len(remote_uids))
            if self.folder_name == crispin_client.folder_names()['all']:
                log.info(local_uid_count=len(local_uids))

            with self.syncmanager_lock:
                log.debug('gmail_initial_sync grabbed syncmanager_lock')
                with mailsync_session_scope() as db_session:
                    deleted_uids = self.remove_deleted_uids(
                        db_session, local_uids, remote_uids)
                    delete_uid_count = len(deleted_uids)

                    local_uids = set(local_uids) - deleted_uids
                    unknown_uids = set(remote_uids) - local_uids

                    # Persist the num(messages) to sync (any type of sync:
                    # download, update or delete) before we start.  Note that
                    # num_local_deleted, num_local_updated ARE the numbers to
                    # delete/update too since we make those changes right away
                    # before we start downloading messages.
                    self.update_uid_counts(
                        db_session,
                        remote_uid_count=remote_uid_count,
                        download_uid_count=len(unknown_uids),
                        update_uid_count=update_uid_count,
                        delete_uid_count=delete_uid_count)

            if self.folder_name == crispin_client.folder_names()['inbox']:
                # We don't do an initial dedupe for Inbox because we do thread
                # expansion, which means even if we have a given msgid
                # downloaded, we miiight not have the whole thread. This means
                # that restarts cause duplicate work, but hopefully these
                # folders aren't too huge.
                message_download_stack = LifoQueue()
                flags = crispin_client.flags(unknown_uids)
                for uid in unknown_uids:
                    if uid in flags:
                        message_download_stack.put(
                            GMessage(uid, remote_g_metadata[uid],
                                     flags[uid].flags, flags[uid].labels))
                new_uid_poller = spawn(self.__check_new_g_thrids,
                                       message_download_stack)
                self.__download_queued_threads(crispin_client,
                                               message_download_stack)
            elif self.folder_name in uid_download_folders(crispin_client):
                full_download = self.__deduplicate_message_download(
                    crispin_client, remote_g_metadata, unknown_uids)
                add_uids_to_stack(full_download, uid_download_stack)
                new_uid_poller = spawn(self.check_new_uids, uid_download_stack)
                self.download_uids(crispin_client, uid_download_stack)
            else:
                raise MailsyncError('Unknown Gmail sync folder: {}'.format(
                    self.folder_name))

            # Complete X-GM-MSGID mapping is no longer needed after initial
            # sync.
            rm_cache(
                remote_g_metadata_cache_file(self.account_id,
                                             self.folder_name))
        finally:
            new_uid_poller.kill()
Example #25
 def init_tasks_queue(self, sub_domains):
     tasks_queue = LifoQueue()
     for sub_domain in sub_domains:
         tasks_queue.put(sub_domain)
     return tasks_queue
Example #26
 def __init__(self):
     self._lifoqueue = LifoQueue()
Example #27
class ClientPool(object):
    DEFAULT_CLIENT_EXPIRE_TIME = 300
    DEFAULT_CLOSE_EXPIRE_CLIENT_INTERVAL = 60

    def __init__(self, pool_name, pool_size, client_class,
                 close_client_handler, *client_args, **client_kwargs):
        assert pool_size > 0
        assert client_class is not None and hasattr(client_class, '__call__')
        assert close_client_handler is None or hasattr(close_client_handler,
                                                       '__call__')
        self._pool_name = pool_name
        self._pool_size = pool_size
        self._client_class = client_class
        self._close_client_handler = close_client_handler
        self._client_args = client_args
        self._client_kwargs = client_kwargs
        self._queue = LifoQueue(maxsize=pool_size)
        for i in range(pool_size):
            self._queue.put(ClientHolder())
        self._client_expire_time = self.DEFAULT_CLIENT_EXPIRE_TIME
        self._gc_task = ScheduleTask(
            name='ClientPool-GC-%s' % pool_name,
            start_after_seconds=0,
            interval_seconds=self.DEFAULT_CLOSE_EXPIRE_CLIENT_INTERVAL,
            handler=self._close_expire_client)
        self._gc_task.run()

    def __del__(self):
        self._gc_task.stop()

    @contextmanager
    def get_client(self,
                   block=True,
                   pool_acquire_client_timeout=1,
                   req_timeout=5):
        client_holder = self._get_client(block, pool_acquire_client_timeout)
        tm = None
        try:
            tm = gevent.Timeout.start_new(req_timeout)
            yield client_holder.get_client()
        except BaseException as e:
            logger.error(
                'Client was out of the pool for too long (%s seconds); exception: %s',
                req_timeout, e)
            self._close_client(client_holder)
            raise
        finally:
            if tm:
                tm.cancel()
            self.push(client_holder)

    def _get_client(self, block=True, timeout=1):
        if self.is_empty():
            logger.info('ClientPool: %s is empty.', self._pool_name)
        client_holder = self._queue.get(block=block, timeout=timeout)
        if client_holder.get_client() is None:
            tm = None
            try:
                tm = gevent.Timeout.start_new(timeout)
                client_holder.set_client(self._create_client())
            except BaseException as e:
                client_holder.set_client(None)
                self.push(client_holder)
                raise
            finally:
                if tm:
                    tm.cancel()
        client_holder.set_access_time(time.time())
        return client_holder

    def push(self, client_holder):
        if not self.is_full():
            self._queue.put_nowait(client_holder)

    def is_full(self):
        return self._queue.qsize() >= self._pool_size

    def is_empty(self):
        return self._queue.qsize() <= 0

    def _create_client(self):
        return self._client_class(*self._client_args, **self._client_kwargs)

    def _close_client(self, client_holder):
        if self._close_client_handler and client_holder.get_client():
            try:
                self._close_client_handler(client_holder.get_client())
            except Exception as e:
                logger.error('Close client raise exception: %s', e)
        client_holder.set_client(None)

    def _close_expire_client(self):
        cur_time = time.time()
        need_closed_clients = []
        for client_holder in self._queue.queue:
            if (client_holder.get_client() and
                    cur_time - client_holder.get_access_time() > self._client_expire_time):
                need_closed_clients.append(client_holder.get_client())

        for client in need_closed_clients:
            self._close_client_handler(client)
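
A hypothetical usage of ClientPool, assuming the ClientHolder and ScheduleTask helpers referenced above are importable; the client factory and close handler are placeholders rather than real client classes:

def make_client():
    return {'connected': True}        # stand-in for e.g. a Thrift or HTTP client

def close_client(client):
    client['connected'] = False

pool = ClientPool('example-pool', pool_size=2, client_class=make_client,
                  close_client_handler=close_client)

with pool.get_client(pool_acquire_client_timeout=1, req_timeout=5) as client:
    assert client['connected']        # use the client while it is checked out
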
Example #28
def gmail_initial_sync(crispin_client, db_session, log, folder_name,
                       shared_state, local_uids, uid_download_stack):
    remote_g_metadata = get_g_metadata(crispin_client, db_session, log,
                                       folder_name, local_uids,
                                       shared_state['syncmanager_lock'])
    remote_uids = sorted(remote_g_metadata.keys(), key=int)
    log.info("Found {0} UIDs for folder {1}".format(len(remote_uids),
                                                    folder_name))
    if folder_name == crispin_client.folder_names()['all']:
        log.info("Already have {0} UIDs".format(len(local_uids)))

    with shared_state['syncmanager_lock']:
        log.debug("gmail_initial_sync grabbed syncmanager_lock")
        deleted_uids = remove_deleted_uids(
            crispin_client.account_id, db_session, log, folder_name,
            local_uids, remote_uids)
    local_uids = set(local_uids) - deleted_uids
    unknown_uids = set(remote_uids) - local_uids

    # folders that don't get thread expanded
    uid_download_folders = [crispin_client.folder_names()[tag] for tag in
                            ('trash', 'spam', 'all') if tag in
                            crispin_client.folder_names()]

    if folder_name == crispin_client.folder_names()['inbox']:
        # We don't do an initial dedupe for Inbox because we do thread
        # expansion, which means even if we have a given msgid downloaded, we
        # miiight not have the whole thread. This means that restarts cause
        # duplicate work, but hopefully these folders aren't too huge.
        message_download_stack = LifoQueue()
        flags = crispin_client.flags(unknown_uids)
        for uid in unknown_uids:
            if uid in flags:
                message_download_stack.put(
                    GMessage(uid, remote_g_metadata[uid], flags[uid].flags,
                             flags[uid].labels))
        new_uid_poller = spawn(check_new_g_thrids, crispin_client.account_id,
                               crispin_client.PROVIDER, folder_name, log,
                               message_download_stack,
                               shared_state['poll_frequency'],
                               shared_state['syncmanager_lock'])
        download_queued_threads(crispin_client, db_session, log, folder_name,
                                message_download_stack,
                                shared_state['status_cb'],
                                shared_state['syncmanager_lock'])
    elif folder_name in uid_download_folders:
        full_download = deduplicate_message_download(
            crispin_client, db_session, log, shared_state['syncmanager_lock'],
            remote_g_metadata, unknown_uids)

        add_uids_to_stack(full_download, uid_download_stack)
        new_uid_poller = spawn(check_new_uids, crispin_client.account_id,
                               crispin_client.PROVIDER, folder_name,
                               log, uid_download_stack,
                               shared_state['poll_frequency'],
                               shared_state['syncmanager_lock'])
        download_queued_uids(crispin_client, db_session, log, folder_name,
                             uid_download_stack, len(local_uids),
                             len(remote_uids), shared_state['status_cb'],
                             shared_state['syncmanager_lock'],
                             gmail_download_and_commit_uids,
                             create_gmail_message)
    else:
        raise MailsyncError(
            "Unknown Gmail sync folder: {}".format(folder_name))

    # Complete X-GM-MSGID mapping is no longer needed after initial sync.
    rm_cache(remote_g_metadata_cache_file(crispin_client.account_id,
                                          folder_name))

    new_uid_poller.kill()
Example #29
def gmail_initial_sync(crispin_client, db_session, log, folder_name,
                       shared_state, local_uids, uid_download_stack):
    remote_g_metadata = get_g_metadata(crispin_client, db_session, log,
                                       folder_name, local_uids,
                                       shared_state['syncmanager_lock'])
    remote_uids = sorted(remote_g_metadata.keys(), key=int)
    log.info("Found {0} UIDs for folder {1}".format(len(remote_uids),
                                                    folder_name))
    if folder_name == crispin_client.folder_names()['all']:
        log.info("Already have {0} UIDs".format(len(local_uids)))

    with shared_state['syncmanager_lock']:
        log.debug("gmail_initial_sync grabbed syncmanager_lock")
        deleted_uids = remove_deleted_uids(
            crispin_client.account_id, db_session, log, folder_name,
            local_uids, remote_uids)
    local_uids = set(local_uids) - deleted_uids
    unknown_uids = set(remote_uids) - local_uids

    if folder_name == crispin_client.folder_names()['inbox']:
        # We don't do an initial dedupe for Inbox because we do thread
        # expansion, which means even if we have a given msgid downloaded, we
        # miiight not have the whole thread. This means that restarts cause
        # duplicate work, but hopefully these folders aren't too huge.
        message_download_stack = LifoQueue()
        flags = crispin_client.flags(unknown_uids)
        for uid in unknown_uids:
            if uid in flags:
                message_download_stack.put(
                    GMessage(uid, remote_g_metadata[uid], flags[uid].flags,
                             flags[uid].labels))
        new_uid_poller = spawn(check_new_g_thrids, crispin_client.account_id,
                               crispin_client.PROVIDER, folder_name, log,
                               message_download_stack,
                               shared_state['poll_frequency'],
                               shared_state['syncmanager_lock'])
        download_queued_threads(crispin_client, db_session, log, folder_name,
                                message_download_stack,
                                shared_state['status_cb'],
                                shared_state['syncmanager_lock'])
    elif folder_name in uid_download_folders(crispin_client):
        full_download = deduplicate_message_download(
            crispin_client, db_session, log, shared_state['syncmanager_lock'],
            remote_g_metadata, unknown_uids)

        add_uids_to_stack(full_download, uid_download_stack)
        new_uid_poller = spawn(check_new_uids, crispin_client.account_id,
                               crispin_client.PROVIDER, folder_name,
                               log, uid_download_stack,
                               shared_state['poll_frequency'],
                               shared_state['syncmanager_lock'])
        download_queued_uids(crispin_client, db_session, log, folder_name,
                             uid_download_stack, len(local_uids),
                             len(remote_uids), shared_state['status_cb'],
                             shared_state['syncmanager_lock'],
                             gmail_download_and_commit_uids,
                             create_gmail_message)
    else:
        raise MailsyncError(
            "Unknown Gmail sync folder: {}".format(folder_name))

    # Complete X-GM-MSGID mapping is no longer needed after initial sync.
    rm_cache(remote_g_metadata_cache_file(crispin_client.account_id,
                                          folder_name))

    new_uid_poller.kill()
Example #30
0
def uid_list_to_stack(uids):
    """ UID download function needs a stack even for polling. """
    uid_download_stack = LifoQueue()
    for uid in sorted(uids, key=int):
        uid_download_stack.put(uid)
    return uid_download_stack
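A quick stand-alone sketch (not part of the original project) of why this helper builds a stack: because the UIDs are pushed in ascending numeric order, gevent's LifoQueue hands back the highest (newest) UIDs first.

from gevent.queue import LifoQueue

stack = LifoQueue()
for uid in sorted(['3', '12', '7'], key=int):  # pushes 3, 7, 12
    stack.put(uid)

while not stack.empty():
    print(stack.get())  # pops 12, 7, 3 -- the newest UIDs get downloaded first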
Example #31
0
import scrapy
from redis import StrictRedis, ConnectionPool
from gevent.queue import LifoQueue
from gevent.pool import Pool
from urllib.parse import quote, unquote
import gevent.monkey
from pymongo import MongoClient
import time
import traceback
import json
import math
import argparse
from pprint import pprint

redis_queue = StrictRedis(connection_pool=ConnectionPool(host='127.0.0.1', port=6379, db=0, decode_responses=True))
task_queue = LifoQueue()

SEARCH_KEY = 'search_key'
SEARCH_HTML = 'search_html'
LIST_LINK = 'list_link'
LIST_HTML = 'list_html'
FIRST_COMMENT_LINK = 'first_comment_link'
FIRST_COMMENT_HTML = 'first_comment_html'
COMMENT_LINK = 'comment_link'
COMMENT_HTML = 'comment_html'
# MAX_PAGE = 'max_page'
IP_KEY = 'ip'


class JDCrawler(object):
    def __init__(self):
Example #32
0
class SQLPlugin(Plugin):
    global_plugin = True

    def load(self, ctx):
        self.models = ctx.get('models', {})
        self.backfills = {}
        self.user_updates = LifoQueue(maxsize=4096)
        super(SQLPlugin, self).load(ctx)

    def unload(self, ctx):
        ctx['models'] = self.models
        super(SQLPlugin, self).unload(ctx)

    @Plugin.schedule(15, init=False)
    def update_users(self):
        already_updated = set()

        while True:
            # Only update so many at a time
            if len(already_updated) > 10000:
                return

            try:
                user_id, data = self.user_updates.get_nowait()
            except Empty:
                return

            if user_id in already_updated:
                continue

            already_updated.add(user_id)

            try:
                User.update(**data).where(User.user_id == user_id).execute()
            except:
                self.log.exception('Failed to update user %s: ', user_id)

    @Plugin.listen('VoiceStateUpdate', priority=Priority.BEFORE)
    def on_voice_state_update(self, event):
        pre_state = self.state.voice_states.get(event.session_id)
        GuildVoiceSession.create_or_update(pre_state, event.state)

    @Plugin.listen('PresenceUpdate')
    def on_presence_update(self, event):
        updates = {}

        if event.user.avatar != UNSET:
            updates['avatar'] = event.user.avatar

        if event.user.username != UNSET:
            updates['username'] = event.user.username

        if event.user.discriminator != UNSET:
            updates['discriminator'] = int(event.user.discriminator)

        if not updates:
            return

        self.user_updates.put((event.user.id, updates))

    @Plugin.listen('MessageCreate')
    def on_message_create(self, event):
        Message.from_disco_message(event.message)

    @Plugin.listen('MessageUpdate')
    def on_message_update(self, event):
        Message.from_disco_message_update(event.message)

    @Plugin.listen('MessageDelete')
    def on_message_delete(self, event):
        Message.update(deleted=True).where(Message.id == event.id).execute()

    @Plugin.listen('MessageDeleteBulk')
    def on_message_delete_bulk(self, event):
        Message.update(deleted=True).where((Message.id << event.ids)).execute()

    @Plugin.listen('MessageReactionAdd', priority=Priority.BEFORE)
    def on_message_reaction_add(self, event):
        Reaction.from_disco_reaction(event)

    @Plugin.listen('MessageReactionRemove', priority=Priority.BEFORE)
    def on_message_reaction_remove(self, event):
        Reaction.delete().where(
            (Reaction.message_id == event.message_id) &
            (Reaction.user_id == event.user_id) &
            (Reaction.emoji_id == (event.emoji.id or None)) &
            (Reaction.emoji_name == (event.emoji.name or None))).execute()

    @Plugin.listen('MessageReactionRemoveAll')
    def on_message_reaction_remove_all(self, event):
        Reaction.delete().where((Reaction.message_id == event.message_id)).execute()

    @Plugin.listen('GuildEmojisUpdate', priority=Priority.BEFORE)
    def on_guild_emojis_update(self, event):
        ids = []

        for emoji in event.emojis:
            GuildEmoji.from_disco_guild_emoji(emoji, event.guild_id)
            ids.append(emoji.id)

        GuildEmoji.update(deleted=True).where(
            (GuildEmoji.guild_id == event.guild_id) &
            (~(GuildEmoji.emoji_id << ids))
        ).execute()

    @Plugin.listen('GuildCreate')
    def on_guild_create(self, event):
        for channel in list(event.channels.values()):
            Channel.from_disco_channel(channel)

        for emoji in list(event.emojis.values()):
            GuildEmoji.from_disco_guild_emoji(emoji, guild_id=event.guild.id)

    @Plugin.listen('GuildDelete')
    def on_guild_delete(self, event):
        if event.deleted:
            Channel.update(deleted=True).where(
                Channel.guild_id == event.id
            ).execute()

    @Plugin.listen('ChannelCreate')
    def on_channel_create(self, event):
        Channel.from_disco_channel(event.channel)

    @Plugin.listen('ChannelUpdate')
    def on_channel_update(self, event):
        Channel.from_disco_channel(event.channel)

    @Plugin.listen('ChannelDelete')
    def on_channel_delete(self, event):
        Channel.update(deleted=True).where(Channel.channel_id == event.channel.id).execute()

    @Plugin.command('sql', level=-1, global_=True)
    def command_sql(self, event):
        conn = database.obj.get_conn()

        try:
            tbl = MessageTable(codeblock=False)

            with conn.cursor() as cur:
                start = time.time()
                cur.execute(event.codeblock.format(e=event))
                dur = time.time() - start

                if not cur.description:
                    return event.msg.reply('_Query took {}ms - no result._'.format(int(dur * 1000)))

                tbl.set_header(*[desc[0] for desc in cur.description])

                for row in cur.fetchall():
                    tbl.add(*row)

                result = tbl.compile()
                if len(result) > 1900:
                    return event.msg.reply(
                        '_Query took {}ms_'.format(int(dur * 1000)),
                        attachments=[('sql_result_{}.txt'.format(event.msg.id), result)]
                    )

                event.msg.reply(u'```{}```_Query took {}ms_'.format(result, int(dur * 1000)))
        except psycopg2.Error as e:
            event.msg.reply(u'```{}```'.format(e.pgerror))

    @Plugin.command('init', '<entity:user|channel>', level=-1, group='markov', global_=True)
    def command_markov(self, event, entity):
        if isinstance(entity, DiscoUser):
            q = Message.select().where(Message.author_id == entity.id).limit(500000)
        else:
            q = Message.select().where(Message.channel_id == entity.id).limit(500000)

        text = [msg.content for msg in q]
        self.models[entity.id] = markovify.NewlineText('\n'.join(text))
        event.msg.reply(u':ok_hand: created markov model for {} using {} messages'.format(entity, len(text)))

    @Plugin.command('one', '<entity:user|channel>', level=-1, group='markov', global_=True)
    def command_markov_one(self, event, entity):
        if entity.id not in self.models:
            return event.msg.reply(':warning: no model created yet for {}'.format(entity))

        sentence = self.models[entity.id].make_sentence(max_overlap_ratio=1, max_overlap_total=500)
        if not sentence:
            event.msg.reply(':warning: not enough data :(')
            return
        event.msg.reply(u'{}: {}'.format(entity, sentence))

    @Plugin.command('many', '<entity:user|channel> [count:int]', level=-1, group='markov', global_=True)
    def command_markov_many(self, event, entity, count=5):
        if entity.id not in self.models:
            return event.msg.reply(':warning: no model created yet for {}'.format(entity))

        for _ in range(int(count)):
            sentence = self.models[entity.id].make_sentence(max_overlap_total=500)
            if not sentence:
                event.msg.reply(':warning: not enough data :(')
                return
            event.msg.reply(u'{}: {}'.format(entity, sentence))

    @Plugin.command('list', level=-1, group='markov', global_=True)
    def command_markov_list(self, event):
        event.msg.reply(u'`{}`'.format(', '.join(map(str, self.models.keys()))))

    @Plugin.command('delete', '<oid:snowflake>', level=-1, group='markov', global_=True)
    def command_markov_delete(self, event, oid):
        if oid not in self.models:
            return event.msg.reply(':warning: no model with that ID')

        del self.models[oid]
        event.msg.reply(':ok_hand: deleted model')

    @Plugin.command('clear', level=-1, group='markov', global_=True)
    def command_markov_clear(self, event):
        self.models = {}
        event.msg.reply(':ok_hand: cleared models')

    @Plugin.command('message', '<channel:snowflake> <message:snowflake>', level=-1, group='backfill', global_=True)
    def command_backfill_message(self, event, channel, message):
        channel = self.state.channels.get(channel)
        Message.from_disco_message(channel.get_message(message))
        return event.msg.reply(':ok_hand: backfilled')

    @Plugin.command('reactions', '<message:snowflake>', level=-1, group='backfill', global_=True)
    def command_sql_reactions(self, event, message):
        try:
            message = Message.get(id=message)
        except Message.DoesNotExist:
            return event.msg.reply(':warning: no message found')

        message = self.state.channels.get(message.channel_id).get_message(message.id)
        for reaction in message.reactions:
            for users in message.get_reactors(reaction.emoji, bulk=True):
                Reaction.from_disco_reactors(message.id, reaction, (i.id for i in users))

    @Plugin.command('global', '<duration:str> [pool:int]', level=-1, global_=True, context={'mode': 'global'}, group='recover')
    @Plugin.command('here', '<duration:str> [pool:int]', level=-1, global_=True, context={'mode': 'here'}, group='recover')
    def command_recover(self, event, duration, pool=4, mode=None):
        if mode == 'global':
            channels = list(self.state.channels.values())
        else:
            channels = list(event.guild.channels.values())

        start_at = parse_duration(duration, negative=True)

        pool = Pool(pool)

        total = len(channels)
        msg = event.msg.reply('Recovery Status: 0/{}'.format(total))
        recoveries = []

        def updater():
            last = len(recoveries)

            while True:
                if last != len(recoveries):
                    last = len(recoveries)
                    msg.edit('Recovery Status: {}/{}'.format(len(recoveries), total))
                gevent.sleep(5)

        u = self.spawn(updater)

        try:
            for channel in channels:
                pool.wait_available()
                r = Recovery(self.log, channel, start_at)
                pool.spawn(r.run)
                recoveries.append(r)
        finally:
            pool.join()
            u.kill()

        msg.edit('RECOVERY COMPLETED ({} total messages)'.format(
            sum([i._recovered for i in recoveries])
        ))

    @Plugin.command('backfill channel', '[channel:snowflake]', level=-1, global_=True)
    def command_backfill_channel(self, event, channel=None):
        channel = self.state.channels.get(channel) if channel else event.channel
        backfill_channel.queue(channel.id)
        event.msg.reply(':ok_hand: enqueued channel to be backfilled')

    @Plugin.command('backfill guild', '[guild:guild] [concurrency:int]', level=-1, global_=True)
    def command_backfill_guild(self, event, guild=None, concurrency=1):
        guild = guild or event.guild
        backfill_guild.queue(guild.id)
        event.msg.reply(':ok_hand: enqueued guild to be backfilled')

    @Plugin.command('usage', '<word:str> [unit:str] [amount:int]', level=-1, group='words')
    def words_usage(self, event, word, unit='days', amount=7):
        sql = '''
            SELECT date, coalesce(count, 0) AS count
            FROM
                generate_series(
                    NOW() - interval %s,
                    NOW(),
                    %s
                ) AS date
            LEFT OUTER JOIN (
                SELECT date_trunc(%s, timestamp) AS dt, count(*) AS count
                FROM messages
                WHERE
                    timestamp >= (NOW() - interval %s) AND
                    timestamp < (NOW()) AND
                    guild_id=%s AND
                    (SELECT count(*) FROM regexp_matches(content, %s)) >= 1
                GROUP BY dt
            ) results
            ON (date_trunc(%s, date) = results.dt);
        '''

        msg = event.msg.reply(':alarm_clock: One moment pls...')

        start = time.time()
        tuples = list(Message.raw(
            sql,
            '{} {}'.format(amount, unit),
            '1 {}'.format(unit),
            unit,
            '{} {}'.format(amount, unit),
            event.guild.id,
            r'\s?{}\s?'.format(word),
            unit
        ).tuples())
        sql_duration = time.time() - start

        start = time.time()
        chart = pygal.Line()
        chart.title = 'Usage of {} Over {} {}'.format(
            word, amount, unit,
        )

        if unit == 'days':
            chart.x_labels = [i[0].strftime('%a %d') for i in tuples]
        elif unit == 'minutes':
            chart.x_labels = [i[0].strftime('%X') for i in tuples]
        else:
            chart.x_labels = [i[0].strftime('%x %X') for i in tuples]

        chart.x_labels = [i[0] for i in tuples]
        chart.add(word, [i[1] for i in tuples])

        pngdata = cairosvg.svg2png(
            bytestring=chart.render(),
            dpi=72)
        chart_duration = time.time() - start

        event.msg.reply(
            '_SQL: {}ms_ - _Chart: {}ms_'.format(
                int(sql_duration * 1000),
                int(chart_duration * 1000),
            ),
            attachments=[('chart.png', pngdata)])
        msg.delete()

    @Plugin.command('top', '<target:user|channel|guild>', level=-1, group='words')
    def words_top(self, event, target):
        if isinstance(target, DiscoUser):
            q = 'author_id'
        elif isinstance(target, DiscoChannel):
            q = 'channel_id'
        elif isinstance(target, DiscoGuild):
            q = 'guild_id'
        else:
            raise Exception("You should not be here")

        sql = """
            SELECT word, count(*)
            FROM (
                SELECT regexp_split_to_table(content, '\s') as word
                FROM messages
                WHERE {}=%s
                LIMIT 3000000
            ) t
            GROUP BY word
            ORDER BY 2 DESC
            LIMIT 30
        """.format(q)

        t = MessageTable()
        t.set_header('Word', 'Count')

        for word, count in Message.raw(sql, target.id).tuples():
            if '```' in word:
                continue
            t.add(word, count)

        event.msg.reply(t.compile())
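The update_users/on_presence_update pair above treats a bounded LifoQueue as a coalescing buffer. A minimal stand-alone sketch of that pattern (hypothetical names, not the plugin's API): drain newest-first with get_nowait() and deduplicate by user id so only the most recent update per user is applied.

from gevent.queue import Empty, LifoQueue

user_updates = LifoQueue(maxsize=4096)

# Producer side: push (user_id, fields) pairs as presence events arrive.
user_updates.put((123, {'username': 'alice'}))
user_updates.put((456, {'avatar': 'abc123'}))
user_updates.put((123, {'username': 'alice2'}))  # newer update for the same user


def drain_updates(apply_update, limit=10000):
    """Drain the stack newest-first, applying at most one update per user."""
    already_updated = set()
    while len(already_updated) <= limit:
        try:
            user_id, data = user_updates.get_nowait()
        except Empty:
            return
        if user_id in already_updated:
            continue  # an older update for this user; skip it
        already_updated.add(user_id)
        apply_update(user_id, data)


def show(user_id, data):
    print(user_id, data)


drain_updates(show)  # prints the 'alice2' update for 123, then the avatar update for 456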
Example #33
0
 def load(self, ctx):
     self.models = ctx.get('models', {})
     self.backfills = {}
     self.user_updates = LifoQueue(maxsize=4096)
     super(SQLPlugin, self).load(ctx)
Example #34
0
def uid_list_to_stack(uids):
    """ UID download function needs a stack even for polling. """
    uid_download_stack = LifoQueue()
    for uid in sorted(uids, key=int):
        uid_download_stack.put(uid)
    return uid_download_stack
Example #35
0
class AbstractDatabaseConnectionPool(object):
    def __init__(self, maxsize=100, maxwait=1.0, expires=None, cleanup=None):
        """
        The pool manages open connections to the database. The main strategy is to keep the
        smallest number of live connections required for good web service performance.
        In most cases connections are taken from the pool. During peaks in demand the pool
        creates some extra connections to keep the service available; during periods of low
        traffic (e.g. at night) unneeded connections are released.

        Parameters
        ----------
        maxsize : int
                  Soft limit on the number of created connections. After reaching this limit,
                  taking the next connection first waits up to `maxwait` seconds for a returned slot.
        maxwait : float
                  The time in seconds to wait before creating a new connection once the pool is empty.
                  It may be 0, in which case new connections are created immediately until
                  `maxoverflow` is reached.
        expires : float
                  The time in seconds that a connection may stay alive.
                  It is also used to close unneeded slots.
        """
        if not isinstance(maxsize, integer_types):
            raise TypeError('Expected integer, got %r' % (maxsize, ))

        self._maxsize = maxsize
        self._maxwait = maxwait
        self._expires = expires
        self._cleanup = cleanup
        self._created_at = {}
        self._latest_use = {}
        self._pool = LifoQueue()
        self._size = 0
        self._latest_cleanup = 0 if self._expires or self._cleanup else 0xffffffffffffffff
        self._interval_cleanup = min(
            self._expires or self._cleanup, self._cleanup
            or self._expires) if self._expires or self._cleanup else 0
        self._cleanup_lock = Semaphore(value=1)

    def create_connection(self):
        raise NotImplementedError()

    def close_connection(self, item):
        try:
            self._size -= 1
            self._created_at.pop(id(item), None)
            self._latest_use.pop(id(item), None)
            item.close()
        except Exception:
            pass

    def cleanup(self):
        self._cleanup_queue(time.time())

    def _cleanup_queue(self, now):

        if self._latest_cleanup > now:
            return

        with self._cleanup_lock:

            if self._latest_cleanup > now:
                return

            self._latest_cleanup = now + self._interval_cleanup

            cleanup = now - self._cleanup if self._cleanup else None
            expires = now - self._expires if self._expires else None

            # Instead of creating a new LIFO for self._pool, the old one is reused,
            # because other greenlets might be waiting for a connection on it.
            fresh_slots = []

            try:
                # Try to refill self._pool as quickly as possible to prevent creation of
                # new connections; note that after this loop the LIFO is in reversed order.
                while not self._pool.empty():
                    item = self._pool.get_nowait()
                    if cleanup and self._latest_use.get(id(item), 0) < cleanup:
                        self.close_connection(item)
                    elif expires and self._created_at.get(id(item),
                                                          0) < expires:
                        self.close_connection(item)
                    else:
                        fresh_slots.append(item)
            except Empty:
                pass

            # Reverse the order back (the freshest connections should be at the beginning).
            for conn in reversed(fresh_slots):
                self._pool.put_nowait(conn)

    def get(self):

        try:
            return self._pool.get_nowait()
        except Empty:
            pass

        if self._size >= self._maxsize:
            try:
                return self._pool.get(timeout=self._maxwait)
            except Empty:
                pass

        # It is possible that after waiting self._maxwait seconds no connection has been returned,
        # because old ones were cleaned up on put(), so there is no connection available but the
        # LIFO is also not full. In that case a new connection should be created; otherwise an
        # exception is raised.
        if self._size >= self._maxsize:
            raise OperationalError(
                "Too many connections created: {} (maxsize is {})".format(
                    self._size, self._maxsize))

        try:
            self._size += 1
            conn = self.create_connection()
        except:
            self._size -= 1
            raise

        now = time.time()
        self._created_at[id(conn)] = now
        self._latest_use[id(conn)] = now
        return conn

    def put(self, conn):
        now = time.time()
        self._pool.put(conn)
        self._latest_use[id(conn)] = now

        self._cleanup_queue(now)

    def closeall(self):
        while not self._pool.empty():
            conn = self._pool.get_nowait()
            try:
                conn.close()
            except Exception:
                pass
        self._size = 0

    @contextlib.contextmanager
    def connection(self, isolation_level=None):
        conn = self.get()
        try:
            if isolation_level is not None:
                if conn.isolation_level == isolation_level:
                    isolation_level = None
                else:
                    conn.set_isolation_level(isolation_level)
            yield conn
        except:
            if conn.closed:
                conn = None
                self.closeall()
            else:
                conn = self._rollback(conn)
            raise
        else:
            if conn.closed:
                raise OperationalError(
                    "Cannot commit because connection was closed: %r" %
                    (conn, ))
            conn.commit()
        finally:
            if conn is not None and not conn.closed:
                if isolation_level is not None:
                    conn.set_isolation_level(isolation_level)
                self.put(conn)

    @contextlib.contextmanager
    def cursor(self, *args, **kwargs):
        isolation_level = kwargs.pop('isolation_level', None)
        with self.connection(isolation_level) as conn:
            yield conn.cursor(*args, **kwargs)

    def _rollback(self, conn):
        try:
            conn.rollback()
        except:
            gevent.get_hub().handle_error(conn, *sys.exc_info())
            return
        return conn

    def execute(self, *args, **kwargs):
        with self.cursor(**kwargs) as cursor:
            cursor.execute(*args)
            return cursor.rowcount

    def fetchone(self, *args, **kwargs):
        with self.cursor(**kwargs) as cursor:
            cursor.execute(*args)
            return cursor.fetchone()

    def fetchall(self, *args, **kwargs):
        with self.cursor(**kwargs) as cursor:
            cursor.execute(*args)
            return cursor.fetchall()

    def fetchiter(self, *args, **kwargs):
        with self.cursor(**kwargs) as cursor:
            cursor.execute(*args)
            while True:
                items = cursor.fetchmany()
                if not items:
                    break
                for item in items:
                    yield item
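A hedged sketch of how such an abstract pool is typically specialised (psycopg2 and the DSN below are placeholders, and the abstract class plus its gevent/time imports are assumed to be in scope): a subclass only implements create_connection, and the connection()/cursor()/fetchone() helpers then hand out pooled connections.

import psycopg2


class Psycopg2ConnectionPool(AbstractDatabaseConnectionPool):
    def __init__(self, dsn, **kwargs):
        self.dsn = dsn
        super(Psycopg2ConnectionPool, self).__init__(**kwargs)

    def create_connection(self):
        # Each new slot is simply a fresh psycopg2 connection.
        return psycopg2.connect(self.dsn)


pool = Psycopg2ConnectionPool('dbname=test user=test', maxsize=10, maxwait=1.0)
print(pool.fetchone('SELECT 1'))  # borrows a connection, runs the query, returns it to the pool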
Example #36
0
 def __init__(self):
     self._lifoqueue = LifoQueue()
Example #37
0
    def initial_sync_impl(self, crispin_client, local_uids,
                          uid_download_stack):
        # We wrap the block in a try/finally because the greenlets like
        # new_uid_poller need to be killed when this greenlet is interrupted
        try:
            remote_uid_count = len(set(crispin_client.all_uids()))
            remote_g_metadata, update_uid_count = self.__fetch_g_metadata(
                crispin_client, local_uids)
            remote_uids = sorted(remote_g_metadata.keys(), key=int)
            log.info(remote_uid_count=len(remote_uids))
            if self.folder_name == crispin_client.folder_names()['all']:
                log.info(local_uid_count=len(local_uids))

            with self.syncmanager_lock:
                log.debug('gmail_initial_sync grabbed syncmanager_lock')
                with mailsync_session_scope() as db_session:
                    deleted_uids = self.remove_deleted_uids(
                        db_session, local_uids, remote_uids)
                    delete_uid_count = len(deleted_uids)

                    local_uids = set(local_uids) - deleted_uids
                    unknown_uids = set(remote_uids) - local_uids

                    # Persist the num(messages) to sync (any type of sync:
                    # download, update or delete) before we start.  Note that
                    # num_local_deleted, num_local_updated ARE the numbers to
                    # delete/update too since we make those changes rightaway
                    # before we start downloading messages.
                    self.update_uid_counts(
                        db_session, remote_uid_count=remote_uid_count,
                        download_uid_count=len(unknown_uids),
                        update_uid_count=update_uid_count,
                        delete_uid_count=delete_uid_count)

            if self.folder_name == crispin_client.folder_names()['inbox']:
                # We don't do an initial dedupe for Inbox because we do thread
                # expansion, which means even if we have a given msgid
                # downloaded, we miiight not have the whole thread. This means
                # that restarts cause duplicate work, but hopefully these
                # folders aren't too huge.
                message_download_stack = LifoQueue()
                flags = crispin_client.flags(unknown_uids)
                for uid in unknown_uids:
                    if uid in flags:
                        message_download_stack.put(
                            GMessage(uid, remote_g_metadata[uid],
                                     flags[uid].flags, flags[uid].labels))
                new_uid_poller = spawn(self.__check_new_g_thrids,
                                       message_download_stack)
                self.__download_queued_threads(crispin_client,
                                               message_download_stack)
            elif self.folder_name in uid_download_folders(crispin_client):
                full_download = self.__deduplicate_message_download(
                    crispin_client, remote_g_metadata, unknown_uids)
                add_uids_to_stack(full_download, uid_download_stack)
                new_uid_poller = spawn(self.check_new_uids, uid_download_stack)
                self.download_uids(crispin_client, uid_download_stack)
            else:
                raise MailsyncError(
                    'Unknown Gmail sync folder: {}'.format(self.folder_name))

            # Complete X-GM-MSGID mapping is no longer needed after initial
            # sync.
            rm_cache(remote_g_metadata_cache_file(self.account_id,
                                                  self.folder_name))
        finally:
            new_uid_poller.kill()
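The try/finally noted at the top of initial_sync_impl is the usual gevent pattern for making sure a background poller greenlet never outlives the sync that spawned it. A tiny stand-alone sketch of just that pattern (the bodies are placeholders):

import gevent


def poll_for_new_uids():
    while True:
        gevent.sleep(30)  # placeholder: check the folder for new UIDs here


new_uid_poller = gevent.spawn(poll_for_new_uids)
try:
    gevent.sleep(0.1)  # placeholder for the initial-sync work
finally:
    # Runs even if this greenlet is killed, so the poller is never leaked.
    new_uid_poller.kill()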
Example #38
0
def _migrator_with_worker_pool(migrator, reindexer, iterable, max_retry, num_workers):
    """Migrate in parallel with worker pool

    When running in steady state, a failed doc will be retried up to the
    max retry limit. Documents awaiting retry and all documents that
    started the migration process but did not finish will be saved and
    retried on the next run if the migration is stopped before it
    completes.
    """
    def work_on(doc, key, retry_count):
        try:
            ok = migrator.migrate(doc)
            assert ok, "run_with_worker_pool expects success!"
        except Exception:
            err = traceback.format_exc().strip()
            print("Error processing blob:\n{}".format(err))
            if retry_count < max_retry:
                print("will retry {}".format(key))
                retry_blobs[key] += 1
                queue.put(doc)
                return
            migrator.save_backup(doc, "too many retries")
            print("too many retries {}".format(key))
        retry_blobs.pop(key, None)

    def retry_loop():
        for doc in queue:
            enqueue_doc(doc)

    def enqueue_doc(doc):
        key = reindexer.get_key(doc)
        retry_count = retry_blobs.setdefault(key, 0)
        # pool.spawn will block until a worker is available
        pool.spawn(work_on, doc, key, retry_count)
        # Returning True here means the underlying iterator will think
        # this doc has been processed successfully. Therefore we must
        # process this doc before the process exits or save it to be
        # processed on the next run.
        return True

    queue = LifoQueue()
    loop = gevent.spawn(retry_loop)
    pool = Pool(size=num_workers)

    class gmigrator:
        migrate = staticmethod(enqueue_doc)

    with migrator:
        retry_blobs = iterable.get_iterator_detail("retry_blobs") or {}
        for key in list(retry_blobs):
            queue.put(reindexer.load(key))
        try:
            yield gmigrator
        finally:
            try:
                print("waiting for workers to stop... (Ctrl+C to abort)")
                queue.put(StopIteration)
                loop.join()
                while not pool.join(timeout=10):
                    print("waiting for {} workers to stop...".format(len(pool)))
            finally:
                iterable.set_iterator_detail("retry_blobs", retry_blobs)
                print("done.")
Example #39
0
File: gmail.py Project: jre21/inbox
def gmail_initial_sync(crispin_client, db_session, log, folder_name,
                       shared_state, local_uids, uid_download_stack, c):
    remote_g_metadata = get_g_metadata(crispin_client, db_session, log,
                                       folder_name, local_uids,
                                       shared_state['syncmanager_lock'], c)
    remote_uids = sorted(remote_g_metadata.keys(), key=int)
    log.info("Found {0} UIDs for folder {1}".format(len(remote_uids),
                                                    folder_name))
    if folder_name == crispin_client.folder_names(c)['all']:
        log.info("Already have {0} UIDs".format(len(local_uids)))

    deleted_uids = remove_deleted_uids(
        crispin_client.account_id, db_session, log, folder_name,
        local_uids, remote_uids, shared_state['syncmanager_lock'], c)
    local_uids = set(local_uids) - deleted_uids
    unknown_uids = set(remote_uids) - local_uids

    if folder_name != crispin_client.folder_names(c)['all']:
        # We don't do an initial dedupe for non-All Mail folders because
        # we do thread expansion, which means even if we have a given msgid
        # downloaded, we miiight not have the whole thread. This means that
        # restarts cause duplicate work, but hopefully these folders aren't
        # too huge.
        message_download_stack = LifoQueue()
        flags = crispin_client.flags(unknown_uids, c)
        for uid in unknown_uids:
            if uid in flags:
                message_download_stack.put(
                    GMessage(uid, remote_g_metadata[uid], flags[uid].flags,
                             flags[uid].labels))
        new_uid_poller = spawn(check_new_g_thrids, crispin_client.account_id,
                               crispin_client.PROVIDER, folder_name, log,
                               message_download_stack,
                               shared_state['poll_frequency'],
                               shared_state['syncmanager_lock'])
        download_queued_threads(crispin_client, db_session, log, folder_name,
                                message_download_stack,
                                shared_state['status_cb'],
                                shared_state['syncmanager_lock'], c)
    else:
        full_download = deduplicate_message_download(
            crispin_client, db_session, log, remote_g_metadata, unknown_uids,
            c)

        add_uids_to_stack(full_download, uid_download_stack)
        new_uid_poller = spawn(check_new_uids, crispin_client.account_id,
                               crispin_client.PROVIDER, folder_name,
                               log, uid_download_stack,
                               shared_state['poll_frequency'],
                               shared_state['syncmanager_lock'])
        download_queued_uids(crispin_client, db_session, log, folder_name,
                             uid_download_stack, len(local_uids),
                             len(remote_uids), shared_state['status_cb'],
                             shared_state['syncmanager_lock'],
                             gmail_download_and_commit_uids,
                             account.create_gmail_message, c)

    # Complete X-GM-MSGID mapping is no longer needed after initial sync.
    rm_cache(remote_g_metadata_cache_file(crispin_client.account_id,
                                          folder_name))

    new_uid_poller.kill()