Example #1
def gmail_highestmodseq_update(crispin_client, log, folder_name, new_uids,
                               updated_uids, syncmanager_lock):
    uids = new_uids + updated_uids
    g_metadata = crispin_client.g_metadata(uids)
    to_download = deduplicate_message_download(
        crispin_client, log, syncmanager_lock, g_metadata, uids)

    if folder_name == crispin_client.folder_names()['inbox']:
        flags = crispin_client.flags(to_download)
        message_download_stack = LifoQueue()
        for uid in to_download:
            if uid in flags and uid in g_metadata:
                # IMAP will just return no data for a UID if it's disappeared
                # from the folder in the meantime.
                message_download_stack.put(GMessage(
                    uid, g_metadata[uid], flags[uid].flags, flags[uid].labels))
        download_queued_threads(crispin_client, log, folder_name,
                                message_download_stack, syncmanager_lock)
    elif folder_name in uid_download_folders(crispin_client):
        uid_download_stack = uid_list_to_stack(to_download)
        download_queued_uids(crispin_client, log, folder_name,
                             uid_download_stack, 0, uid_download_stack.qsize(),
                             syncmanager_lock, gmail_download_and_commit_uids,
                             create_gmail_message)
    else:
        raise MailsyncError(
            'Unknown Gmail sync folder: {}'.format(folder_name))
Example #2
def base_initial_sync(crispin_client, log, folder_name, shared_state,
                      initial_sync_fn, msg_create_fn):
    """ Downloads entire messages.

    This function may be retried as many times as you like; it will pick up
    where it left off, delete removed messages if things disappear between
    restarts, and only complete once we have all the UIDs in the given folder
    locally.

    This function also starts up a secondary greenlet that checks for new
    messages periodically, to deal with the case of very large folders---it's
    a bad experience for the user to keep receiving old mail but not receive
    new mail! We use a LIFO queue to make sure we're downloading newest mail
    first.
    """
    log.info('starting initial sync')

    uid_download_stack = LifoQueue()

    crispin_client.select_folder(folder_name,
                                 uidvalidity_cb(crispin_client.account_id))

    with session_scope(ignore_soft_deletes=False) as db_session:
        local_uids = account.all_uids(crispin_client.account_id, db_session,
                                      folder_name)

    initial_sync_fn(crispin_client, log, folder_name, shared_state, local_uids,
                    uid_download_stack, msg_create_fn)

    return 'poll'
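
The docstring's point about newest-first delivery is carried entirely by the LIFO discipline: whatever was put most recently is handed out first. A minimal standalone sketch (not part of the example) showing that ordering with gevent's LifoQueue:

from gevent.queue import LifoQueue

q = LifoQueue()
for uid in (101, 102, 103):  # pushed oldest to newest
    q.put(uid)

print(q.get())  # 103 -- the newest item comes off the stack first
print(q.get())  # 102
print(q.get())  # 101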
Example #3
    def __init__(self,
                 size,
                 host,
                 port,
                 db=0,
                 passwd=None,
                 socket_timeout=None):
        """
    Args:
      size - Number of connections to maintain in the pool.
      host - The hostname to use for making connections.
      port - The port to use for making connections.
      db - The database number to connect to.
      passwd - The password to use for accessing the database.
      socket_timeout - The socket timeout value for connections.
    """
        self.size = size

        self.all = set()
        self.pool = LifoQueue(maxsize=self.size)

        for _ in xrange(self.size):
            connection = redis.Connection(host,
                                          port,
                                          db,
                                          passwd,
                                          socket_timeout,
                                          encoding='utf-8',
                                          encoding_errors='strict',
                                          parser_class=DefaultParser)
            self.all.add(connection)
            self.pool.put(connection)
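
The constructor above only pre-fills the pool; checking a connection out and returning it are plain LifoQueue operations. A hypothetical checkout/checkin pair built on the same `self.pool` might look like this (`acquire` and `release` are illustrative names, not methods of the original class):

    def acquire(self, timeout=None):
        # Block until a connection is free; LIFO order means the most
        # recently released (warmest) connection is handed out first.
        return self.pool.get(timeout=timeout)

    def release(self, connection):
        # Return the connection to the pool for reuse.
        self.pool.put(connection)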
Example #4
    def __init__(self, engine, *args, **kwargs):
        from time import time
        from hashlib import md5
        from threading import Lock
        from gevent.queue import LifoQueue

        self.processor_name = '%s:%s' % (self.name, md5(str(
            time())).hexdigest()[:6])

        LoggerMixin.__init__(self)

        self.engine = engine

        self.__redis = None
        self.redis_lock = Lock()

        self.progress = 0

        self.total = 0
        # Count of bypassed (skipped) tasks
        self.bypassed_cnt = 0

        # When this limit is exceeded, add_task pauses adding new tasks to the queue
        self.maxsize = 1000
        self.tasks = LifoQueue()
        self.workers = []

        # The default polling interval is 1 second
        self.polling_interval = 1

        import argparse

        arg_parser = argparse.ArgumentParser()
        # Concurrency level
        arg_parser.add_argument('--concur', type=int)
        args, leftover = arg_parser.parse_known_args()

        from core import dhaulagiri_settings

        if args.concur:
            dhaulagiri_settings['core']['concur'] = args.concur
        self.concur = dhaulagiri_settings['core']['concur']

        self.checkpoint_ts = None
        self.checkpoint_prog = None
        self.init_ts = time()

        # Heartbeat task
        self.heart_beat = None

        # Monitor for the workers. At the start of every loop iteration, each
        # worker updates its status in this object.
        self.worker_monitor = {}
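
The `--concur` override relies on `argparse.parse_known_args`, which, unlike `parse_args`, tolerates options it does not recognize instead of exiting. A small self-contained demonstration of that behaviour:

import argparse

arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('--concur', type=int)

# parse_known_args returns (namespace, leftover) rather than erroring out,
# so other parts of the program can consume the remaining arguments.
args, leftover = arg_parser.parse_known_args(['--concur', '8', '--verbose'])
print(args.concur)  # 8
print(leftover)     # ['--verbose']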
Example #5
    def initial_sync(self):
        with self.conn_pool.get() as crispin_client:
            uid_download_stack = LifoQueue()
            crispin_client.select_folder(
                self.folder_name, uidvalidity_cb(crispin_client.account_id))

            with mailsync_session_scope() as db_session:
                local_uids = common.all_uids(crispin_client.account_id,
                                             db_session, self.folder_name)

            self.initial_sync_impl(crispin_client, local_uids,
                                   uid_download_stack)
        return 'poll'
Example #6
    def __init__(self, pool_name, pool_size, client_class,
                 close_client_handler, *client_args, **client_kwargs):
        assert pool_size > 0
        assert client_class is not None and hasattr(client_class, '__call__')
        assert close_client_handler is None or hasattr(close_client_handler,
                                                       '__call__')
        self._pool_name = pool_name
        self._pool_size = pool_size
        self._client_class = client_class
        self._close_client_handler = close_client_handler
        self._client_args = client_args
        self._client_kwargs = client_kwargs
        self._queue = LifoQueue(maxsize=pool_size)
        for i in range(pool_size):
            self._queue.put(ClientHolder())
        self._client_expire_time = self.DEFAULT_CLIENT_EXPIRE_TIME
        self._gc_task = ScheduleTask(
            name='ClientPool-GC-%s' % pool_name,
            start_after_seconds=0,
            interval_seconds=self.DEFAULT_CLOSE_EXPIRE_CLIENT_INTERVAL,
            handler=self._close_expire_client)
        self._gc_task.run()
Example #7
    def __init__(self, maxsize=100, maxwait=1.0, expires=None, cleanup=None):
        """
        The pool manages opened connections to the database. The main strategy is to keep the
        smallest number of alive connections required for good web service performance.
        In most cases connections are taken from the pool. During traffic peaks the pool creates
        some extra resources to keep the service from becoming unavailable. In times of low
        traffic (e.g. at night) unnecessary connections are released.

        Parameters
        ----------
        maxsize : int
                  Soft limit on the number of created connections. After reaching this limit,
                  taking the next connection first waits `maxwait` seconds for any returned slot.
        maxwait : float
                  The time in seconds to wait before creating a new connection after the pool
                  becomes empty. It may be 0, in which case new connections are created
                  immediately until `maxoverflow` is reached.
        expires : float
                  The time in seconds indicating how long a connection should stay alive.
                  It is also used to close unneeded slots.
        """
        if not isinstance(maxsize, integer_types):
            raise TypeError('Expected integer, got %r' % (maxsize, ))

        self._maxsize = maxsize
        self._maxwait = maxwait
        self._expires = expires
        self._cleanup = cleanup
        self._created_at = {}
        self._latest_use = {}
        self._pool = LifoQueue()
        self._size = 0
        self._latest_cleanup = 0 if self._expires or self._cleanup else 0xffffffffffffffff
        self._interval_cleanup = min(
            self._expires or self._cleanup, self._cleanup
            or self._expires) if self._expires or self._cleanup else 0
        self._cleanup_lock = Semaphore(value=1)
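
Using a LifoQueue rather than a FIFO queue matters for the expiry logic: the most recently returned connection is always reused first, so rarely needed connections sink to the bottom of the stack, age past `expires`, and can be closed by the periodic cleanup. A standalone sketch of that effect (assumed behaviour, not the class's actual cleanup code):

import time
from gevent.queue import LifoQueue

pool = LifoQueue()
for name in ('conn_a', 'conn_b', 'conn_c'):
    pool.put((name, time.time()))  # (connection, last-used timestamp)

# Steady traffic keeps popping and re-pushing the top of the stack, so
# 'conn_c' stays warm while 'conn_a' idles at the bottom until it expires.
conn, last_used = pool.get()
print(conn)  # conn_c
pool.put((conn, time.time()))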
Example #8
    def highestmodseq_callback(self, crispin_client, new_uids, updated_uids):
        uids = new_uids + updated_uids
        g_metadata = crispin_client.g_metadata(uids)
        to_download = self.__deduplicate_message_download(
            crispin_client, g_metadata, uids)

        if self.folder_name == crispin_client.folder_names()['inbox']:
            flags = crispin_client.flags(to_download)
            message_download_stack = LifoQueue()
            for uid in to_download:
                if uid in flags and uid in g_metadata:
                    # IMAP will just return no data for a UID if it's
                    # disappeared from the folder in the meantime.
                    message_download_stack.put(
                        GMessage(uid, g_metadata[uid], flags[uid].flags,
                                 flags[uid].labels))
            self.__download_queued_threads(crispin_client,
                                           message_download_stack)
        elif self.folder_name in uid_download_folders(crispin_client):
            uid_download_stack = uid_list_to_stack(to_download)
            self.download_uids(crispin_client, uid_download_stack)
        else:
            raise MailsyncError('Unknown Gmail sync folder: {}'.format(
                self.folder_name))
Example #9
def gmail_initial_sync(crispin_client, db_session, log, folder_name,
                       shared_state, local_uids, uid_download_stack):
    remote_g_metadata = get_g_metadata(crispin_client, db_session, log,
                                       folder_name, local_uids,
                                       shared_state['syncmanager_lock'])
    remote_uids = sorted(remote_g_metadata.keys(), key=int)
    log.info("Found {0} UIDs for folder {1}".format(len(remote_uids),
                                                    folder_name))
    if folder_name == crispin_client.folder_names()['all']:
        log.info("Already have {0} UIDs".format(len(local_uids)))

    with shared_state['syncmanager_lock']:
        log.debug("gmail_initial_sync grabbed syncmanager_lock")
        deleted_uids = remove_deleted_uids(
            crispin_client.account_id, db_session, log, folder_name,
            local_uids, remote_uids)
    local_uids = set(local_uids) - deleted_uids
    unknown_uids = set(remote_uids) - local_uids

    if folder_name == crispin_client.folder_names()['inbox']:
        # We don't do an initial dedupe for Inbox because we do thread
        # expansion, which means that even if we have a given msgid
        # downloaded, we might not have the whole thread. This means that
        # restarts cause duplicate work, but hopefully these folders aren't
        # too huge.
        message_download_stack = LifoQueue()
        flags = crispin_client.flags(unknown_uids)
        for uid in unknown_uids:
            if uid in flags:
                message_download_stack.put(
                    GMessage(uid, remote_g_metadata[uid], flags[uid].flags,
                             flags[uid].labels))
        new_uid_poller = spawn(check_new_g_thrids, crispin_client.account_id,
                               crispin_client.PROVIDER, folder_name, log,
                               message_download_stack,
                               shared_state['poll_frequency'],
                               shared_state['syncmanager_lock'])
        download_queued_threads(crispin_client, db_session, log, folder_name,
                                message_download_stack,
                                shared_state['status_cb'],
                                shared_state['syncmanager_lock'])
    elif folder_name in uid_download_folders(crispin_client):
        full_download = deduplicate_message_download(
            crispin_client, db_session, log, shared_state['syncmanager_lock'],
            remote_g_metadata, unknown_uids)

        add_uids_to_stack(full_download, uid_download_stack)
        new_uid_poller = spawn(check_new_uids, crispin_client.account_id,
                               crispin_client.PROVIDER, folder_name,
                               log, uid_download_stack,
                               shared_state['poll_frequency'],
                               shared_state['syncmanager_lock'])
        download_queued_uids(crispin_client, db_session, log, folder_name,
                             uid_download_stack, len(local_uids),
                             len(remote_uids), shared_state['status_cb'],
                             shared_state['syncmanager_lock'],
                             gmail_download_and_commit_uids,
                             create_gmail_message)
    else:
        raise MailsyncError(
            "Unknown Gmail sync folder: {}".format(folder_name))

    # Complete X-GM-MSGID mapping is no longer needed after initial sync.
    rm_cache(remote_g_metadata_cache_file(crispin_client.account_id,
                                          folder_name))

    new_uid_poller.kill()
Example #10
    def __init__(self):
        self._lifoqueue = LifoQueue()
Example #11
def uid_list_to_stack(uids):
    """ UID download function needs a stack even for polling. """
    uid_download_stack = LifoQueue()
    for uid in sorted(uids, key=int):
        uid_download_stack.put(uid)
    return uid_download_stack
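
Because the UIDs are pushed in ascending numeric order, popping the resulting stack yields them highest-first, which is exactly the newest-mail-first order the polling code wants. A quick standalone check:

from gevent.queue import LifoQueue

def uid_list_to_stack(uids):
    uid_download_stack = LifoQueue()
    for uid in sorted(uids, key=int):
        uid_download_stack.put(uid)
    return uid_download_stack

stack = uid_list_to_stack(['3', '10', '2'])
while not stack.empty():
    print(stack.get())  # prints 10, then 3, then 2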
Example #12
def gmail_initial_sync(crispin_client, log, folder_name, shared_state,
                       local_uids, uid_download_stack, msg_create_fn):
    remote_uid_count = len(set(crispin_client.all_uids()))
    remote_g_metadata, update_uid_count = get_g_metadata(
        crispin_client, log, folder_name, local_uids,
        shared_state['syncmanager_lock'])
    remote_uids = sorted(remote_g_metadata.keys(), key=int)
    log.info(remote_uid_count=len(remote_uids))
    if folder_name == crispin_client.folder_names()['all']:
        log.info(local_uid_count=len(local_uids))

    with shared_state['syncmanager_lock']:
        log.debug('gmail_initial_sync grabbed syncmanager_lock')
        with session_scope(ignore_soft_deletes=False) as db_session:
            deleted_uids = remove_deleted_uids(
                crispin_client.account_id, db_session, log, folder_name,
                local_uids, remote_uids)
            delete_uid_count = len(deleted_uids)

            local_uids = set(local_uids) - deleted_uids
            unknown_uids = set(remote_uids) - local_uids

            # Persist the number of messages to sync (any type of sync:
            # download, update or delete) before we start.
            # Note that num_local_deleted and num_local_updated ARE the
            # numbers to delete/update too, since we make those changes right
            # away before we start downloading messages.
            update_uid_counts(db_session, log, crispin_client.account_id,
                              folder_name, remote_uid_count=remote_uid_count,
                              download_uid_count=len(unknown_uids),
                              update_uid_count=update_uid_count,
                              delete_uid_count=delete_uid_count)

    if folder_name == crispin_client.folder_names()['inbox']:
        # We don't do an initial dedupe for Inbox because we do thread
        # expansion, which means that even if we have a given msgid
        # downloaded, we might not have the whole thread. This means that
        # restarts cause duplicate work, but hopefully these folders aren't
        # too huge.
        message_download_stack = LifoQueue()
        flags = crispin_client.flags(unknown_uids)
        for uid in unknown_uids:
            if uid in flags:
                message_download_stack.put(
                    GMessage(uid, remote_g_metadata[uid], flags[uid].flags,
                             flags[uid].labels))
        new_uid_poller = spawn(check_new_g_thrids, crispin_client.account_id,
                               crispin_client.PROVIDER, folder_name, log,
                               message_download_stack,
                               shared_state['poll_frequency'],
                               shared_state['syncmanager_lock'])
        download_queued_threads(crispin_client, log, folder_name,
                                message_download_stack,
                                shared_state['syncmanager_lock'])
    elif folder_name in uid_download_folders(crispin_client):
        full_download = deduplicate_message_download(
            crispin_client, log, shared_state['syncmanager_lock'],
            remote_g_metadata, unknown_uids)
        add_uids_to_stack(full_download, uid_download_stack)
        new_uid_poller = spawn(check_new_uids, crispin_client.account_id,
                               folder_name,
                               log, uid_download_stack,
                               shared_state['poll_frequency'],
                               shared_state['syncmanager_lock'])
        download_queued_uids(crispin_client, log, folder_name,
                             uid_download_stack, len(local_uids),
                             len(unknown_uids),
                             shared_state['syncmanager_lock'],
                             gmail_download_and_commit_uids, msg_create_fn)
    else:
        raise MailsyncError(
            'Unknown Gmail sync folder: {}'.format(folder_name))

    # Complete X-GM-MSGID mapping is no longer needed after initial sync.
    rm_cache(remote_g_metadata_cache_file(crispin_client.account_id,
                                          folder_name))

    new_uid_poller.kill()
Example #13
def _migrator_with_worker_pool(migrator, reindexer, iterable, max_retry,
                               num_workers):
    """Migrate in parallel with worker pool

    When running in steady state, failed doc will be retried up to the
    max retry limit. Documents awaiting retry and all documents that
    started the migration process but did not finish will be saved and
    retried on the next run if the migration is stopped before it
    completes.
    """
    def work_on(doc, key, retry_count):
        try:
            ok = migrator.migrate(doc)
            assert ok, "run_with_worker_pool expects success!"
        except Exception:
            err = traceback.format_exc().strip()
            print("Error processing blob:\n{}".format(err))
            if retry_count < max_retry:
                print("will retry {}".format(key))
                retry_blobs[key] += 1
                queue.put(doc)
                return
            migrator.save_backup(doc, "too many retries")
            print("too many retries {}".format(key))
        retry_blobs.pop(key, None)

    def retry_loop():
        for doc in queue:
            enqueue_doc(doc)

    def enqueue_doc(doc):
        key = reindexer.get_key(doc)
        retry_count = retry_blobs.setdefault(key, 0)
        # pool.spawn will block until a worker is available
        pool.spawn(work_on, doc, key, retry_count)
        # Returning True here means the underlying iterator will think
        # this doc has been processed successfully. Therefore we must
        # process this doc before the process exits or save it to be
        # processed on the next run.
        return True

    queue = LifoQueue()
    loop = gevent.spawn(retry_loop)
    pool = Pool(size=num_workers)

    class gmigrator:
        migrate = staticmethod(enqueue_doc)

    with migrator:
        retry_blobs = iterable.get_iterator_detail("retry_blobs") or {}
        for key in list(retry_blobs):
            queue.put(reindexer.load(key))
        try:
            yield gmigrator
        finally:
            try:
                print("waiting for workers to stop... (Ctrl+C to abort)")
                queue.put(StopIteration)
                loop.join()
                while not pool.join(timeout=10):
                    print("waiting for {} workers to stop...".format(
                        len(pool)))
            finally:
                iterable.set_iterator_detail("retry_blobs", retry_blobs)
                print("done.")
Example #14
 def load(self, ctx):
     self.models = ctx.get('models', {})
     self.backfills = {}
     self.user_updates = LifoQueue(maxsize=4096)
     super(SQLPlugin, self).load(ctx)
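
Unlike the unbounded queues in the other examples, this one is capped at 4096 entries, so `put` blocks (or raises if called with `block=False`) once the backlog is full, applying natural backpressure to producers. A tiny standalone check, assuming gevent's LifoQueue as elsewhere in these examples:

from gevent.queue import LifoQueue, Full

user_updates = LifoQueue(maxsize=2)
user_updates.put('update-1')
user_updates.put('update-2')
try:
    user_updates.put('update-3', block=False)  # queue is already full
except Full:
    print('backpressure: queue is full')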
Example #15
import scrapy
from redis import StrictRedis, ConnectionPool
from gevent.queue import LifoQueue
from gevent.pool import Pool
from urllib.parse import quote, unquote
import gevent.monkey
from pymongo import MongoClient
import time
import traceback
import json
import math
import argparse
from pprint import pprint

redis_queue = StrictRedis(connection_pool=ConnectionPool(host='127.0.0.1', port=6379, db=0, decode_responses=True))
task_queue = LifoQueue()

SEARCH_KEY = 'search_key'
SEARCH_HTML = 'search_html'
LIST_LINK = 'list_link'
LIST_HTML = 'list_html'
FIRST_COMMENT_LINK = 'first_comment_link'
FIRST_COMMENT_HTML = 'first_comment_html'
COMMENT_LINK = 'comment_link'
COMMENT_HTML = 'comment_html'
# MAX_PAGE = 'max_page'
IP_KEY = 'ip'


class JDCrawler(object):
    def __init__(self):
Example #16
    def init_tasks_queue(self, sub_domains):
        tasks_queue = LifoQueue()
        for sub_domain in sub_domains:
            tasks_queue.put(sub_domain)
        return tasks_queue
Example #17
    def initial_sync_impl(self, crispin_client, local_uids,
                          uid_download_stack):
        # We wrap the block in a try/finally because greenlets spawned here,
        # like new_uid_poller, need to be killed when this greenlet is
        # interrupted.
        new_uid_poller = None
        try:
            remote_uid_count = len(set(crispin_client.all_uids()))
            remote_g_metadata, update_uid_count = self.__fetch_g_metadata(
                crispin_client, local_uids)
            remote_uids = sorted(remote_g_metadata.keys(), key=int)
            log.info(remote_uid_count=len(remote_uids))
            if self.folder_name == crispin_client.folder_names()['all']:
                log.info(local_uid_count=len(local_uids))

            with self.syncmanager_lock:
                log.debug('gmail_initial_sync grabbed syncmanager_lock')
                with mailsync_session_scope() as db_session:
                    deleted_uids = self.remove_deleted_uids(
                        db_session, local_uids, remote_uids)
                    delete_uid_count = len(deleted_uids)

                    local_uids = set(local_uids) - deleted_uids
                    unknown_uids = set(remote_uids) - local_uids

                    # Persist the number of messages to sync (any type of
                    # sync: download, update or delete) before we start. Note
                    # that num_local_deleted and num_local_updated ARE the
                    # numbers to delete/update too, since we make those
                    # changes right away before we start downloading messages.
                    self.update_uid_counts(
                        db_session,
                        remote_uid_count=remote_uid_count,
                        download_uid_count=len(unknown_uids),
                        update_uid_count=update_uid_count,
                        delete_uid_count=delete_uid_count)

            if self.folder_name == crispin_client.folder_names()['inbox']:
                # We don't do an initial dedupe for Inbox because we do thread
                # expansion, which means that even if we have a given msgid
                # downloaded, we might not have the whole thread. This means
                # that restarts cause duplicate work, but hopefully these
                # folders aren't too huge.
                message_download_stack = LifoQueue()
                flags = crispin_client.flags(unknown_uids)
                for uid in unknown_uids:
                    if uid in flags:
                        message_download_stack.put(
                            GMessage(uid, remote_g_metadata[uid],
                                     flags[uid].flags, flags[uid].labels))
                new_uid_poller = spawn(self.__check_new_g_thrids,
                                       message_download_stack)
                self.__download_queued_threads(crispin_client,
                                               message_download_stack)
            elif self.folder_name in uid_download_folders(crispin_client):
                full_download = self.__deduplicate_message_download(
                    crispin_client, remote_g_metadata, unknown_uids)
                add_uids_to_stack(full_download, uid_download_stack)
                new_uid_poller = spawn(self.check_new_uids, uid_download_stack)
                self.download_uids(crispin_client, uid_download_stack)
            else:
                raise MailsyncError('Unknown Gmail sync folder: {}'.format(
                    self.folder_name))

            # Complete X-GM-MSGID mapping is no longer needed after initial
            # sync.
            rm_cache(
                remote_g_metadata_cache_file(self.account_id,
                                             self.folder_name))
        finally:
            # Guard against the spawn never having happened (e.g. an early
            # exception or an unknown folder name raising MailsyncError).
            if new_uid_poller is not None:
                new_uid_poller.kill()
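
The try/finally wrapper ensures the background poller never outlives the sync. Reduced to its essentials, the gevent pattern looks like this sketch (function names are illustrative):

import gevent

def poller():
    while True:
        # stand-in for checking the folder for new UIDs
        gevent.sleep(1)

poller_greenlet = gevent.spawn(poller)
try:
    gevent.sleep(0.1)  # stand-in for the main sync work
finally:
    # Reap the background greenlet even if the sync work raises or the
    # enclosing greenlet is killed.
    poller_greenlet.kill()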