def gmail_highestmodseq_update(crispin_client, log, folder_name, new_uids,
                               updated_uids, syncmanager_lock):
    """
    Download whatever is needed for UIDs reported as new or updated.

    The new and updated UIDs are combined, their Gmail metadata is fetched
    and the list is passed through `deduplicate_message_download`. How the
    remaining UIDs are downloaded depends on the folder:

    * inbox folder: messages are queued as `GMessage` entries and handed to
      `download_queued_threads`;
    * any folder in `uid_download_folders(crispin_client)`: plain UIDs are
      queued and handed to `download_queued_uids`.

    Raises `MailsyncError` for any other folder.
    """
    changed_uids = new_uids + updated_uids
    g_metadata = crispin_client.g_metadata(changed_uids)
    to_download = deduplicate_message_download(
        crispin_client, log, syncmanager_lock, g_metadata, changed_uids)

    if folder_name == crispin_client.folder_names()['inbox']:
        flags = crispin_client.flags(to_download)
        message_download_stack = LifoQueue()
        for uid in to_download:
            # IMAP will just return no data for a UID if it's disappeared
            # from the folder in the meantime, so skip anything we have no
            # flags or metadata for.
            if uid not in flags or uid not in g_metadata:
                continue
            uid_flags = flags[uid]
            message_download_stack.put(
                GMessage(uid, g_metadata[uid], uid_flags.flags,
                         uid_flags.labels))
        download_queued_threads(crispin_client, log, folder_name,
                                message_download_stack, syncmanager_lock)
        return

    if folder_name not in uid_download_folders(crispin_client):
        raise MailsyncError(
            'Unknown Gmail sync folder: {}'.format(folder_name))

    uid_download_stack = uid_list_to_stack(to_download)
    download_queued_uids(crispin_client, log, folder_name,
                         uid_download_stack, 0, uid_download_stack.qsize(),
                         syncmanager_lock, gmail_download_and_commit_uids,
                         create_gmail_message)
def base_initial_sync(crispin_client, log, folder_name, shared_state,
                      initial_sync_fn, msg_create_fn):
    """
    Download entire messages for one folder.

    Safe to retry as often as needed: each run picks up where the previous
    one stopped, deletes messages that were removed between restarts, and
    only completes once every UID in the folder is available locally.

    A secondary greenlet is also started to check for new messages
    periodically. Very large folders take a long time to sync, and it is a
    bad experience for the user to keep receiving old mail but no new mail.

    A LIFO queue is used so that the newest mail is downloaded first.

    Returns the string 'poll'.
    """
    log.info('starting initial sync')

    on_uidvalidity = uidvalidity_cb(crispin_client.account_id)
    crispin_client.select_folder(folder_name, on_uidvalidity)

    # Snapshot the UIDs we already have before handing over to the
    # provider-specific sync implementation.
    with session_scope(ignore_soft_deletes=False) as db_session:
        local_uids = account.all_uids(
            crispin_client.account_id, db_session, folder_name)

    uid_download_stack = LifoQueue()
    initial_sync_fn(crispin_client, log, folder_name, shared_state,
                    local_uids, uid_download_stack, msg_create_fn)
    return 'poll'
def __init__(self, size, host, port, db=0, passwd=None,
             socket_timeout=None):
    """
    Create `size` redis connections up front and place them in the pool.

    Args:
        size - Number of connections to maintain in the pool.
        host - The hostname to use for making connections.
        port - The port to use for making connections.
        db - The database number to connect to.
        passwd - The password to use for accessing the database.
        socket_timeout - The socket timeout value for connections.
    """
    self.size = size
    # `all` tracks every connection ever created; `pool` holds the ones
    # currently available, bounded to the pool size.
    self.all = set()
    self.pool = LifoQueue(maxsize=self.size)

    connection_options = dict(encoding='utf-8',
                              encoding_errors='strict',
                              parser_class=DefaultParser)
    for _ in xrange(self.size):
        conn = redis.Connection(host, port, db, passwd, socket_timeout,
                                **connection_options)
        self.all.add(conn)
        self.pool.put(conn)
def __init__(self, engine, *args, **kwargs):
    """
    Initialise processor state: identity, task queue, counters and the
    concurrency level.

    `engine` is stored as-is. The concurrency level comes from
    `dhaulagiri_settings['core']['concur']`, which is overwritten first if
    `--concur` is given on the command line.
    """
    from time import time
    from hashlib import md5
    from threading import Lock
    from gevent.queue import LifoQueue

    # Processor name is "<name>:<6 hex chars>", the suffix being derived
    # from the current timestamp.
    name_suffix = md5(str(time())).hexdigest()[:6]
    self.processor_name = '%s:%s' % (self.name, name_suffix)

    LoggerMixin.__init__(self)
    self.engine = engine

    self.__redis = None
    self.redis_lock = Lock()

    self.progress = 0
    self.total = 0
    # Count of bypassed (ignored) items
    self.bypassed_cnt = 0
    # Once this limit is exceeded, add_task pauses adding tasks to the queue
    self.maxsize = 1000
    self.tasks = LifoQueue()
    self.workers = []

    # The default polling interval is 1 second
    self.polling_interval = 1

    import argparse
    cli_parser = argparse.ArgumentParser()
    # Concurrency level
    cli_parser.add_argument('--concur', type=int)
    cli_options, _unparsed = cli_parser.parse_known_args()

    from core import dhaulagiri_settings
    if cli_options.concur:
        dhaulagiri_settings['core']['concur'] = cli_options.concur
    self.concur = dhaulagiri_settings['core']['concur']

    self.checkpoint_ts = None
    self.checkpoint_prog = None
    self.init_ts = time()

    # Heartbeat task
    self.heart_beat = None

    # Worker monitor. Each worker records a status update in this object
    # at the start of every loop iteration.
    self.worker_monitor = {}
def initial_sync(self):
    """
    Run the initial sync for `self.folder_name` on a pooled connection.

    Selects the folder, reads the locally known UIDs, then delegates to
    `self.initial_sync_impl` with a fresh LIFO download stack.

    Returns the string 'poll'.
    """
    with self.conn_pool.get() as crispin_client:
        on_uidvalidity = uidvalidity_cb(crispin_client.account_id)
        crispin_client.select_folder(self.folder_name, on_uidvalidity)

        with mailsync_session_scope() as db_session:
            local_uids = common.all_uids(
                crispin_client.account_id, db_session, self.folder_name)

        download_stack = LifoQueue()
        self.initial_sync_impl(crispin_client, local_uids, download_stack)
    return 'poll'
def __init__(self, pool_name, pool_size, client_class,
             close_client_handler, *client_args, **client_kwargs):
    """
    Build a fixed-size client pool and start its expiry task.

    pool_name            -- label used in the GC task name.
    pool_size            -- number of slots; must be > 0.
    client_class         -- callable stored for later use; must not be None.
    close_client_handler -- optional callable (may be None).
    client_args/kwargs   -- stored alongside `client_class`.

    The queue is pre-filled with `pool_size` empty `ClientHolder` slots.
    """
    assert pool_size > 0
    assert client_class is not None and hasattr(client_class, '__call__')
    assert (close_client_handler is None
            or hasattr(close_client_handler, '__call__'))

    self._pool_name = pool_name
    self._pool_size = pool_size
    self._client_class = client_class
    self._close_client_handler = close_client_handler
    self._client_args = client_args
    self._client_kwargs = client_kwargs

    self._queue = LifoQueue(maxsize=pool_size)
    for _ in range(pool_size):
        self._queue.put(ClientHolder())

    self._client_expire_time = self.DEFAULT_CLIENT_EXPIRE_TIME

    # Periodic task that invokes `_close_expire_client`; started immediately.
    gc_task_options = dict(
        name='ClientPool-GC-%s' % pool_name,
        start_after_seconds=0,
        interval_seconds=self.DEFAULT_CLOSE_EXPIRE_CLIENT_INTERVAL,
        handler=self._close_expire_client,
    )
    self._gc_task = ScheduleTask(**gc_task_options)
    self._gc_task.run()
def __init__(self, maxsize=100, maxwait=1.0, expires=None, cleanup=None):
    """
    The pool manages opened connections to the database.

    The main strategy is to keep the smallest number of live connections
    needed for good web service performance. In most cases connections are
    taken from the pool. During traffic peaks the pool creates extra
    connections so the service does not become unavailable. During low
    traffic (e.g. at night) unneeded connections are released.

    Parameters
    ----------
    maxsize : int
        Soft limit of the number of created connections. After reaching
        this limit, taking the next connection first waits `maxwait`
        seconds for a slot to be returned.
    maxwait : float
        Time in seconds to wait before creating a new connection once the
        pool is empty. May be 0, in which case connections are created
        immediately. (Earlier notes referred to a `maxoverflow` limit;
        this constructor has no such parameter.)
    expires : float
        Time in seconds a connection should stay alive. It is also used to
        close unneeded slots.
    cleanup : float
        NOTE(review): presumably a cleanup period in seconds -- confirm.
        Here it is only combined with `expires`: the cleanup interval is
        the smaller of whichever of the two are set.

    Raises
    ------
    TypeError
        If `maxsize` is not an integer.
    """
    if not isinstance(maxsize, integer_types):
        raise TypeError('Expected integer, got %r' % (maxsize, ))

    self._maxsize = maxsize
    self._maxwait = maxwait
    self._expires = expires
    self._cleanup = cleanup

    self._created_at = {}
    self._latest_use = {}
    self._pool = LifoQueue()
    self._size = 0

    if self._expires or self._cleanup:
        # At least one timing option is set: use the smaller of those set.
        self._latest_cleanup = 0
        self._interval_cleanup = min(
            self._expires or self._cleanup,
            self._cleanup or self._expires)
    else:
        # No timing options: mark the latest cleanup with the largest
        # 64-bit value and use a zero interval.
        self._latest_cleanup = 0xffffffffffffffff
        self._interval_cleanup = 0

    self._cleanup_lock = Semaphore(value=1)
def highestmodseq_callback(self, crispin_client, new_uids, updated_uids):
    """
    Download whatever is needed for UIDs reported as new or updated.

    The UIDs are combined, their Gmail metadata is fetched and the list is
    de-duplicated. For the inbox folder, messages are queued as `GMessage`
    entries and downloaded per thread. For any folder in
    `uid_download_folders(crispin_client)`, plain UIDs are downloaded.

    Raises `MailsyncError` for any other folder.
    """
    changed_uids = new_uids + updated_uids
    g_metadata = crispin_client.g_metadata(changed_uids)
    to_download = self.__deduplicate_message_download(
        crispin_client, g_metadata, changed_uids)

    if self.folder_name == crispin_client.folder_names()['inbox']:
        flags = crispin_client.flags(to_download)
        message_download_stack = LifoQueue()
        for uid in to_download:
            # IMAP will just return no data for a UID if it's
            # disappeared from the folder in the meantime.
            if uid not in flags or uid not in g_metadata:
                continue
            uid_flags = flags[uid]
            message_download_stack.put(
                GMessage(uid, g_metadata[uid], uid_flags.flags,
                         uid_flags.labels))
        self.__download_queued_threads(crispin_client,
                                       message_download_stack)
        return

    if self.folder_name not in uid_download_folders(crispin_client):
        raise MailsyncError('Unknown Gmail sync folder: {}'.format(
            self.folder_name))

    uid_download_stack = uid_list_to_stack(to_download)
    self.download_uids(crispin_client, uid_download_stack)
def gmail_initial_sync(crispin_client, db_session, log, folder_name,
                       shared_state, local_uids, uid_download_stack):
    """
    Run the Gmail-specific initial sync for one folder.

    Steps:
      1. Fetch the remote X-GM-MSGID metadata and derive the remote UIDs.
      2. Under the syncmanager lock, remove local UIDs that no longer
         exist remotely.
      3. Download the UIDs we do not have yet, per thread for the inbox
         and per UID for the folders in `uid_download_folders`. A poller
         greenlet runs alongside to pick up new mail.
      4. Remove the cached metadata file and kill the poller.

    `shared_state` must provide 'syncmanager_lock', 'poll_frequency' and
    'status_cb'.

    The poller is killed in a `finally` block, so it does not outlive this
    function when a download raises or the greenlet is interrupted.

    Raises `MailsyncError` if `folder_name` is not a known Gmail sync
    folder.
    """
    remote_g_metadata = get_g_metadata(crispin_client, db_session, log,
                                       folder_name, local_uids,
                                       shared_state['syncmanager_lock'])
    remote_uids = sorted(remote_g_metadata.keys(), key=int)
    log.info("Found {0} UIDs for folder {1}".format(len(remote_uids),
                                                    folder_name))
    if folder_name == crispin_client.folder_names()['all']:
        log.info("Already have {0} UIDs".format(len(local_uids)))

    with shared_state['syncmanager_lock']:
        log.debug("gmail_initial_sync grabbed syncmanager_lock")
        deleted_uids = remove_deleted_uids(
            crispin_client.account_id, db_session, log, folder_name,
            local_uids, remote_uids)
    local_uids = set(local_uids) - deleted_uids
    unknown_uids = set(remote_uids) - local_uids

    # Stays None until a poller is actually spawned, so the cleanup below
    # is safe on every path, including the unknown-folder error.
    new_uid_poller = None
    try:
        if folder_name == crispin_client.folder_names()['inbox']:
            # We don't do an initial dedupe for Inbox because we do thread
            # expansion, which means even if we have a given msgid
            # downloaded, we miiight not have the whole thread. This means
            # that restarts cause duplicate work, but hopefully these
            # folders aren't too huge.
            message_download_stack = LifoQueue()
            flags = crispin_client.flags(unknown_uids)
            for uid in unknown_uids:
                if uid in flags:
                    message_download_stack.put(
                        GMessage(uid, remote_g_metadata[uid],
                                 flags[uid].flags, flags[uid].labels))
            new_uid_poller = spawn(check_new_g_thrids,
                                   crispin_client.account_id,
                                   crispin_client.PROVIDER, folder_name,
                                   log, message_download_stack,
                                   shared_state['poll_frequency'],
                                   shared_state['syncmanager_lock'])
            download_queued_threads(crispin_client, db_session, log,
                                    folder_name, message_download_stack,
                                    shared_state['status_cb'],
                                    shared_state['syncmanager_lock'])
        elif folder_name in uid_download_folders(crispin_client):
            full_download = deduplicate_message_download(
                crispin_client, db_session, log,
                shared_state['syncmanager_lock'], remote_g_metadata,
                unknown_uids)
            add_uids_to_stack(full_download, uid_download_stack)
            new_uid_poller = spawn(check_new_uids,
                                   crispin_client.account_id,
                                   crispin_client.PROVIDER, folder_name,
                                   log, uid_download_stack,
                                   shared_state['poll_frequency'],
                                   shared_state['syncmanager_lock'])
            download_queued_uids(crispin_client, db_session, log,
                                 folder_name, uid_download_stack,
                                 len(local_uids), len(remote_uids),
                                 shared_state['status_cb'],
                                 shared_state['syncmanager_lock'],
                                 gmail_download_and_commit_uids,
                                 create_gmail_message)
        else:
            raise MailsyncError(
                "Unknown Gmail sync folder: {}".format(folder_name))

        # Complete X-GM-MSGID mapping is no longer needed after initial
        # sync.
        rm_cache(remote_g_metadata_cache_file(crispin_client.account_id,
                                              folder_name))
    finally:
        if new_uid_poller is not None:
            new_uid_poller.kill()
def __init__(self):
    """Create the wrapper with an empty, unbounded LIFO queue."""
    # No maxsize is passed, so the queue is created with its default bound.
    self._lifoqueue = LifoQueue()
def uid_list_to_stack(uids):
    """
    Build a LIFO stack of UIDs with the numerically largest on top.

    The UID download function needs a stack even for polling, so this
    wraps a plain list of UIDs in one. UIDs are pushed in ascending
    `int()` order and therefore pop off in descending order.
    """
    ascending = list(uids)
    ascending.sort(key=int)

    stack = LifoQueue()
    push = stack.put
    for uid in ascending:
        push(uid)
    return stack
def gmail_initial_sync(crispin_client, log, folder_name, shared_state,
                       local_uids, uid_download_stack, msg_create_fn):
    """
    Run the Gmail-specific initial sync for one folder.

    Steps:
      1. Count the remote UIDs and fetch the remote X-GM-MSGID metadata.
      2. Under the syncmanager lock, remove local UIDs that no longer
         exist remotely and persist the download/update/delete counts.
      3. Download the UIDs we do not have yet, per thread for the inbox
         and per UID for the folders in `uid_download_folders`. A poller
         greenlet runs alongside to pick up new mail.
      4. Remove the cached metadata file and kill the poller.

    `shared_state` must provide 'syncmanager_lock' and 'poll_frequency'.

    The poller is killed in a `finally` block, so it does not outlive this
    function when a download raises or the greenlet is interrupted.

    Raises `MailsyncError` if `folder_name` is not a known Gmail sync
    folder.
    """
    remote_uid_count = len(set(crispin_client.all_uids()))
    remote_g_metadata, update_uid_count = get_g_metadata(
        crispin_client, log, folder_name, local_uids,
        shared_state['syncmanager_lock'])
    remote_uids = sorted(remote_g_metadata.keys(), key=int)
    log.info(remote_uid_count=len(remote_uids))
    if folder_name == crispin_client.folder_names()['all']:
        log.info(local_uid_count=len(local_uids))

    with shared_state['syncmanager_lock']:
        log.debug('gmail_initial_sync grabbed syncmanager_lock')
        with session_scope(ignore_soft_deletes=False) as db_session:
            deleted_uids = remove_deleted_uids(
                crispin_client.account_id, db_session, log, folder_name,
                local_uids, remote_uids)
            delete_uid_count = len(deleted_uids)

            local_uids = set(local_uids) - deleted_uids
            unknown_uids = set(remote_uids) - local_uids

            # Persist the num(messages) to sync (any type of sync:
            # download, update or delete) before we start.
            # Note that num_local_deleted, num_local_updated ARE the
            # numbers to delete/update too since we make those changes
            # right away before we start downloading messages.
            update_uid_counts(db_session, log, crispin_client.account_id,
                              folder_name,
                              remote_uid_count=remote_uid_count,
                              download_uid_count=len(unknown_uids),
                              update_uid_count=update_uid_count,
                              delete_uid_count=delete_uid_count)

    # Stays None until a poller is actually spawned, so the cleanup below
    # is safe on every path, including the unknown-folder error.
    new_uid_poller = None
    try:
        if folder_name == crispin_client.folder_names()['inbox']:
            # We don't do an initial dedupe for Inbox because we do thread
            # expansion, which means even if we have a given msgid
            # downloaded, we miiight not have the whole thread. This means
            # that restarts cause duplicate work, but hopefully these
            # folders aren't too huge.
            message_download_stack = LifoQueue()
            flags = crispin_client.flags(unknown_uids)
            for uid in unknown_uids:
                if uid in flags:
                    message_download_stack.put(
                        GMessage(uid, remote_g_metadata[uid],
                                 flags[uid].flags, flags[uid].labels))
            new_uid_poller = spawn(check_new_g_thrids,
                                   crispin_client.account_id,
                                   crispin_client.PROVIDER, folder_name,
                                   log, message_download_stack,
                                   shared_state['poll_frequency'],
                                   shared_state['syncmanager_lock'])
            download_queued_threads(crispin_client, log, folder_name,
                                    message_download_stack,
                                    shared_state['syncmanager_lock'])
        elif folder_name in uid_download_folders(crispin_client):
            full_download = deduplicate_message_download(
                crispin_client, log, shared_state['syncmanager_lock'],
                remote_g_metadata, unknown_uids)
            add_uids_to_stack(full_download, uid_download_stack)
            new_uid_poller = spawn(check_new_uids,
                                   crispin_client.account_id, folder_name,
                                   log, uid_download_stack,
                                   shared_state['poll_frequency'],
                                   shared_state['syncmanager_lock'])
            download_queued_uids(crispin_client, log, folder_name,
                                 uid_download_stack, len(local_uids),
                                 len(unknown_uids),
                                 shared_state['syncmanager_lock'],
                                 gmail_download_and_commit_uids,
                                 msg_create_fn)
        else:
            raise MailsyncError(
                'Unknown Gmail sync folder: {}'.format(folder_name))

        # Complete X-GM-MSGID mapping is no longer needed after initial
        # sync.
        rm_cache(remote_g_metadata_cache_file(crispin_client.account_id,
                                              folder_name))
    finally:
        if new_uid_poller is not None:
            new_uid_poller.kill()
def _migrator_with_worker_pool(migrator, reindexer, iterable, max_retry, num_workers):
    """Migrate in parallel with worker pool

    When running in steady state, failed doc will be retried up to the
    max retry limit. Documents awaiting retry and all documents that
    started the migration process but did not finish will be saved and
    retried on the next run if the migration is stopped before it
    completes.

    This is a generator: it yields a single `gmigrator` object whose
    `migrate(doc)` schedules the doc on the worker pool and returns True
    immediately. When the generator is resumed or closed, it waits for the
    retry loop and workers to finish, then stores the outstanding
    `retry_blobs` mapping via `iterable.set_iterator_detail`.

    :param migrator: context manager providing `migrate(doc)` and
        `save_backup(doc, reason)`.
    :param reindexer: provides `get_key(doc)` and `load(key)`.
    :param iterable: provides `get_iterator_detail` /
        `set_iterator_detail`, used to persist `retry_blobs` across runs.
    :param max_retry: a failed doc is re-queued while its retry count is
        below this value.
    :param num_workers: size of the worker pool.
    """
    def work_on(doc, key, retry_count):
        # Runs inside a pool worker: one migration attempt for `doc`.
        try:
            ok = migrator.migrate(doc)
            # A falsy result is treated like a failure: the AssertionError
            # is caught by the handler below.
            assert ok, "run_with_worker_pool expects success!"
        except Exception:
            err = traceback.format_exc().strip()
            print("Error processing blob:\n{}".format(err))
            if retry_count < max_retry:
                print("will retry {}".format(key))
                retry_blobs[key] += 1
                queue.put(doc)
                # Keep the retry_blobs entry: the doc is still pending.
                return
            migrator.save_backup(doc, "too many retries")
            print("too many retries {}".format(key))
        # Reached on success or after giving up: the doc is no longer
        # pending.
        retry_blobs.pop(key, None)

    def retry_loop():
        # Feeds every doc put on `queue` (retries and docs carried over
        # from a previous run) back into the worker pool.
        for doc in queue:
            enqueue_doc(doc)

    def enqueue_doc(doc):
        key = reindexer.get_key(doc)
        # Registers the doc as pending (count 0) if it is not yet tracked.
        retry_count = retry_blobs.setdefault(key, 0)
        # pool.spawn will block until a worker is available
        pool.spawn(work_on, doc, key, retry_count)
        # Returning True here means the underlying iterator will think
        # this doc has been processed successfully. Therefore we must
        # process this doc before the process exits or save it to be
        # processed on the next run.
        return True

    queue = LifoQueue()
    # NOTE(review): the retry loop is spawned before `retry_blobs` and
    # `pool` are bound; this relies on the greenlet not running until this
    # function first yields control -- confirm.
    loop = gevent.spawn(retry_loop)
    pool = Pool(size=num_workers)

    class gmigrator:
        # Facade handed to the caller: same `migrate` name, but it
        # schedules work instead of doing it.
        migrate = staticmethod(enqueue_doc)

    with migrator:
        # Docs left pending by a previous run are re-queued first.
        retry_blobs = iterable.get_iterator_detail("retry_blobs") or {}
        for key in list(retry_blobs):
            queue.put(reindexer.load(key))

        try:
            yield gmigrator
        finally:
            try:
                print("waiting for workers to stop... (Ctrl+C to abort)")
                # Sentinel intended to end retry_loop's iteration over the
                # queue (presumably the gevent queue iteration convention
                # -- confirm against the gevent docs).
                queue.put(StopIteration)
                loop.join()
                while not pool.join(timeout=10):
                    print("waiting for {} workers to stop...".format(
                        len(pool)))
            finally:
                # Always persist whatever is still pending, even when the
                # wait above is aborted.
                iterable.set_iterator_detail("retry_blobs", retry_blobs)
                print("done.")
def load(self, ctx):
    """
    Initialise plugin state from `ctx`, then run the parent class's load.

    `models` is taken from `ctx['models']` (an empty dict when absent),
    `backfills` starts empty, and `user_updates` is a LIFO queue bounded
    to 4096 entries.
    """
    configured_models = ctx.get('models', {})
    self.models = configured_models
    self.backfills = {}
    self.user_updates = LifoQueue(maxsize=4096)
    super(SQLPlugin, self).load(ctx)
import scrapy from redis import StrictRedis, ConnectionPool from gevent.queue import LifoQueue from gevent.pool import Pool from urllib.parse import quote, unquote import gevent.monkey from pymongo import MongoClient import time import traceback import json import math import argparse from pprint import pprint redis_queue = StrictRedis(connection_pool=ConnectionPool(host='127.0.0.1', port=6379, db=0, decode_responses=True)) task_queue = LifoQueue() SEARCH_KEY = 'search_key' SEARCH_HTML = 'search_html' LIST_LINK = 'list_link' LIST_HTML = 'list_html' FIRST_COMMENT_LINK = 'first_comment_link' FIRST_COMMENT_HTML = 'first_comment_html' COMMENT_LINK = 'comment_link' COMMENT_HTML = 'comment_html' # MAX_PAGE = 'max_page' IP_KEY = 'ip' class JDCrawler(object): def __init__(self):
def init_tasks_queue(self, sub_domains):
    """
    Return a new LIFO queue holding every entry of `sub_domains`.

    Entries are pushed in iteration order, so the last sub-domain is the
    first one taken from the queue.
    """
    pending = LifoQueue()
    push = pending.put
    for name in sub_domains:
        push(name)
    return pending
def initial_sync_impl(self, crispin_client, local_uids, uid_download_stack):
    """
    Run the Gmail-specific initial sync for `self.folder_name`.

    Steps:
      1. Count the remote UIDs and fetch the remote X-GM-MSGID metadata.
      2. Under the syncmanager lock, remove local UIDs that no longer
         exist remotely and persist the download/update/delete counts.
      3. Download the UIDs we do not have yet, per thread for the inbox
         and per UID for the folders in `uid_download_folders`. A poller
         greenlet runs alongside to pick up new mail.
      4. Remove the cached metadata file.

    The poller, if one was started, is always killed on the way out.

    Raises `MailsyncError` if the folder is not a known Gmail sync folder.
    Any other exception from the steps above propagates unchanged.
    """
    # Stays None until a poller is actually spawned. Without this, a
    # failure before the spawn (or the unknown-folder error) would make
    # the `finally` clause raise UnboundLocalError and hide the real
    # exception.
    new_uid_poller = None
    # We wrap the block in a try/finally because the greenlets like
    # new_uid_poller need to be killed when this greenlet is interrupted
    try:
        remote_uid_count = len(set(crispin_client.all_uids()))
        remote_g_metadata, update_uid_count = self.__fetch_g_metadata(
            crispin_client, local_uids)
        remote_uids = sorted(remote_g_metadata.keys(), key=int)
        log.info(remote_uid_count=len(remote_uids))
        if self.folder_name == crispin_client.folder_names()['all']:
            log.info(local_uid_count=len(local_uids))

        with self.syncmanager_lock:
            log.debug('gmail_initial_sync grabbed syncmanager_lock')
            with mailsync_session_scope() as db_session:
                deleted_uids = self.remove_deleted_uids(
                    db_session, local_uids, remote_uids)
                delete_uid_count = len(deleted_uids)

                local_uids = set(local_uids) - deleted_uids
                unknown_uids = set(remote_uids) - local_uids

                # Persist the num(messages) to sync (any type of sync:
                # download, update or delete) before we start. Note that
                # num_local_deleted, num_local_updated ARE the numbers to
                # delete/update too since we make those changes right away
                # before we start downloading messages.
                self.update_uid_counts(
                    db_session, remote_uid_count=remote_uid_count,
                    download_uid_count=len(unknown_uids),
                    update_uid_count=update_uid_count,
                    delete_uid_count=delete_uid_count)

        if self.folder_name == crispin_client.folder_names()['inbox']:
            # We don't do an initial dedupe for Inbox because we do thread
            # expansion, which means even if we have a given msgid
            # downloaded, we miiight not have the whole thread. This means
            # that restarts cause duplicate work, but hopefully these
            # folders aren't too huge.
            message_download_stack = LifoQueue()
            flags = crispin_client.flags(unknown_uids)
            for uid in unknown_uids:
                if uid in flags:
                    message_download_stack.put(
                        GMessage(uid, remote_g_metadata[uid],
                                 flags[uid].flags, flags[uid].labels))
            new_uid_poller = spawn(self.__check_new_g_thrids,
                                   message_download_stack)
            self.__download_queued_threads(crispin_client,
                                           message_download_stack)
        elif self.folder_name in uid_download_folders(crispin_client):
            full_download = self.__deduplicate_message_download(
                crispin_client, remote_g_metadata, unknown_uids)
            add_uids_to_stack(full_download, uid_download_stack)
            new_uid_poller = spawn(self.check_new_uids, uid_download_stack)
            self.download_uids(crispin_client, uid_download_stack)
        else:
            raise MailsyncError('Unknown Gmail sync folder: {}'.format(
                self.folder_name))

        # Complete X-GM-MSGID mapping is no longer needed after initial
        # sync.
        rm_cache(
            remote_g_metadata_cache_file(self.account_id, self.folder_name))
    finally:
        if new_uid_poller is not None:
            new_uid_poller.kill()