def gmail_highestmodseq_update(crispin_client, log, folder_name, new_uids,
                               updated_uids, syncmanager_lock):
    """Download messages for the new and updated UIDs of one Gmail folder.

    Gmail metadata is fetched for every UID, the UIDs are filtered through
    deduplicate_message_download(), and the remainder is downloaded either
    thread-by-thread (inbox) or UID-by-UID (folders listed by
    uid_download_folders()).

    Raises:
        MailsyncError: if folder_name is neither the inbox nor one of the
            UID-download folders.
    """
    all_uids = new_uids + updated_uids
    g_metadata = crispin_client.g_metadata(all_uids)
    pending = deduplicate_message_download(
        crispin_client, log, syncmanager_lock, g_metadata, all_uids)

    if folder_name == crispin_client.folder_names()['inbox']:
        flags = crispin_client.flags(pending)
        thread_stack = LifoQueue()
        for uid in pending:
            # IMAP will just return no data for a UID if it's disappeared
            # from the folder in the meantime, so skip anything we did not
            # get both flags and metadata for.
            if uid not in flags or uid not in g_metadata:
                continue
            uid_flags = flags[uid]
            thread_stack.put(GMessage(
                uid, g_metadata[uid], uid_flags.flags, uid_flags.labels))
        download_queued_threads(crispin_client, log, folder_name,
                                thread_stack, syncmanager_lock)
        return

    if folder_name in uid_download_folders(crispin_client):
        uid_stack = uid_list_to_stack(pending)
        download_queued_uids(crispin_client, log, folder_name, uid_stack,
                             0, uid_stack.qsize(), syncmanager_lock,
                             gmail_download_and_commit_uids,
                             create_gmail_message)
        return

    raise MailsyncError(
        'Unknown Gmail sync folder: {}'.format(folder_name))
def gmail_highestmodseq_update(crispin_client, db_session, log, folder_name,
                               uids, local_uids, status_cb, syncmanager_lock):
    """Download messages for the given UIDs of one Gmail folder.

    Metadata is fetched for `uids` and filtered through
    deduplicate_message_download(). The 'all' folder is downloaded
    UID-by-UID; every other folder is downloaded thread-by-thread.

    `local_uids` is accepted but not used by this function.
    """
    g_metadata = crispin_client.g_metadata(uids)
    pending = deduplicate_message_download(
        crispin_client, db_session, log, syncmanager_lock, g_metadata, uids)

    if folder_name == crispin_client.folder_names()['all']:
        uid_stack = uid_list_to_stack(pending)
        download_queued_uids(crispin_client, db_session, log, folder_name,
                             uid_stack, 0, uid_stack.qsize(), status_cb,
                             syncmanager_lock,
                             gmail_download_and_commit_uids,
                             create_gmail_message)
        return

    flags = crispin_client.flags(pending)
    thread_stack = LifoQueue()
    for uid in pending:
        # IMAP will just return no data for a UID if it's disappeared
        # from the folder in the meantime, so skip anything we did not get
        # both flags and metadata for.
        if uid not in flags or uid not in g_metadata:
            continue
        uid_flags = flags[uid]
        thread_stack.put(GMessage(
            uid, g_metadata[uid], uid_flags.flags, uid_flags.labels))
    download_queued_threads(crispin_client, db_session, log, folder_name,
                            thread_stack, status_cb, syncmanager_lock)
class UIDStack(object):
    """Small LIFO stack built on gevent.queue.LifoQueue.

    Entries are (uid, metadata) tuples; metadata is allowed to be None.
    """

    def __init__(self):
        self._lifoqueue = LifoQueue()

    def empty(self):
        """Return True when the stack holds no entries."""
        return self._lifoqueue.empty()

    def get(self):
        """Pop and return the top entry without blocking."""
        return self._lifoqueue.get_nowait()

    def peek(self):
        """Return the top entry without removing it."""
        # This should be LifoQueue.peek_nowait(), which is currently buggy in
        # gevent. Can update with gevent version 1.0.2.
        backing = self._lifoqueue.queue
        return backing[-1]

    def put(self, uid, metadata):
        """Push the pair (uid, metadata) onto the stack."""
        entry = (uid, metadata)
        self._lifoqueue.put(entry)

    def discard(self, objects):
        """Remove every entry that is contained in `objects`."""
        survivors = []
        for entry in self._lifoqueue.queue:
            if entry in objects:
                continue
            survivors.append(entry)
        self._lifoqueue.queue = survivors

    def qsize(self):
        """Return the number of entries on the stack."""
        return self._lifoqueue.qsize()

    def __iter__(self):
        """Yield entries from the bottom of the stack to the top."""
        for entry in self._lifoqueue.queue:
            yield entry
class UIDStack(object):
    """Convenience stack of (uid, metadata) pairs over a gevent LifoQueue.

    The metadata half of a pair may be None.
    """

    def __init__(self):
        self._lifoqueue = LifoQueue()

    def empty(self):
        """Tell whether there is nothing left on the stack."""
        return self._lifoqueue.empty()

    def get(self):
        """Remove and return the most recently pushed pair (non-blocking)."""
        return self._lifoqueue.get_nowait()

    def peek(self):
        """Return the most recently pushed pair, leaving it in place."""
        # This should be LifoQueue.peek_nowait(), which is currently buggy in
        # gevent. Can update with gevent version 1.0.2.
        return self._lifoqueue.queue[-1]

    def put(self, uid, metadata):
        """Push (uid, metadata)."""
        pair = (uid, metadata)
        self._lifoqueue.put(pair)

    def discard(self, objects):
        """Drop all pairs that appear in `objects`, keeping the rest in order."""
        current = self._lifoqueue.queue
        kept = [pair for pair in current if not (pair in objects)]
        self._lifoqueue.queue = kept

    def qsize(self):
        """Return how many pairs are stacked."""
        return self._lifoqueue.qsize()

    def __iter__(self):
        """Iterate over the stacked pairs, oldest first."""
        for pair in self._lifoqueue.queue:
            yield pair
class RedisConnectionPool(object):
    """Fixed-size pool of Redis connections.

    Idle connections are kept in a gevent LifoQueue so that callers wait on
    the queue when every connection is checked out.
    """

    def __init__(self, size, host, port, db=0, passwd=None,
                 socket_timeout=None):
        """
        Args:
          size - Number of connections to maintain in the pool.
          host - The hostname to use for making connections.
          port - The port to use for making connections.
          db - The database number to connect to.
          passwd - The password to use for accessing the database.
          socket_timeout - The socket timeout value for connections.
        """
        self.size = size
        # Every connection ever created by this pool, checked out or not.
        self.all = set()
        # Connections currently available for checkout.
        self.pool = LifoQueue(maxsize=self.size)

        for _ in xrange(self.size):
            conn = redis.Connection(
                host, port, db, passwd, socket_timeout,
                encoding='utf-8',
                encoding_errors='strict',
                parser_class=DefaultParser)
            self.all.add(conn)
            self.pool.put(conn)

    def get_connection(self, command_name, *keys, **options):
        """Check a connection out of the pool.

        The arguments are ignored. The call waits on the queue (with a
        60 second timeout) when no connection is idle.
        """
        idle = self.pool.get(timeout=60)
        return idle

    def release(self, connection):
        """Hand a connection back to the pool.

        Raises ValueError if the connection was not created by this pool.
        """
        if connection in self.all:
            self.pool.put(connection)
            return
        raise ValueError()

    def disconnect(self):
        """Disconnect every connection owned by this pool."""
        for conn in self.all:
            conn.disconnect()
class RedisConnectionPool(object):
    """Redis connection pool backed by a gevent LifoQueue.

    All connections are created up front; callers that find the pool empty
    wait on the queue until one is released.
    """

    def __init__(self, size, host, port, db=0, passwd=None,
                 socket_timeout=None):
        """
        Args:
          size - Number of connections to maintain in the pool.
          host - The hostname to use for making connections.
          port - The port to use for making connections.
          db - The database number to connect to.
          passwd - The password to use for accessing the database.
          socket_timeout - The socket timeout value for connections.
        """
        self.size = size
        self.all = set()
        self.pool = LifoQueue(maxsize=self.size)

        # Pre-create `size` connections; each is tracked in `all` and made
        # available through `pool`.
        for _ in xrange(self.size):
            created = redis.Connection(
                host,
                port,
                db,
                passwd,
                socket_timeout,
                encoding='utf-8',
                encoding_errors='strict',
                parser_class=DefaultParser)
            self.all.add(created)
            self.pool.put(created)

    def get_connection(self, command_name, *keys, **options):
        """Take a connection from the pool.

        None of the arguments are used. If the pool is empty the call waits
        on the queue, using a timeout of 60 seconds.
        """
        return self.pool.get(timeout=60)

    def release(self, connection):
        """Put `connection` back; it must belong to this pool (else ValueError)."""
        owned = connection in self.all
        if not owned:
            raise ValueError()
        self.pool.put(connection)

    def disconnect(self):
        """Call disconnect() on each connection this pool created."""
        for member in self.all:
            member.disconnect()
class Stack(object):
    """Keyed LIFO stack layered over gevent.queue.LifoQueue.

    `key` is the sort key applied whenever several elements are added at
    once (at construction time or through update_from()).
    """

    def __init__(self, key, initial_elements=None):
        self.key = key
        self._lifoqueue = LifoQueue()
        if initial_elements is None:
            return
        # Seed the backing list directly, sorted ascending by key, so the
        # element with the largest key ends up on top.
        self._lifoqueue.queue = sorted(initial_elements, key=self.key)

    def empty(self):
        """Return True when the stack has no elements."""
        return self._lifoqueue.empty()

    def get(self):
        """Pop and return the top element without blocking."""
        return self._lifoqueue.get_nowait()

    def peek(self):
        """Return the top element without removing it."""
        # This should be LifoQueue.peek_nowait(), which is currently buggy in
        # gevent. Can update with gevent version 1.0.2.
        backing = self._lifoqueue.queue
        return backing[-1]

    def put(self, obj):
        """Push a single element."""
        self._lifoqueue.put(obj)

    def update_from(self, objects):
        """Push all of `objects`, in ascending key order."""
        ordered = sorted(objects, key=self.key)
        for element in ordered:
            self._lifoqueue.put(element)

    def discard(self, objects):
        """Remove every stacked element that is contained in `objects`."""
        remaining = []
        for element in self._lifoqueue.queue:
            if element not in objects:
                remaining.append(element)
        self._lifoqueue.queue = remaining

    def qsize(self):
        """Return the number of stacked elements."""
        return self._lifoqueue.qsize()

    def __iter__(self):
        """Yield elements from the bottom of the stack to the top."""
        for element in self._lifoqueue.queue:
            yield element
def highestmodseq_callback(self, crispin_client, new_uids, updated_uids):
    """Download messages for the new and updated UIDs of self.folder_name.

    The inbox is downloaded thread-by-thread; folders returned by
    uid_download_folders() are downloaded UID-by-UID. Any other folder
    raises MailsyncError.
    """
    all_uids = new_uids + updated_uids
    g_metadata = crispin_client.g_metadata(all_uids)
    pending = self.__deduplicate_message_download(
        crispin_client, g_metadata, all_uids)

    if self.folder_name == crispin_client.folder_names()['inbox']:
        flags = crispin_client.flags(pending)
        thread_stack = LifoQueue()
        for uid in pending:
            # IMAP will just return no data for a UID if it's
            # disappeared from the folder in the meantime.
            if uid not in flags or uid not in g_metadata:
                continue
            uid_flags = flags[uid]
            thread_stack.put(
                GMessage(uid, g_metadata[uid], uid_flags.flags,
                         uid_flags.labels))
        self.__download_queued_threads(crispin_client, thread_stack)
        return

    if self.folder_name in uid_download_folders(crispin_client):
        uid_stack = uid_list_to_stack(pending)
        self.download_uids(crispin_client, uid_stack)
        return

    raise MailsyncError('Unknown Gmail sync folder: {}'.format(
        self.folder_name))
def highestmodseq_callback(self, crispin_client, new_uids, updated_uids):
    """Fetch whatever is still missing among the new and updated UIDs.

    Dispatches on self.folder_name: thread download for the inbox, plain
    UID download for folders in uid_download_folders(), MailsyncError for
    anything else.
    """
    combined = new_uids + updated_uids
    g_metadata = crispin_client.g_metadata(combined)
    to_fetch = self.__deduplicate_message_download(
        crispin_client, g_metadata, combined)

    is_inbox = self.folder_name == crispin_client.folder_names()['inbox']
    if is_inbox:
        flags = crispin_client.flags(to_fetch)
        stack = LifoQueue()
        for uid in to_fetch:
            have_both = uid in flags and uid in g_metadata
            if have_both:
                # IMAP will just return no data for a UID if it's
                # disappeared from the folder in the meantime.
                entry = GMessage(uid, g_metadata[uid], flags[uid].flags,
                                 flags[uid].labels)
                stack.put(entry)
        self.__download_queued_threads(crispin_client, stack)
    elif self.folder_name in uid_download_folders(crispin_client):
        stack = uid_list_to_stack(to_fetch)
        self.download_uids(crispin_client, stack)
    else:
        message = 'Unknown Gmail sync folder: {}'.format(self.folder_name)
        raise MailsyncError(message)
class SQLPlugin(Plugin):
    """Global plugin that mirrors gateway events into the SQL models.

    Listeners persist messages, reactions, channels, emoji, voice sessions
    and user profile changes. The commands (all registered with level=-1)
    expose raw SQL execution, markov model management, backfill/recovery
    helpers and word-usage statistics.
    """
    global_plugin = True

    def load(self, ctx):
        """Restore markov models from `ctx` and set up per-load state."""
        self.models = ctx.get('models', {})
        self.backfills = {}
        # Pending (user_id, updates) pairs, drained by update_users().
        self.user_updates = LifoQueue(maxsize=4096)
        super(SQLPlugin, self).load(ctx)

    def unload(self, ctx):
        """Stash the markov models in `ctx` so a reload can pick them up."""
        ctx['models'] = self.models
        super(SQLPlugin, self).unload(ctx)

    @Plugin.schedule(15, init=False)
    def update_users(self):
        """Flush queued user updates to the database.

        Because the queue is LIFO, the newest update for a user is applied
        first; older queued updates for the same user are discarded.
        """
        already_updated = set()

        while True:
            # Only update so many at a time
            if len(already_updated) > 10000:
                return

            try:
                user_id, data = self.user_updates.get_nowait()
            except Empty:
                return

            if user_id in already_updated:
                continue

            already_updated.add(user_id)

            try:
                User.update(**data).where(User.user_id == user_id).execute()
            except Exception:
                # Catch Exception rather than everything: a bare except
                # would also swallow GreenletExit / KeyboardInterrupt and
                # keep this loop running when it is being killed.
                self.log.exception('Failed to update user %s: ', user_id)

    @Plugin.listen('VoiceStateUpdate', priority=Priority.BEFORE)
    def on_voice_state_update(self, event):
        # Runs before the state tracker updates, so the previous voice state
        # is still available for comparison.
        pre_state = self.state.voice_states.get(event.session_id)
        GuildVoiceSession.create_or_update(pre_state, event.state)

    @Plugin.listen('PresenceUpdate')
    def on_presence_update(self, event):
        """Queue any user profile fields that the presence update carries."""
        updates = {}

        if event.user.avatar != UNSET:
            updates['avatar'] = event.user.avatar

        if event.user.username != UNSET:
            updates['username'] = event.user.username

        if event.user.discriminator != UNSET:
            updates['discriminator'] = int(event.user.discriminator)

        if not updates:
            return

        self.user_updates.put((event.user.id, updates))

    @Plugin.listen('MessageCreate')
    def on_message_create(self, event):
        Message.from_disco_message(event.message)

    @Plugin.listen('MessageUpdate')
    def on_message_update(self, event):
        Message.from_disco_message_update(event.message)

    @Plugin.listen('MessageDelete')
    def on_message_delete(self, event):
        # Soft delete: the row is kept and flagged.
        Message.update(deleted=True).where(Message.id == event.id).execute()

    @Plugin.listen('MessageDeleteBulk')
    def on_message_delete_bulk(self, event):
        Message.update(deleted=True).where((Message.id << event.ids)).execute()

    @Plugin.listen('MessageReactionAdd', priority=Priority.BEFORE)
    def on_message_reaction_add(self, event):
        Reaction.from_disco_reaction(event)

    @Plugin.listen('MessageReactionRemove', priority=Priority.BEFORE)
    def on_message_reaction_remove(self, event):
        # Falsy emoji id/name values are normalised to None before matching.
        Reaction.delete().where(
            (Reaction.message_id == event.message_id) &
            (Reaction.user_id == event.user_id) &
            (Reaction.emoji_id == (event.emoji.id or None)) &
            (Reaction.emoji_name == (event.emoji.name or None))).execute()

    @Plugin.listen('MessageReactionRemoveAll')
    def on_message_reaction_remove_all(self, event):
        Reaction.delete().where((Reaction.message_id == event.message_id)).execute()

    @Plugin.listen('GuildEmojisUpdate', priority=Priority.BEFORE)
    def on_guild_emojis_update(self, event):
        """Upsert the guild's current emoji and flag all others as deleted."""
        ids = []

        for emoji in event.emojis:
            GuildEmoji.from_disco_guild_emoji(emoji, event.guild_id)
            ids.append(emoji.id)

        GuildEmoji.update(deleted=True).where(
            (GuildEmoji.guild_id == event.guild_id) &
            (~(GuildEmoji.emoji_id << ids))
        ).execute()

    @Plugin.listen('GuildCreate')
    def on_guild_create(self, event):
        for channel in list(event.channels.values()):
            Channel.from_disco_channel(channel)

        for emoji in list(event.emojis.values()):
            GuildEmoji.from_disco_guild_emoji(emoji, guild_id=event.guild.id)

    @Plugin.listen('GuildDelete')
    def on_guild_delete(self, event):
        # Channels are only flagged when event.deleted is set.
        if event.deleted:
            Channel.update(deleted=True).where(
                Channel.guild_id == event.id
            ).execute()

    @Plugin.listen('ChannelCreate')
    def on_channel_create(self, event):
        Channel.from_disco_channel(event.channel)

    @Plugin.listen('ChannelUpdate')
    def on_channel_update(self, event):
        Channel.from_disco_channel(event.channel)

    @Plugin.listen('ChannelDelete')
    def on_channel_delete(self, event):
        Channel.update(deleted=True).where(Channel.channel_id == event.channel.id).execute()

    @Plugin.command('sql', level=-1, global_=True)
    def command_sql(self, event):
        """Execute the command's codeblock as SQL and reply with the result.

        NOTE(review): the codeblock is executed verbatim (after
        str.format(e=event)); this relies entirely on the level=-1
        restriction of the command.
        """
        conn = database.obj.get_conn()

        try:
            tbl = MessageTable(codeblock=False)

            with conn.cursor() as cur:
                start = time.time()
                cur.execute(event.codeblock.format(e=event))
                dur = time.time() - start
                if not cur.description:
                    return event.msg.reply('_Query took {}ms - no result._'.format(int(dur * 1000)))
                tbl.set_header(*[desc[0] for desc in cur.description])

                for row in cur.fetchall():
                    tbl.add(*row)

                result = tbl.compile()
                # Results longer than 1900 characters are sent as a file
                # attachment instead of inline.
                if len(result) > 1900:
                    return event.msg.reply(
                        '_Query took {}ms_'.format(int(dur * 1000)),
                        attachments=[('sql_result_{}.txt'.format(event.msg.id), result)]
                    )

                event.msg.reply(u'```{}```_Query took {}ms_'.format(result, int(dur * 1000)))
        except psycopg2.Error as e:
            event.msg.reply(u'```{}```'.format(e.pgerror))

    @Plugin.command('init', '<entity:user|channel>', level=-1, group='markov', global_=True)
    def command_markov(self, event, entity):
        """Build a markov model from up to 500000 messages of a user/channel."""
        if isinstance(entity, DiscoUser):
            q = Message.select().where(Message.author_id == entity.id).limit(500000)
        else:
            q = Message.select().where(Message.channel_id == entity.id).limit(500000)

        text = [msg.content for msg in q]
        self.models[entity.id] = markovify.NewlineText('\n'.join(text))
        event.msg.reply(u':ok_hand: created markov model for {} using {} messages'.format(entity, len(text)))

    @Plugin.command('one', '<entity:user|channel>', level=-1, group='markov', global_=True)
    def command_markov_one(self, event, entity):
        """Reply with a single generated sentence for `entity`."""
        if entity.id not in self.models:
            return event.msg.reply(':warning: no model created yet for {}'.format(entity))

        sentence = self.models[entity.id].make_sentence(max_overlap_ratio=1, max_overlap_total=500)
        if not sentence:
            event.msg.reply(':warning: not enough data :(')
            return
        event.msg.reply(u'{}: {}'.format(entity, sentence))

    @Plugin.command('many', '<entity:user|channel> [count:int]', level=-1, group='markov', global_=True)
    def command_markov_many(self, event, entity, count=5):
        """Reply with `count` generated sentences, stopping at the first failure."""
        if entity.id not in self.models:
            return event.msg.reply(':warning: no model created yet for {}'.format(entity))

        for _ in range(int(count)):
            sentence = self.models[entity.id].make_sentence(max_overlap_total=500)
            if not sentence:
                event.msg.reply(':warning: not enough data :(')
                return
            event.msg.reply(u'{}: {}'.format(entity, sentence))

    @Plugin.command('list', level=-1, group='markov', global_=True)
    def command_markov_list(self, event):
        event.msg.reply(u'`{}`'.format(', '.join(map(str, self.models.keys()))))

    @Plugin.command('delete', '<oid:snowflake>', level=-1, group='markov', global_=True)
    def command_markov_delete(self, event, oid):
        if oid not in self.models:
            return event.msg.reply(':warning: no model with that ID')

        del self.models[oid]
        event.msg.reply(':ok_hand: deleted model')

    @Plugin.command('clear', level=-1, group='markov', global_=True)
    def command_markov_clear(self, event):
        self.models = {}
        event.msg.reply(':ok_hand: cleared models')

    @Plugin.command('message', '<channel:snowflake> <message:snowflake>', level=-1, group='backfill', global_=True)
    def command_backfill_message(self, event, channel, message):
        """Fetch one message from the API and store it."""
        channel = self.state.channels.get(channel)
        Message.from_disco_message(channel.get_message(message))
        return event.msg.reply(':ok_hand: backfilled')

    @Plugin.command('reactions', '<message:snowflake>', level=-1, group='backfill', global_=True)
    def command_sql_reactions(self, event, message):
        """Re-fetch a stored message and persist all of its reactors."""
        try:
            message = Message.get(id=message)
        except Message.DoesNotExist:
            return event.msg.reply(':warning: no message found')

        message = self.state.channels.get(message.channel_id).get_message(message.id)
        for reaction in message.reactions:
            for users in message.get_reactors(reaction.emoji, bulk=True):
                Reaction.from_disco_reactors(message.id, reaction, (i.id for i in users))

    @Plugin.command('global', '<duration:str> [pool:int]', level=-1, global_=True, context={'mode': 'global'}, group='recover')
    @Plugin.command('here', '<duration:str> [pool:int]', level=-1, global_=True, context={'mode': 'here'}, group='recover')
    def command_recover(self, event, duration, pool=4, mode=None):
        """Run a Recovery over every channel (globally or for this guild).

        `pool` is the number of concurrent workers; a status message is
        edited periodically while the recoveries run.
        """
        if mode == 'global':
            channels = list(self.state.channels.values())
        else:
            channels = list(event.guild.channels.values())

        start_at = parse_duration(duration, negative=True)

        # From here on `pool` is the worker pool, not the requested size.
        pool = Pool(pool)

        total = len(channels)
        msg = event.msg.reply('Recovery Status: 0/{}'.format(total))
        recoveries = []

        def updater():
            # Edit the status message every 5 seconds, but only when the
            # number of started recoveries has changed.
            last = len(recoveries)

            while True:
                if last != len(recoveries):
                    last = len(recoveries)
                    msg.edit('Recovery Status: {}/{}'.format(len(recoveries), total))
                gevent.sleep(5)

        u = self.spawn(updater)

        try:
            for channel in channels:
                pool.wait_available()
                r = Recovery(self.log, channel, start_at)
                pool.spawn(r.run)
                recoveries.append(r)
        finally:
            # Always wait for in-flight workers and stop the status updater.
            pool.join()
            u.kill()

        msg.edit('RECOVERY COMPLETED ({} total messages)'.format(
            sum([i._recovered for i in recoveries])
        ))

    @Plugin.command('backfill channel', '[channel:snowflake]', level=-1, global_=True)
    def command_backfill_channel(self, event, channel=None):
        channel = self.state.channels.get(channel) if channel else event.channel
        backfill_channel.queue(channel.id)
        event.msg.reply(':ok_hand: enqueued channel to be backfilled')

    @Plugin.command('backfill guild', '[guild:guild] [concurrency:int]', level=-1, global_=True)
    def command_backfill_guild(self, event, guild=None, concurrency=1):
        # NOTE(review): `concurrency` is parsed but never used here.
        guild = guild or event.guild
        backfill_guild.queue(guild.id)
        event.msg.reply(':ok_hand: enqueued guild to be backfilled')

    @Plugin.command('usage', '<word:str> [unit:str] [amount:int]', level=-1, group='words')
    def words_usage(self, event, word, unit='days', amount=7):
        """Chart how often `word` was used in this guild over a time window."""
        sql = '''
            SELECT date, coalesce(count, 0) AS count
            FROM
                generate_series(
                    NOW() - interval %s,
                    NOW(),
                    %s
                ) AS date
            LEFT OUTER JOIN (
                SELECT date_trunc(%s, timestamp) AS dt, count(*) AS count
                FROM messages
                WHERE
                    timestamp >= (NOW() - interval %s) AND
                    timestamp < (NOW()) AND
                    guild_id=%s AND
                    (SELECT count(*) FROM regexp_matches(content, %s)) >= 1
                GROUP BY dt
            ) results
            ON (date_trunc(%s, date) = results.dt);
        '''

        msg = event.msg.reply(':alarm_clock: One moment pls...')

        start = time.time()
        tuples = list(Message.raw(
            sql,
            '{} {}'.format(amount, unit),
            '1 {}'.format(unit),
            unit,
            '{} {}'.format(amount, unit),
            event.guild.id,
            # Raw string: `\s` is a regex class, not a Python escape.
            r'\s?{}\s?'.format(word),
            unit
        ).tuples())
        sql_duration = time.time() - start

        start = time.time()
        chart = pygal.Line()
        chart.title = 'Usage of {} Over {} {}'.format(
            word, amount, unit,
        )

        if unit == 'days':
            chart.x_labels = [i[0].strftime('%a %d') for i in tuples]
        elif unit == 'minutes':
            chart.x_labels = [i[0].strftime('%X') for i in tuples]
        else:
            chart.x_labels = [i[0].strftime('%x %X') for i in tuples]

        # NOTE(review): this unconditional assignment overrides the formatted
        # labels chosen just above, making that branch dead work. Kept as-is
        # to preserve the current chart output; confirm which was intended.
        chart.x_labels = [i[0] for i in tuples]
        chart.add(word, [i[1] for i in tuples])

        pngdata = cairosvg.svg2png(
            bytestring=chart.render(),
            dpi=72)
        chart_duration = time.time() - start

        event.msg.reply(
            '_SQL: {}ms_ - _Chart: {}ms_'.format(
                int(sql_duration * 1000),
                int(chart_duration * 1000),
            ),
            attachments=[('chart.png', pngdata)])
        msg.delete()

    @Plugin.command('top', '<target:user|channel|guild>', level=-1, group='words')
    def words_top(self, event, target):
        """Reply with the 30 most frequent words for a user, channel or guild."""
        if isinstance(target, DiscoUser):
            q = 'author_id'
        elif isinstance(target, DiscoChannel):
            q = 'channel_id'
        elif isinstance(target, DiscoGuild):
            q = 'guild_id'
        else:
            raise Exception("You should not be here")

        # Only the column name (one of the three constants above) is
        # formatted into the statement; the id is passed as a parameter.
        # Raw string so that `\s` reaches the database unchanged.
        sql = r"""
            SELECT word, count(*)
            FROM (
                SELECT regexp_split_to_table(content, '\s') as word
                FROM messages
                WHERE {}=%s
                LIMIT 3000000
            ) t
            GROUP BY word
            ORDER BY 2 DESC
            LIMIT 30
        """.format(q)

        t = MessageTable()
        t.set_header('Word', 'Count')

        for word, count in Message.raw(sql, target.id).tuples():
            # Skip words that would break out of the reply's code block.
            if '```' in word:
                continue
            t.add(word, count)

        event.msg.reply(t.compile())
def _migrator_with_worker_pool(migrator, reindexer, iterable, max_retry,
                               num_workers):
    """Migrate in parallel with worker pool

    When running in steady state, failed doc will be retried up to the
    max retry limit. Documents awaiting retry and all documents that
    started the migration process but did not finish will be saved and
    retried on the next run if the migration is stopped before it
    completes.

    This is a generator that yields exactly once: the yielded object
    exposes a `migrate(doc)` callable that schedules `doc` on the worker
    pool and always returns True. Per-key retry counts are loaded from,
    and written back to, the iterable's "retry_blobs" iterator detail.
    NOTE(review): presumably wrapped by a context-manager decorator that is
    outside this view -- confirm.
    """

    def work_on(doc, key, retry_count):
        # Runs inside a pool worker. `retry_count` is the count recorded at
        # the time the doc was enqueued.
        try:
            ok = migrator.migrate(doc)
            assert ok, "run_with_worker_pool expects success!"
        except Exception:
            err = traceback.format_exc().strip()
            print("Error processing blob:\n{}".format(err))
            if retry_count < max_retry:
                print("will retry {}".format(key))
                retry_blobs[key] += 1
                # Hand the doc to retry_loop; the key stays in retry_blobs
                # so it is persisted if the process stops first.
                queue.put(doc)
                return
            migrator.save_backup(doc, "too many retries")
            print("too many retries {}".format(key))
        # Reached on success and after giving up: stop tracking this key.
        retry_blobs.pop(key, None)

    def retry_loop():
        # Re-enqueue docs as they arrive on the retry queue. Iteration ends
        # once the StopIteration sentinel put in the cleanup path below is
        # taken off the queue.
        for doc in queue:
            enqueue_doc(doc)

    def enqueue_doc(doc):
        key = reindexer.get_key(doc)
        # Registers the key with a zero count on first sight; keeps the
        # existing count on retries.
        retry_count = retry_blobs.setdefault(key, 0)
        # pool.spawn will block until a worker is available
        pool.spawn(work_on, doc, key, retry_count)
        # Returning True here means the underlying iterator will think
        # this doc has been processed successfully. Therefore we must
        # process this doc before the process exits or save it to be
        # processed on the next run.
        return True

    queue = LifoQueue()
    # The closures above resolve `pool` and `retry_blobs` when they are
    # called, so both are assigned after the retry loop has been spawned.
    loop = gevent.spawn(retry_loop)
    pool = Pool(size=num_workers)

    class gmigrator:
        migrate = staticmethod(enqueue_doc)

    with migrator:
        retry_blobs = iterable.get_iterator_detail("retry_blobs") or {}
        # Re-queue everything left over from a previous run.
        for key in list(retry_blobs):
            queue.put(reindexer.load(key))
        try:
            yield gmigrator
        finally:
            try:
                print("waiting for workers to stop... (Ctrl+C to abort)")
                queue.put(StopIteration)
                loop.join()
                while not pool.join(timeout=10):
                    print("waiting for {} workers to stop...".format(
                        len(pool)))
            finally:
                # Persist whatever is still pending, even if the wait above
                # was interrupted.
                iterable.set_iterator_detail("retry_blobs", retry_blobs)
                print("done.")
def gmail_initial_sync(crispin_client, db_session, log, folder_name,
                       shared_state, local_uids, uid_download_stack):
    """Run the initial sync of one Gmail folder.

    Steps, in order:
      1. Fetch the remote UID -> metadata mapping via get_g_metadata().
      2. Under the syncmanager lock, remove local UIDs that no longer
         exist remotely.
      3. Spawn a poller greenlet and download every UID that is not yet
         known locally: thread-by-thread for the inbox, UID-by-UID for the
         folders returned by uid_download_folders().
      4. Remove the cached metadata file and kill the poller.

    Args:
        shared_state: mapping that must provide 'syncmanager_lock',
            'poll_frequency' and 'status_cb'.
        uid_download_stack: stack that UID-download folders are queued on.

    Raises:
        MailsyncError: if folder_name is not a known Gmail sync folder.
    """
    remote_g_metadata = get_g_metadata(crispin_client, db_session, log,
                                       folder_name, local_uids,
                                       shared_state['syncmanager_lock'])
    remote_uids = sorted(remote_g_metadata.keys(), key=int)
    log.info("Found {0} UIDs for folder {1}".format(len(remote_uids),
                                                    folder_name))
    if folder_name == crispin_client.folder_names()['all']:
        log.info("Already have {0} UIDs".format(len(local_uids)))

    with shared_state['syncmanager_lock']:
        log.debug("gmail_initial_sync grabbed syncmanager_lock")
        deleted_uids = remove_deleted_uids(
            crispin_client.account_id, db_session, log, folder_name,
            local_uids, remote_uids)
    local_uids = set(local_uids) - deleted_uids
    unknown_uids = set(remote_uids) - local_uids

    new_uid_poller = None
    try:
        if folder_name == crispin_client.folder_names()['inbox']:
            # We don't do an initial dedupe for Inbox because we do thread
            # expansion, which means even if we have a given msgid
            # downloaded, we miiight not have the whole thread. This means
            # that restarts cause duplicate work, but hopefully these
            # folders aren't too huge.
            message_download_stack = LifoQueue()
            flags = crispin_client.flags(unknown_uids)
            for uid in unknown_uids:
                if uid in flags:
                    message_download_stack.put(
                        GMessage(uid, remote_g_metadata[uid],
                                 flags[uid].flags, flags[uid].labels))
            new_uid_poller = spawn(check_new_g_thrids,
                                   crispin_client.account_id,
                                   crispin_client.PROVIDER, folder_name, log,
                                   message_download_stack,
                                   shared_state['poll_frequency'],
                                   shared_state['syncmanager_lock'])
            download_queued_threads(crispin_client, db_session, log,
                                    folder_name, message_download_stack,
                                    shared_state['status_cb'],
                                    shared_state['syncmanager_lock'])
        elif folder_name in uid_download_folders(crispin_client):
            full_download = deduplicate_message_download(
                crispin_client, db_session, log,
                shared_state['syncmanager_lock'], remote_g_metadata,
                unknown_uids)
            add_uids_to_stack(full_download, uid_download_stack)
            new_uid_poller = spawn(check_new_uids, crispin_client.account_id,
                                   crispin_client.PROVIDER, folder_name, log,
                                   uid_download_stack,
                                   shared_state['poll_frequency'],
                                   shared_state['syncmanager_lock'])
            download_queued_uids(crispin_client, db_session, log,
                                 folder_name, uid_download_stack,
                                 len(local_uids), len(remote_uids),
                                 shared_state['status_cb'],
                                 shared_state['syncmanager_lock'],
                                 gmail_download_and_commit_uids,
                                 create_gmail_message)
        else:
            raise MailsyncError(
                "Unknown Gmail sync folder: {}".format(folder_name))

        # Complete X-GM-MSGID mapping is no longer needed after initial sync.
        rm_cache(remote_g_metadata_cache_file(crispin_client.account_id,
                                              folder_name))
    finally:
        # Kill the poller even when the download raised; otherwise the
        # greenlet would outlive this call and keep feeding a stack that
        # nobody consumes any more.
        if new_uid_poller is not None:
            new_uid_poller.kill()
class BlockingSentinelMasterGeventConnectionPool(object):
    """Blocking, Sentinel enabled Redis connection pool.

    We use this instead of the built-in connection pool in redis-py, because
    the built-in one for sentinel does not support a blocking implementation,
    which Taba relies on. (There is a blocking connection pool in redis-py,
    but it is incompatible with sentinel connections).
    """

    # Timeout, in seconds, when trying to retrieve a connection from the
    # redis connection pool. This is set to infinite (i.e. a worker will wait
    # indefinitely for a connection to become available). Any actual remote
    # failure should be caught and surfaced by the socket timeout.
    GET_CONNECTION_TIMEOUT = None

    def __init__(self, service_name, sentinel_manager, pool_size=8,
                 tab_prefix='redis_bsmg_pool',
                 connection_class=SentinelManagedConnection,
                 connection_kwargs=None,
                 sentinel_check_connections=False,
                 is_master=True):
        """
        Args:
          service_name - Name of the Sentinel service name to connect to.
          sentinel_manager - Sentinel manager object.
          pool_size - Number of connections to maintain in the pool.
          tab_prefix - Tab name prefix for Tabs recorded by this class.
          connection_class - Class to use for creating connections. Must be a
              sub-class of (or be API compatible with)
              SentinelManagedConnection.
          connection_kwargs - Keyword arguments to pass through to connection
              constructor. Defaults to no extra arguments.
          sentinel_check_connections - Whether to enable Sentinel connection
              checking on establishing each connection.
          is_master - Always True. Included to match SentinelConnectionPool
              API.
        """
        # Use a fresh dict per pool instead of a shared mutable default.
        if connection_kwargs is None:
            connection_kwargs = {}

        self.service_name = service_name
        self.sentinel_manager = sentinel_manager
        self.pool_size = pool_size
        self.tab_prefix = tab_prefix

        self.conn_class = connection_class
        self.conn_kwargs = connection_kwargs

        # Sentinel connection pool API member variables.
        self.is_master = is_master
        self.check_connection = sentinel_check_connections
        self.master_address = None

        # Actual pool containers.
        self.closed = False
        self.all = set()
        self.pool = LifoQueue(maxsize=self.pool_size)

        # Initialize the pool. Each connection gets a weak proxy back to
        # this pool so the pool and its connections do not keep each other
        # alive.
        for _ in xrange(self.pool_size):
            conn = self.conn_class(
                connection_pool=weakref.proxy(self),
                **connection_kwargs)
            self.all.add(conn)
            self.pool.put(conn)

    def __repr__(self):
        # The attributes are stored as conn_class / conn_kwargs; the
        # previous spelling raised AttributeError.
        return "%s<%s|%s>" % (
            type(self).__name__,
            self.conn_class.__name__,
            self.conn_kwargs)

    #########################################################
    # Connection Pool API Methods
    #########################################################

    def get_connection(self, command_name, *keys, **options):
        """Get a connection from the pool.

        Args:
          Ignored. Included to match ConnectionPool API.

        Raises:
          Empty - If the pool has been shut down, or no connection became
              available within GET_CONNECTION_TIMEOUT.
        """
        if self.closed:
            raise Empty()

        try:
            return self.pool.get(timeout=self.GET_CONNECTION_TIMEOUT)
        except Empty:
            client.Counter(
                self.tab_prefix + '_redis_conn_pool_get_conn_timeout')
            # The pool has no host/port of its own (those belong to the
            # connections), so identify it by its Sentinel service name.
            LOG.error(
                'Cannot get connection for service %s' % (self.service_name,))
            raise

    def release(self, connection):
        """Releases the connection back to the pool

        Args:
          connection - Connection to put back in the pool. Must have been
              initially taken from this pool.

        Raises:
          ValueError - If the connection does not belong to this pool.
        """
        if connection not in self.all:
            raise ValueError()

        self.pool.put(connection)

    def disconnect(self):
        """Disconnects all connections in the pool."""
        for conn in self.all:
            conn.disconnect()

    def shutdown(self):
        """Close the pool and disconnect all connections.
        """
        self.closed = True

        try:
            # Wait for all the connections to finish and get returned to the
            # pool.
            def _wait_ready():
                while not self.pool.full():
                    time.sleep(0.5)

            thread_util.PerformOperationWithTimeout(30, _wait_ready)

        except Exception as e:
            LOG.error(e)

        finally:
            # Disconnect anyway.
            self.disconnect()

    #########################################################
    # Sentinel Pool API Methods
    #########################################################

    def get_master_address(self):
        """SentinelConnectionPool API compatibility. Get the connection
        information to the service master.

        Returns:
          Tuple of (Master Hostname, Master Port)
        """
        master_address = self.sentinel_manager.discover_master(
            self.service_name)

        if self.master_address is None:
            self.master_address = master_address

        elif master_address != self.master_address:
            # Master address changed. Remember the new master and reset all
            # connections; without recording it, every later call would
            # see a "change" and disconnect the pool again.
            self.master_address = master_address
            self.disconnect()

        return master_address

    def rotate_slaves(self):
        """SentinelConnectionPool API compatibility. Not implemented. """
        pass
def _migrator_with_worker_pool(migrator, reindexer, iterable, max_retry,
                               num_workers):
    """Migrate in parallel with worker pool

    When running in steady state, failed doc will be retried up to the
    max retry limit. Documents awaiting retry and all documents that
    started the migration process but did not finish will be saved and
    retried on the next run if the migration is stopped before it
    completes.

    The function is a generator with a single yield. What it yields is an
    object whose `migrate(doc)` schedules the doc on a gevent worker pool
    and returns True immediately. Retry counts per key live in the
    iterable's "retry_blobs" iterator detail, which is read at start and
    written back at the end.
    NOTE(review): looks like this is meant to be used as a context
    manager via a decorator not visible here -- confirm.
    """

    def work_on(doc, key, retry_count):
        # Executed by a pool worker; `retry_count` was captured when the
        # doc was enqueued.
        try:
            ok = migrator.migrate(doc)
            assert ok, "run_with_worker_pool expects success!"
        except Exception:
            err = traceback.format_exc().strip()
            print("Error processing blob:\n{}".format(err))
            if retry_count < max_retry:
                print("will retry {}".format(key))
                retry_blobs[key] += 1
                # Leave the key in retry_blobs and let retry_loop pick the
                # doc up again.
                queue.put(doc)
                return
            migrator.save_backup(doc, "too many retries")
            print("too many retries {}".format(key))
        # Success, or retries exhausted: forget the key.
        retry_blobs.pop(key, None)

    def retry_loop():
        # Feeds queued docs back into the pool until the StopIteration
        # sentinel (put by the cleanup code below) is consumed.
        for doc in queue:
            enqueue_doc(doc)

    def enqueue_doc(doc):
        key = reindexer.get_key(doc)
        # First sighting records a count of 0; a retry keeps its count.
        retry_count = retry_blobs.setdefault(key, 0)
        # pool.spawn will block until a worker is available
        pool.spawn(work_on, doc, key, retry_count)
        # Returning True here means the underlying iterator will think
        # this doc has been processed successfully. Therefore we must
        # process this doc before the process exits or save it to be
        # processed on the next run.
        return True

    queue = LifoQueue()
    # `pool` and `retry_blobs` are looked up by the closures at call time,
    # which is why they can be bound after retry_loop is spawned.
    loop = gevent.spawn(retry_loop)
    pool = Pool(size=num_workers)

    class gmigrator:
        migrate = staticmethod(enqueue_doc)

    with migrator:
        retry_blobs = iterable.get_iterator_detail("retry_blobs") or {}
        # Queue up the docs that were still pending after the last run.
        for key in list(retry_blobs):
            queue.put(reindexer.load(key))
        try:
            yield gmigrator
        finally:
            try:
                print("waiting for workers to stop... (Ctrl+C to abort)")
                queue.put(StopIteration)
                loop.join()
                while not pool.join(timeout=10):
                    print("waiting for {} workers to stop...".format(
                        len(pool)))
            finally:
                # Save pending keys regardless of how the wait ended.
                iterable.set_iterator_detail("retry_blobs", retry_blobs)
                print("done.")
def gmail_initial_sync(crispin_client, log, folder_name, shared_state,
                       local_uids, uid_download_stack, msg_create_fn):
    """Run the initial sync of one Gmail folder.

    Steps, in order:

    1. Fetch the remote UID count and the remote metadata map, and log
       the counts.
    2. While holding ``shared_state['syncmanager_lock']`` and inside a
       DB session, remove local UIDs that are gone remotely and persist
       the remote/download/update/delete counters.
    3. Spawn a poller greenlet and download every UID that is not known
       locally: by thread for the inbox, by UID for the folders returned
       by ``uid_download_folders(crispin_client)``.
    4. Remove the cached metadata file for this folder.

    The poller greenlet is killed on every exit path, including when
    the download raises; previously a failed download left it running.

    :param shared_state: mapping that must contain ``'syncmanager_lock'``
        and ``'poll_frequency'``.
    :param uid_download_stack: stack that receives the UIDs to download
        for UID-download folders.
    :param msg_create_fn: passed through to ``download_queued_uids``.
    :raises MailsyncError: if ``folder_name`` is neither the inbox nor a
        UID-download folder.
    """
    remote_uid_count = len(set(crispin_client.all_uids()))
    remote_g_metadata, update_uid_count = get_g_metadata(
        crispin_client, log, folder_name, local_uids,
        shared_state['syncmanager_lock'])
    remote_uids = sorted(remote_g_metadata.keys(), key=int)
    log.info(remote_uid_count=len(remote_uids))
    if folder_name == crispin_client.folder_names()['all']:
        log.info(local_uid_count=len(local_uids))

    with shared_state['syncmanager_lock']:
        log.debug('gmail_initial_sync grabbed syncmanager_lock')
        with session_scope(ignore_soft_deletes=False) as db_session:
            deleted_uids = remove_deleted_uids(
                crispin_client.account_id, db_session, log, folder_name,
                local_uids, remote_uids)
            delete_uid_count = len(deleted_uids)

            local_uids = set(local_uids) - deleted_uids
            unknown_uids = set(remote_uids) - local_uids

            # Persist the num(messages) to sync (any type of sync:
            # download, update or delete) before we start.
            # Note that num_local_deleted, num_local_updated ARE the
            # numbers to delete/update too since we make those changes
            # right away before we start downloading messages.
            update_uid_counts(db_session, log, crispin_client.account_id,
                              folder_name,
                              remote_uid_count=remote_uid_count,
                              download_uid_count=len(unknown_uids),
                              update_uid_count=update_uid_count,
                              delete_uid_count=delete_uid_count)

    # Stays None until a poller has actually been spawned, so the
    # cleanup below is safe on every path.
    new_uid_poller = None
    try:
        if folder_name == crispin_client.folder_names()['inbox']:
            # We don't do an initial dedupe for Inbox because we do
            # thread expansion, which means even if we have a given msgid
            # downloaded, we might not have the whole thread. This means
            # that restarts cause duplicate work, but hopefully these
            # folders aren't too huge.
            message_download_stack = LifoQueue()
            flags = crispin_client.flags(unknown_uids)
            for uid in unknown_uids:
                if uid in flags:
                    message_download_stack.put(
                        GMessage(uid, remote_g_metadata[uid],
                                 flags[uid].flags, flags[uid].labels))
            new_uid_poller = spawn(check_new_g_thrids,
                                   crispin_client.account_id,
                                   crispin_client.PROVIDER, folder_name, log,
                                   message_download_stack,
                                   shared_state['poll_frequency'],
                                   shared_state['syncmanager_lock'])
            download_queued_threads(crispin_client, log, folder_name,
                                    message_download_stack,
                                    shared_state['syncmanager_lock'])
        elif folder_name in uid_download_folders(crispin_client):
            full_download = deduplicate_message_download(
                crispin_client, log, shared_state['syncmanager_lock'],
                remote_g_metadata, unknown_uids)
            add_uids_to_stack(full_download, uid_download_stack)
            new_uid_poller = spawn(check_new_uids,
                                   crispin_client.account_id, folder_name,
                                   log, uid_download_stack,
                                   shared_state['poll_frequency'],
                                   shared_state['syncmanager_lock'])
            download_queued_uids(crispin_client, log, folder_name,
                                 uid_download_stack, len(local_uids),
                                 len(unknown_uids),
                                 shared_state['syncmanager_lock'],
                                 gmail_download_and_commit_uids,
                                 msg_create_fn)
        else:
            raise MailsyncError(
                'Unknown Gmail sync folder: {}'.format(folder_name))

        # Complete X-GM-MSGID mapping is no longer needed after initial
        # sync.
        rm_cache(remote_g_metadata_cache_file(crispin_client.account_id,
                                              folder_name))
    finally:
        if new_uid_poller is not None:
            new_uid_poller.kill()
def initial_sync_impl(self, crispin_client, local_uids, uid_download_stack):
    """Run the initial sync of ``self.folder_name``.

    Fetches remote metadata, removes local UIDs that are gone remotely
    and persists the sync counters (both under ``self.syncmanager_lock``),
    then downloads every UID that is not known locally: by thread for
    the inbox, by UID for the folders returned by
    ``uid_download_folders(crispin_client)``. Finally removes the cached
    metadata file for this folder.

    The poller greenlet spawned for the download is killed on every exit
    path. If a failure happens before any poller was spawned, the
    original exception propagates unchanged; previously the ``finally``
    clause raised ``UnboundLocalError`` and hid it.

    :param uid_download_stack: stack that receives the UIDs to download
        for UID-download folders.
    :raises MailsyncError: if the folder is neither the inbox nor a
        UID-download folder.
    """
    # Stays None until a poller has actually been spawned, so the
    # ``finally`` below is safe even when an early step fails.
    new_uid_poller = None
    # We wrap the block in a try/finally because the greenlets like
    # new_uid_poller need to be killed when this greenlet is interrupted
    try:
        remote_uid_count = len(set(crispin_client.all_uids()))
        remote_g_metadata, update_uid_count = self.__fetch_g_metadata(
            crispin_client, local_uids)
        remote_uids = sorted(remote_g_metadata.keys(), key=int)
        log.info(remote_uid_count=len(remote_uids))
        if self.folder_name == crispin_client.folder_names()['all']:
            log.info(local_uid_count=len(local_uids))

        with self.syncmanager_lock:
            log.debug('gmail_initial_sync grabbed syncmanager_lock')
            with mailsync_session_scope() as db_session:
                deleted_uids = self.remove_deleted_uids(
                    db_session, local_uids, remote_uids)
                delete_uid_count = len(deleted_uids)

                local_uids = set(local_uids) - deleted_uids
                unknown_uids = set(remote_uids) - local_uids

                # Persist the num(messages) to sync (any type of sync:
                # download, update or delete) before we start. Note that
                # num_local_deleted, num_local_updated ARE the numbers to
                # delete/update too since we make those changes right
                # away before we start downloading messages.
                self.update_uid_counts(
                    db_session, remote_uid_count=remote_uid_count,
                    download_uid_count=len(unknown_uids),
                    update_uid_count=update_uid_count,
                    delete_uid_count=delete_uid_count)

        if self.folder_name == crispin_client.folder_names()['inbox']:
            # We don't do an initial dedupe for Inbox because we do thread
            # expansion, which means even if we have a given msgid
            # downloaded, we might not have the whole thread. This means
            # that restarts cause duplicate work, but hopefully these
            # folders aren't too huge.
            message_download_stack = LifoQueue()
            flags = crispin_client.flags(unknown_uids)
            for uid in unknown_uids:
                if uid in flags:
                    message_download_stack.put(
                        GMessage(uid, remote_g_metadata[uid],
                                 flags[uid].flags, flags[uid].labels))
            new_uid_poller = spawn(self.__check_new_g_thrids,
                                   message_download_stack)
            self.__download_queued_threads(crispin_client,
                                           message_download_stack)
        elif self.folder_name in uid_download_folders(crispin_client):
            full_download = self.__deduplicate_message_download(
                crispin_client, remote_g_metadata, unknown_uids)
            add_uids_to_stack(full_download, uid_download_stack)
            new_uid_poller = spawn(self.check_new_uids, uid_download_stack)
            self.download_uids(crispin_client, uid_download_stack)
        else:
            raise MailsyncError(
                'Unknown Gmail sync folder: {}'.format(self.folder_name))

        # Complete X-GM-MSGID mapping is no longer needed after initial
        # sync.
        rm_cache(remote_g_metadata_cache_file(self.account_id,
                                              self.folder_name))
    finally:
        if new_uid_poller is not None:
            new_uid_poller.kill()
class AbstractDatabaseConnectionPool(object):
    """LIFO pool of database connections with expiry and idle cleanup.

    Subclasses must implement :meth:`create_connection`.
    """

    def __init__(self, maxsize=100, maxwait=1.0, expires=None, cleanup=None):
        """
        The pool manages opened connections to the database. The main
        strategy is to keep the smallest number of alive connections
        required for good service performance: connections are normally
        taken from the pool, new ones are created only when the pool is
        empty, and connections that are too old or idle for too long are
        closed when the pool is cleaned up.

        Parameters
        ----------
        maxsize : int
            Limit of the number of created connections. Once reached,
            taking a connection waits up to `maxwait` seconds for one to
            be returned; if none is and the limit is still reached,
            `OperationalError` is raised.
        maxwait : float
            The time in seconds to wait for a returned connection once
            `maxsize` has been reached.
        expires : float
            Age in seconds (since creation) after which a pooled
            connection is closed by the cleanup pass. Falsy disables it.
        cleanup : float
            Idle time in seconds (since last use) after which a pooled
            connection is closed by the cleanup pass. Falsy disables it.

        Raises
        ------
        TypeError
            If `maxsize` is not an integer.
        """
        if not isinstance(maxsize, integer_types):
            raise TypeError('Expected integer, got %r' % (maxsize, ))
        self._maxsize = maxsize
        self._maxwait = maxwait
        self._expires = expires
        self._cleanup = cleanup
        # Bookkeeping is keyed by id(connection).
        self._created_at = {}
        self._latest_use = {}
        self._pool = LifoQueue()
        self._size = 0
        # With neither `expires` nor `cleanup` set, the next-cleanup time
        # is pushed far into the future so the cleanup pass never runs.
        self._latest_cleanup = 0 if self._expires or self._cleanup else 0xffffffffffffffff
        # Run the cleanup pass at most once per the smaller of the two
        # configured periods.
        self._interval_cleanup = min(
            self._expires or self._cleanup,
            self._cleanup or self._expires) if self._expires or self._cleanup else 0
        self._cleanup_lock = Semaphore(value=1)

    def create_connection(self):
        """Return a new connection; must be implemented by subclasses."""
        raise NotImplementedError()

    def close_connection(self, item):
        """Release the slot held by `item` and close it (best effort)."""
        try:
            self._size -= 1
            self._created_at.pop(id(item), None)
            self._latest_use.pop(id(item), None)
            item.close()
        except Exception:
            # Deliberately best-effort: a connection that fails to close
            # is dropped anyway.
            pass

    def cleanup(self):
        """Run the cleanup pass now, if one is due."""
        self._cleanup_queue(time.time())

    def _cleanup_queue(self, now):
        """Close pooled connections that are expired or idle too long.

        Does nothing when the next scheduled cleanup time has not been
        reached yet.
        """
        if self._latest_cleanup > now:
            return
        with self._cleanup_lock:
            # Re-check: another caller may have cleaned up while we were
            # waiting for the lock.
            if self._latest_cleanup > now:
                return
            self._latest_cleanup = now + self._interval_cleanup
            cleanup = now - self._cleanup if self._cleanup else None
            expires = now - self._expires if self._expires else None
            # Instead of creating a new LIFO for self._pool, the old one
            # is reused, because others might be waiting on it for a
            # connection.
            fresh_slots = []
            try:
                # After this loop the surviving connections are held in
                # reversed order; they are put back below.
                while not self._pool.empty():
                    item = self._pool.get_nowait()
                    if cleanup and self._latest_use.get(id(item), 0) < cleanup:
                        self.close_connection(item)
                    elif expires and self._created_at.get(id(item), 0) < expires:
                        self.close_connection(item)
                    else:
                        fresh_slots.append(item)
            except Empty:
                pass
            # Restore the original order (most recently returned
            # connections are taken first).
            for conn in reversed(fresh_slots):
                self._pool.put_nowait(conn)

    def get(self):
        """Take a connection from the pool, creating one if allowed.

        Raises `OperationalError` when `maxsize` connections exist and
        none is returned within `maxwait` seconds.
        """
        try:
            return self._pool.get_nowait()
        except Empty:
            pass
        if self._size >= self._maxsize:
            try:
                return self._pool.get(timeout=self._maxwait)
            except Empty:
                pass
        # It is possible that after waiting `maxwait` no connection has
        # been returned because old ones were closed by the cleanup in
        # put(), so the pool is empty but not at its limit any more. In
        # that case a new connection is created; otherwise an exception
        # is raised.
        if self._size >= self._maxsize:
            raise OperationalError(
                "Too many connections created: {} (maxsize is {})".format(
                    self._size, self._maxsize))
        try:
            self._size += 1
            conn = self.create_connection()
        except:  # noqa: E722 - the slot must be released on any failure
            self._size -= 1
            raise
        now = time.time()
        self._created_at[id(conn)] = now
        self._latest_use[id(conn)] = now
        return conn

    def put(self, conn):
        """Return `conn` to the pool and run the cleanup pass if due."""
        now = time.time()
        self._pool.put(conn)
        self._latest_use[id(conn)] = now
        self._cleanup_queue(now)

    def closeall(self):
        """Close every pooled connection and reset the size counter."""
        while not self._pool.empty():
            conn = self._pool.get_nowait()
            # Drop the bookkeeping too, otherwise the id-keyed entries of
            # closed connections would be kept forever.
            self._created_at.pop(id(conn), None)
            self._latest_use.pop(id(conn), None)
            try:
                conn.close()
            except Exception:
                pass
        self._size = 0

    @contextlib.contextmanager
    def connection(self, isolation_level=None):
        """Context manager yielding a pooled connection.

        Commits when the block succeeds and rolls back when it raises.
        If `isolation_level` is given and differs from the connection's
        current one, it is applied for the duration of the block and the
        previous level is restored before the connection goes back to
        the pool.

        Raises `OperationalError` if the connection is found closed when
        it is time to commit.
        """
        conn = self.get()
        restore_level = False
        previous_level = None
        try:
            if isolation_level is not None and conn.isolation_level != isolation_level:
                previous_level = conn.isolation_level
                conn.set_isolation_level(isolation_level)
                restore_level = True
            yield conn
        except:  # noqa: E722 - undo on any exit, then re-raise
            if conn.closed:
                conn = None
                self.closeall()
            else:
                conn = self._rollback(conn)
            raise
        else:
            if conn.closed:
                raise OperationalError(
                    "Cannot commit because connection was closed: %r" % (conn, ))
            conn.commit()
        finally:
            if conn is not None:
                if conn.closed:
                    # Unusable connection: release its slot instead of
                    # silently losing it.
                    self.close_connection(conn)
                else:
                    if restore_level:
                        conn.set_isolation_level(previous_level)
                    self.put(conn)

    @contextlib.contextmanager
    def cursor(self, *args, **kwargs):
        """Context manager yielding a cursor on a pooled connection.

        `isolation_level` is taken from `kwargs` and passed to
        :meth:`connection`; everything else goes to `conn.cursor`.
        """
        isolation_level = kwargs.pop('isolation_level', None)
        with self.connection(isolation_level) as conn:
            yield conn.cursor(*args, **kwargs)

    def _rollback(self, conn):
        """Roll `conn` back; return it, or None if the rollback failed.

        On failure the error is reported to the gevent hub and the
        connection is closed so that its pool slot is released.
        """
        try:
            conn.rollback()
        except:  # noqa: E722 - any failure makes the connection unusable
            gevent.get_hub().handle_error(conn, *sys.exc_info())
            self.close_connection(conn)
            return
        return conn

    def execute(self, *args, **kwargs):
        """Execute a statement and return the cursor's `rowcount`."""
        with self.cursor(**kwargs) as cursor:
            cursor.execute(*args)
            return cursor.rowcount

    def fetchone(self, *args, **kwargs):
        """Execute a statement and return `cursor.fetchone()`."""
        with self.cursor(**kwargs) as cursor:
            cursor.execute(*args)
            return cursor.fetchone()

    def fetchall(self, *args, **kwargs):
        """Execute a statement and return `cursor.fetchall()`."""
        with self.cursor(**kwargs) as cursor:
            cursor.execute(*args)
            return cursor.fetchall()

    def fetchiter(self, *args, **kwargs):
        """Execute a statement and yield rows, fetched in batches."""
        with self.cursor(**kwargs) as cursor:
            cursor.execute(*args)
            while True:
                items = cursor.fetchmany()
                if not items:
                    break
                for item in items:
                    yield item
def init_tasks_queue(self, sub_domains):
    """Return a new ``LifoQueue`` holding every item of ``sub_domains``.

    Items are pushed in iteration order, so the last item of
    ``sub_domains`` is the first one taken from the queue.
    """
    stack = LifoQueue()
    push = stack.put
    for entry in sub_domains:
        push(entry)
    return stack
def gmail_initial_sync(crispin_client, db_session, log, folder_name,
                       shared_state, local_uids, uid_download_stack):
    """Run the initial sync of one Gmail folder.

    Fetches the remote metadata map, removes local UIDs that are gone
    remotely (under ``shared_state['syncmanager_lock']``), then spawns a
    poller greenlet and downloads every UID that is not known locally:
    by thread for the inbox, by UID for the trash/spam/all folders that
    exist on the account. Finally removes the cached metadata file for
    this folder.

    The poller greenlet is killed on every exit path, including when
    the download raises; previously a failed download left it running.

    :param shared_state: mapping that must contain ``'syncmanager_lock'``,
        ``'poll_frequency'`` and ``'status_cb'``.
    :param uid_download_stack: stack that receives the UIDs to download
        for UID-download folders.
    :raises MailsyncError: if ``folder_name`` is neither the inbox nor a
        UID-download folder.
    """
    remote_g_metadata = get_g_metadata(crispin_client, db_session, log,
                                       folder_name, local_uids,
                                       shared_state['syncmanager_lock'])
    remote_uids = sorted(remote_g_metadata.keys(), key=int)
    log.info("Found {0} UIDs for folder {1}".format(len(remote_uids),
                                                    folder_name))
    if folder_name == crispin_client.folder_names()['all']:
        log.info("Already have {0} UIDs".format(len(local_uids)))

    with shared_state['syncmanager_lock']:
        log.debug("gmail_initial_sync grabbed syncmanager_lock")
        deleted_uids = remove_deleted_uids(
            crispin_client.account_id, db_session, log, folder_name,
            local_uids, remote_uids)
    local_uids = set(local_uids) - deleted_uids
    unknown_uids = set(remote_uids) - local_uids

    # folders that don't get thread expanded
    uid_download_folders = [crispin_client.folder_names()[tag] for tag in
                            ('trash', 'spam', 'all') if tag in
                            crispin_client.folder_names()]

    # Stays None until a poller has actually been spawned, so the
    # cleanup below is safe on every path.
    new_uid_poller = None
    try:
        if folder_name == crispin_client.folder_names()['inbox']:
            # We don't do an initial dedupe for Inbox because we do
            # thread expansion, which means even if we have a given msgid
            # downloaded, we might not have the whole thread. This means
            # that restarts cause duplicate work, but hopefully these
            # folders aren't too huge.
            message_download_stack = LifoQueue()
            flags = crispin_client.flags(unknown_uids)
            for uid in unknown_uids:
                if uid in flags:
                    message_download_stack.put(
                        GMessage(uid, remote_g_metadata[uid],
                                 flags[uid].flags, flags[uid].labels))
            new_uid_poller = spawn(check_new_g_thrids,
                                   crispin_client.account_id,
                                   crispin_client.PROVIDER, folder_name, log,
                                   message_download_stack,
                                   shared_state['poll_frequency'],
                                   shared_state['syncmanager_lock'])
            download_queued_threads(crispin_client, db_session, log,
                                    folder_name, message_download_stack,
                                    shared_state['status_cb'],
                                    shared_state['syncmanager_lock'])
        elif folder_name in uid_download_folders:
            full_download = deduplicate_message_download(
                crispin_client, db_session, log,
                shared_state['syncmanager_lock'], remote_g_metadata,
                unknown_uids)
            add_uids_to_stack(full_download, uid_download_stack)
            new_uid_poller = spawn(check_new_uids,
                                   crispin_client.account_id,
                                   crispin_client.PROVIDER, folder_name, log,
                                   uid_download_stack,
                                   shared_state['poll_frequency'],
                                   shared_state['syncmanager_lock'])
            download_queued_uids(crispin_client, db_session, log,
                                 folder_name, uid_download_stack,
                                 len(local_uids), len(remote_uids),
                                 shared_state['status_cb'],
                                 shared_state['syncmanager_lock'],
                                 gmail_download_and_commit_uids,
                                 create_gmail_message)
        else:
            raise MailsyncError(
                "Unknown Gmail sync folder: {}".format(folder_name))

        # Complete X-GM-MSGID mapping is no longer needed after initial
        # sync.
        rm_cache(remote_g_metadata_cache_file(crispin_client.account_id,
                                              folder_name))
    finally:
        if new_uid_poller is not None:
            new_uid_poller.kill()
def initial_sync_impl(self, crispin_client, local_uids, uid_download_stack):
    """Run the initial sync of ``self.folder_name``.

    Fetches remote metadata, removes local UIDs that are gone remotely
    and persists the sync counters (both under ``self.syncmanager_lock``),
    then downloads every UID that is not known locally: by thread for
    the inbox, by UID for the folders returned by
    ``uid_download_folders(crispin_client)``. Finally removes the cached
    metadata file for this folder.

    The poller greenlet spawned for the download is killed on every exit
    path. If a failure happens before any poller was spawned, the
    original exception propagates unchanged; previously the ``finally``
    clause raised ``UnboundLocalError`` and hid it.

    :param uid_download_stack: stack that receives the UIDs to download
        for UID-download folders.
    :raises MailsyncError: if the folder is neither the inbox nor a
        UID-download folder.
    """
    # Stays None until a poller has actually been spawned, so the
    # ``finally`` below is safe even when an early step fails.
    new_uid_poller = None
    # We wrap the block in a try/finally because the greenlets like
    # new_uid_poller need to be killed when this greenlet is interrupted
    try:
        remote_uid_count = len(set(crispin_client.all_uids()))
        remote_g_metadata, update_uid_count = self.__fetch_g_metadata(
            crispin_client, local_uids)
        remote_uids = sorted(remote_g_metadata.keys(), key=int)
        log.info(remote_uid_count=len(remote_uids))
        if self.folder_name == crispin_client.folder_names()['all']:
            log.info(local_uid_count=len(local_uids))

        with self.syncmanager_lock:
            log.debug('gmail_initial_sync grabbed syncmanager_lock')
            with mailsync_session_scope() as db_session:
                deleted_uids = self.remove_deleted_uids(
                    db_session, local_uids, remote_uids)
                delete_uid_count = len(deleted_uids)

                local_uids = set(local_uids) - deleted_uids
                unknown_uids = set(remote_uids) - local_uids

                # Persist the num(messages) to sync (any type of sync:
                # download, update or delete) before we start. Note that
                # num_local_deleted, num_local_updated ARE the numbers to
                # delete/update too since we make those changes right
                # away before we start downloading messages.
                self.update_uid_counts(
                    db_session, remote_uid_count=remote_uid_count,
                    download_uid_count=len(unknown_uids),
                    update_uid_count=update_uid_count,
                    delete_uid_count=delete_uid_count)

        if self.folder_name == crispin_client.folder_names()['inbox']:
            # We don't do an initial dedupe for Inbox because we do thread
            # expansion, which means even if we have a given msgid
            # downloaded, we might not have the whole thread. This means
            # that restarts cause duplicate work, but hopefully these
            # folders aren't too huge.
            message_download_stack = LifoQueue()
            flags = crispin_client.flags(unknown_uids)
            for uid in unknown_uids:
                if uid in flags:
                    message_download_stack.put(
                        GMessage(uid, remote_g_metadata[uid],
                                 flags[uid].flags, flags[uid].labels))
            new_uid_poller = spawn(self.__check_new_g_thrids,
                                   message_download_stack)
            self.__download_queued_threads(crispin_client,
                                           message_download_stack)
        elif self.folder_name in uid_download_folders(crispin_client):
            full_download = self.__deduplicate_message_download(
                crispin_client, remote_g_metadata, unknown_uids)
            add_uids_to_stack(full_download, uid_download_stack)
            new_uid_poller = spawn(self.check_new_uids, uid_download_stack)
            self.download_uids(crispin_client, uid_download_stack)
        else:
            raise MailsyncError('Unknown Gmail sync folder: {}'.format(
                self.folder_name))

        # Complete X-GM-MSGID mapping is no longer needed after initial
        # sync.
        rm_cache(
            remote_g_metadata_cache_file(self.account_id, self.folder_name))
    finally:
        if new_uid_poller is not None:
            new_uid_poller.kill()
def uid_list_to_stack(uids):
    """Return a new ``LifoQueue`` loaded with ``uids``.

    The UID download function needs a stack even for polling. UIDs are
    pushed in ascending ``int`` order, so the numerically largest UID is
    the first one taken from the stack.
    """
    ordered = list(uids)
    ordered.sort(key=int)
    stack = LifoQueue()
    for uid in ordered:
        stack.put(uid)
    return stack
def gmail_initial_sync(crispin_client, db_session, log, folder_name,
                       shared_state, local_uids, uid_download_stack,
                       msg_create_fn):
    """Run the initial sync of one Gmail folder.

    Steps, in order:

    1. Fetch the remote UID count and the remote metadata map (which
       also yields the sync type and update count), and log the counts.
    2. While holding ``shared_state['syncmanager_lock']``, remove local
       UIDs that are gone remotely and persist the sync counters.
    3. Spawn a poller greenlet and download every UID that is not known
       locally: by thread for the inbox, by UID for the folders returned
       by ``uid_download_folders(crispin_client)``.
    4. Remove the cached metadata file for this folder.

    The poller greenlet is killed on every exit path, including when
    the download raises; previously a failed download left it running.

    :param shared_state: mapping that must contain ``'syncmanager_lock'``
        and ``'poll_frequency'``.
    :param uid_download_stack: stack that receives the UIDs to download
        for UID-download folders.
    :param msg_create_fn: passed through to ``download_queued_uids``.
    :raises MailsyncError: if ``folder_name`` is neither the inbox nor a
        UID-download folder.
    """
    remote_uid_count = len(set(crispin_client.all_uids()))
    remote_g_metadata, sync_info = get_g_metadata(
        crispin_client, db_session, log, folder_name, local_uids,
        shared_state['syncmanager_lock'])
    sync_type, update_uid_count = sync_info
    remote_uids = sorted(remote_g_metadata.keys(), key=int)
    log.info(remote_uid_count=len(remote_uids))
    if folder_name == crispin_client.folder_names()['all']:
        log.info(local_uid_count=len(local_uids))

    with shared_state['syncmanager_lock']:
        log.debug('gmail_initial_sync grabbed syncmanager_lock')
        deleted_uids = remove_deleted_uids(
            crispin_client.account_id, db_session, log, folder_name,
            local_uids, remote_uids)
        delete_uid_count = len(deleted_uids)

        local_uids = set(local_uids) - deleted_uids
        unknown_uids = set(remote_uids) - local_uids

        # Persist the num(messages) to sync (any type of sync: download,
        # update or delete) before we start.
        # Note that num_local_deleted, num_local_updated ARE the numbers
        # to delete/update too since we make those changes right away
        # before we start downloading messages.
        update_uid_counts(db_session, log, crispin_client.account_id,
                          folder_name, remote_uid_count=remote_uid_count,
                          download_uid_count=len(unknown_uids),
                          update_uid_count=update_uid_count,
                          delete_uid_count=delete_uid_count,
                          sync_type=sync_type)

    # Stays None until a poller has actually been spawned, so the
    # cleanup below is safe on every path.
    new_uid_poller = None
    try:
        if folder_name == crispin_client.folder_names()['inbox']:
            # We don't do an initial dedupe for Inbox because we do
            # thread expansion, which means even if we have a given msgid
            # downloaded, we might not have the whole thread. This means
            # that restarts cause duplicate work, but hopefully these
            # folders aren't too huge.
            message_download_stack = LifoQueue()
            flags = crispin_client.flags(unknown_uids)
            for uid in unknown_uids:
                if uid in flags:
                    message_download_stack.put(
                        GMessage(uid, remote_g_metadata[uid],
                                 flags[uid].flags, flags[uid].labels))
            new_uid_poller = spawn(check_new_g_thrids,
                                   crispin_client.account_id,
                                   crispin_client.PROVIDER, folder_name, log,
                                   message_download_stack,
                                   shared_state['poll_frequency'],
                                   shared_state['syncmanager_lock'])
            download_queued_threads(crispin_client, db_session, log,
                                    folder_name, message_download_stack,
                                    shared_state['syncmanager_lock'])
        elif folder_name in uid_download_folders(crispin_client):
            full_download = deduplicate_message_download(
                crispin_client, db_session, log,
                shared_state['syncmanager_lock'], remote_g_metadata,
                unknown_uids)
            add_uids_to_stack(full_download, uid_download_stack)
            new_uid_poller = spawn(check_new_uids,
                                   crispin_client.account_id,
                                   crispin_client.PROVIDER, folder_name, log,
                                   uid_download_stack,
                                   shared_state['poll_frequency'],
                                   shared_state['syncmanager_lock'])
            download_queued_uids(crispin_client, db_session, log,
                                 folder_name, uid_download_stack,
                                 len(local_uids), len(unknown_uids),
                                 shared_state['syncmanager_lock'],
                                 gmail_download_and_commit_uids,
                                 msg_create_fn)
        else:
            raise MailsyncError(
                'Unknown Gmail sync folder: {}'.format(folder_name))

        # Complete X-GM-MSGID mapping is no longer needed after initial
        # sync.
        rm_cache(remote_g_metadata_cache_file(crispin_client.account_id,
                                              folder_name))
    finally:
        if new_uid_poller is not None:
            new_uid_poller.kill()
class BaseProcessor(LoggerMixin):
    """Base class for processors that feed a task queue to workers.

    Subclasses implement :meth:`populate_tasks`, which is expected to
    call :meth:`add_task`. :meth:`run` starts the workers, populates the
    queue and waits until the queue is empty and every worker is idle.
    """

    name = 'base-processor'

    @classmethod
    def from_engine(cls, engine, *args, **kwargs):
        """Alternate constructor: build a processor for ``engine``."""
        return cls(engine, *args, **kwargs)

    def _request(self):
        return self.engine.request

    # Read-only shortcut to the engine's ``request`` attribute.
    request = property(_request)

    def __init__(self, engine, *args, **kwargs):
        from time import time
        from hashlib import md5
        from threading import Lock
        from gevent.queue import LifoQueue

        # Class name plus a short hash of the current time.
        suffix = md5(str(time())).hexdigest()[:6]
        self.processor_name = '%s:%s' % (self.name, suffix)
        LoggerMixin.__init__(self)

        self.engine = engine
        self.__redis = None
        self.redis_lock = Lock()

        self.progress = 0
        self.total = 0
        # Number of items skipped (kept out of the statistics).
        self.bypassed_cnt = 0

        # add_task stops adding tasks while the queue holds more than
        # this many items.
        self.maxsize = 1000
        self.tasks = LifoQueue()
        self.workers = []
        # Polling interval in seconds (default: 1).
        self.polling_interval = 1

        import argparse
        parser = argparse.ArgumentParser()
        # Number of concurrent workers.
        parser.add_argument('--concur', type=int)
        parsed, _unknown = parser.parse_known_args()

        from core import dhaulagiri_settings
        if parsed.concur:
            dhaulagiri_settings['core']['concur'] = parsed.concur
        self.concur = dhaulagiri_settings['core']['concur']

        self.checkpoint_ts = None
        self.checkpoint_prog = None
        self.init_ts = time()

        # Heartbeat greenlet, set by _start_workers.
        self.heart_beat = None
        # Worker monitor: maps worker name to the time of its latest
        # status update (see update_worker_status).
        self.worker_monitor = {}

    def update_worker_status(self, worker):
        """Record that ``worker`` reported in just now.

        :param worker: object exposing ``worker_name``.
        :return: None
        """
        from time import time
        self.worker_monitor[worker.worker_name] = time()

    def get_worker_stat(self):
        """Split the monitored workers into active and zombie ones.

        A worker whose latest status update is more than 90 seconds old
        is considered a zombie.

        :return: ``{'zombie': {...}, 'active': {...}}``, each mapping
            worker name to its latest update time.
        """
        from time import time
        time_window = 90
        threshold = time() - time_window
        active = {}
        zombie = {}
        for worker_name, last_seen in self.worker_monitor.items():
            if last_seen >= threshold:
                active[worker_name] = last_seen
            else:
                zombie[worker_name] = last_seen
        return {'zombie': zombie, 'active': active}

    def incr_progress(self):
        """Count one more processed item."""
        self.progress += 1

    def _start_workers(self):
        """Start the heartbeat greenlet and create the workers."""

        def timer():
            """Log the current progress every 30 seconds."""
            import time
            while True:
                cts = time.time()
                msg = 'Progress: %d / %d.' % (self.progress, self.total)
                have_checkpoint = (self.checkpoint_prog is not None
                                   and self.checkpoint_ts is not None)
                if have_checkpoint:
                    done = self.progress - self.checkpoint_prog
                    rate = done / (cts - self.checkpoint_ts) * 60
                    rate_text = 'Processing rate: %d items/min' % int(rate)
                    msg = '%s %s' % (msg, rate_text)
                self.checkpoint_ts = cts
                self.checkpoint_prog = self.progress

                # Worker monitor statistics.
                stat = self.get_worker_stat()
                n_active = len(stat['active'])
                n_zombie = len(stat['zombie'])
                msg += ', active workers: %d, zombie workers: %d' % (
                    n_active, n_zombie)
                self.log(msg)
                gevent.sleep(30)

        self.heart_beat = gevent.spawn(timer)
        # NOTE(review): gevent.kill is registered as the handler with no
        # arguments, and one of the signals is SIGKILL -- confirm that
        # this does what was intended.
        for signum in (signal.SIGKILL, signal.SIGQUIT):
            gevent.signal(signum, gevent.kill)

        for _ in xrange(self.concur):
            self.workers.append(Worker.from_processor(self, self.tasks))

    def add_task(self, task, *args, **kwargs):
        """Queue ``task(*args, **kwargs)`` for the workers.

        Flow control: while the queue holds more than ``self.maxsize``
        items, sleep ``self.polling_interval`` seconds before retrying.
        A truthy ``task.task_key`` is copied onto the queued callable.
        """
        while self.tasks.qsize() > self.maxsize:
            gevent.sleep(self.polling_interval)

        func = lambda: task(*args, **kwargs)
        task_key = getattr(task, 'task_key', None)
        if task_key:
            func.task_key = task_key
        self.tasks.put(func, timeout=120)

        label = '(%s)' % task_key if task_key else ''
        self.logger.debug(
            'New task%s added to the queue. Remaining: %d' %
            (label, self.tasks.qsize()))
        # Give other greenlets a chance to run.
        gevent.sleep(0)

    def _wait_for_workers(self):
        """Block until all work is done, then kill the greenlets.

        Done means: the task queue is empty and every worker is idle.

        :return: None
        """
        while True:
            if self.tasks.empty() and all(w.idle for w in self.workers):
                break
            gevent.sleep(self.polling_interval)

        gevent.killall([w.gevent for w in self.workers])
        gevent.kill(self.heart_beat)

    def run(self):
        """Start workers, populate the queue, wait, and log a summary."""
        self._start_workers()
        self.populate_tasks()
        self._wait_for_workers()

        import time
        elapsed_minutes = int((time.time() - self.init_ts) / 60.0)
        self.log(
            'Processor ended: %d items processed(%d bypassed) in %d minutes'
            % (self.progress, self.bypassed_cnt, elapsed_minutes))

    def populate_tasks(self):
        """Fill the task queue; must be implemented by subclasses."""
        raise NotImplementedError
class ClientPool(object):
    """Fixed-size LIFO pool of lazily created clients.

    The pool is pre-filled with ``pool_size`` empty holders; a client is
    created the first time an empty holder is handed out. A scheduled
    task periodically closes pooled clients that have not been accessed
    for ``DEFAULT_CLIENT_EXPIRE_TIME`` seconds.
    """

    # Seconds since last access after which a pooled client is closed.
    DEFAULT_CLIENT_EXPIRE_TIME = 300
    # Seconds between two runs of the expiry task.
    DEFAULT_CLOSE_EXPIRE_CLIENT_INTERVAL = 60

    def __init__(self, pool_name, pool_size, client_class,
                 close_client_handler, *client_args, **client_kwargs):
        """
        :param pool_name: name used in log messages and the task name.
        :param pool_size: number of holders in the pool (must be > 0).
        :param client_class: callable creating a client; invoked with
            ``client_args`` / ``client_kwargs``.
        :param close_client_handler: callable closing a client, or None.
        """
        assert pool_size > 0
        assert client_class is not None and hasattr(client_class, '__call__')
        assert close_client_handler is None or hasattr(close_client_handler,
                                                       '__call__')
        self._pool_name = pool_name
        self._pool_size = pool_size
        self._client_class = client_class
        self._close_client_handler = close_client_handler
        self._client_args = client_args
        self._client_kwargs = client_kwargs
        self._queue = LifoQueue(maxsize=pool_size)
        for i in range(pool_size):
            self._queue.put(ClientHolder())
        self._client_expire_time = self.DEFAULT_CLIENT_EXPIRE_TIME
        self._gc_task = ScheduleTask(
            name='ClientPool-GC-%s' % pool_name,
            start_after_seconds=0,
            interval_seconds=self.DEFAULT_CLOSE_EXPIRE_CLIENT_INTERVAL,
            handler=self._close_expire_client)
        self._gc_task.run()

    def __del__(self):
        # __init__ may have failed before the task was created.
        gc_task = getattr(self, '_gc_task', None)
        if gc_task is not None:
            gc_task.stop()

    @contextmanager
    def get_client(self, block=True, pool_acquire_client_timeout=1,
                   req_timeout=5):
        """Context manager yielding a client taken from the pool.

        A ``gevent.Timeout`` of ``req_timeout`` seconds is active while
        the client is out of the pool. On any exception the client is
        closed before the exception is re-raised; the holder is pushed
        back to the pool in every case.
        """
        client_holder = self._get_client(block, pool_acquire_client_timeout)
        tm = None
        try:
            tm = gevent.Timeout.start_new(req_timeout)
            yield client_holder.get_client()
        except BaseException as e:
            logger.error(
                'Client is out pool for too long %s seconds, raise exception: %s',
                req_timeout, e)
            self._close_client(client_holder)
            raise
        finally:
            if tm:
                tm.cancel()
            self.push(client_holder)

    def _get_client(self, block=True, timeout=1):
        """Take a holder from the pool, creating its client if needed."""
        if self.is_empty():
            logger.info('ClientPool: %s is empty.', self._pool_name)
        client_holder = self._queue.get(block=block, timeout=timeout)
        if client_holder.get_client() is None:
            tm = None
            try:
                tm = gevent.Timeout.start_new(timeout)
                client_holder.set_client(self._create_client())
            except BaseException:
                # Creation failed: give the (empty) holder back so the
                # pool does not shrink.
                client_holder.set_client(None)
                self.push(client_holder)
                raise
            finally:
                if tm:
                    tm.cancel()
        client_holder.set_access_time(time.time())
        return client_holder

    def push(self, client_holder):
        """Put ``client_holder`` back unless the pool is already full."""
        if not self.is_full():
            self._queue.put_nowait(client_holder)

    def is_full(self):
        return self._queue.qsize() >= self._pool_size

    def is_empty(self):
        return self._queue.qsize() <= 0

    def _create_client(self):
        return self._client_class(*self._client_args, **self._client_kwargs)

    def _close_client(self, client_holder):
        """Close the holder's client (if any) and empty the holder.

        Errors raised by the close handler are logged, not propagated.
        """
        if self._close_client_handler and client_holder.get_client():
            try:
                self._close_client_handler(client_holder.get_client())
            except Exception as e:
                logger.error('Close client raise exception: %s', e)
        client_holder.set_client(None)

    def _close_expire_client(self):
        """Close pooled clients not accessed within the expire time.

        Each expired holder is emptied, so the next user of that holder
        gets a freshly created client instead of a closed one.
        """
        cur_time = time.time()
        # Collect first, close afterwards: the queue's underlying list
        # is not modified while it is being iterated.
        expired_holders = [
            holder for holder in self._queue.queue
            if holder.get_client()
            and cur_time - holder.get_access_time() > self._client_expire_time
        ]
        for holder in expired_holders:
            self._close_client(holder)
def gmail_initial_sync(crispin_client, db_session, log, folder_name,
                       shared_state, local_uids, uid_download_stack, c):
    """Run the initial sync of one Gmail folder.

    Fetches the remote metadata map, removes local UIDs that are gone
    remotely, then spawns a poller greenlet and downloads every UID that
    is not known locally: by thread for every folder except All Mail,
    by UID for All Mail. Finally removes the cached metadata file for
    this folder.

    The poller greenlet is killed on every exit path, including when
    the download raises; previously a failed download left it running.

    :param shared_state: mapping that must contain ``'syncmanager_lock'``,
        ``'poll_frequency'`` and ``'status_cb'``.
    :param uid_download_stack: stack that receives the UIDs to download
        for All Mail.
    :param c: passed through to the ``crispin_client`` calls and the
        download helpers.
    """
    remote_g_metadata = get_g_metadata(crispin_client, db_session, log,
                                       folder_name, local_uids,
                                       shared_state['syncmanager_lock'], c)
    remote_uids = sorted(remote_g_metadata.keys(), key=int)
    log.info("Found {0} UIDs for folder {1}".format(len(remote_uids),
                                                    folder_name))
    if folder_name == crispin_client.folder_names(c)['all']:
        log.info("Already have {0} UIDs".format(len(local_uids)))

    deleted_uids = remove_deleted_uids(
        crispin_client.account_id, db_session, log, folder_name,
        local_uids, remote_uids, shared_state['syncmanager_lock'], c)
    local_uids = set(local_uids) - deleted_uids
    unknown_uids = set(remote_uids) - local_uids

    # Stays None until a poller has actually been spawned, so the
    # cleanup below is safe on every path.
    new_uid_poller = None
    try:
        if folder_name != crispin_client.folder_names(c)['all']:
            # We don't do an initial dedupe for non-All Mail folders
            # because we do thread expansion, which means even if we have
            # a given msgid downloaded, we might not have the whole
            # thread. This means that restarts cause duplicate work, but
            # hopefully these folders aren't too huge.
            message_download_stack = LifoQueue()
            flags = crispin_client.flags(unknown_uids, c)
            for uid in unknown_uids:
                if uid in flags:
                    message_download_stack.put(
                        GMessage(uid, remote_g_metadata[uid],
                                 flags[uid].flags, flags[uid].labels))
            new_uid_poller = spawn(check_new_g_thrids,
                                   crispin_client.account_id,
                                   crispin_client.PROVIDER, folder_name, log,
                                   message_download_stack,
                                   shared_state['poll_frequency'],
                                   shared_state['syncmanager_lock'])
            download_queued_threads(crispin_client, db_session, log,
                                    folder_name, message_download_stack,
                                    shared_state['status_cb'],
                                    shared_state['syncmanager_lock'], c)
        else:
            full_download = deduplicate_message_download(
                crispin_client, db_session, log, remote_g_metadata,
                unknown_uids, c)
            add_uids_to_stack(full_download, uid_download_stack)
            new_uid_poller = spawn(check_new_uids,
                                   crispin_client.account_id,
                                   crispin_client.PROVIDER, folder_name, log,
                                   uid_download_stack,
                                   shared_state['poll_frequency'],
                                   shared_state['syncmanager_lock'])
            download_queued_uids(crispin_client, db_session, log,
                                 folder_name, uid_download_stack,
                                 len(local_uids), len(remote_uids),
                                 shared_state['status_cb'],
                                 shared_state['syncmanager_lock'],
                                 gmail_download_and_commit_uids,
                                 account.create_gmail_message, c)

        # Complete X-GM-MSGID mapping is no longer needed after initial
        # sync.
        rm_cache(remote_g_metadata_cache_file(crispin_client.account_id,
                                              folder_name))
    finally:
        if new_uid_poller is not None:
            new_uid_poller.kill()