def chunked_uid_download(crispin_client, db_session, log,
                         folder_name, uids, num_local_messages,
                         num_total_messages, status_cb, syncmanager_lock,
                         download_commit_fn, msg_create_fn, c):
    """
    Download and commit the messages for `uids` chunk by chunk.

    UIDs are processed in reverse order, in chunks of
    `crispin_client.CHUNK_SIZE`. After each chunk the running count
    `num_local_messages` is increased by the value returned from
    `download_commit_fn`, and progress is reported both through
    `status_cb(account_id, 'initial', (folder_name, percent_done))` and the
    log.

    `uids` must support `len()` and `reversed()`. Nothing is returned.
    """
    log.info("{0} uids left to fetch".format(len(uids)))
    if not uids:
        return

    chunk_size = crispin_client.CHUNK_SIZE
    log.info("Starting sync for {0} with chunks of size {1}"
             .format(folder_name, chunk_size))
    # we prioritize message download by reverse-UID order, which
    # generally puts more recent messages first
    for uid_chunk in chunk(reversed(uids), chunk_size):
        num_local_messages += download_commit_fn(
            crispin_client, db_session, log, folder_name, uid_chunk,
            msg_create_fn, syncmanager_lock, c)

        # Force float division: with two ints, `/` truncates on Python 2
        # and the reported progress would stay at 0 until the very end.
        percent_done = (float(num_local_messages) / num_total_messages) * 100
        status_cb(crispin_client.account_id, 'initial',
                  (folder_name, percent_done))
        log.info("Syncing %s -- %.2f%% (%i/%i)" % (
            folder_name, percent_done, num_local_messages,
            num_total_messages))

    # Built from adjacent literals so that no source indentation leaks into
    # the message (as happens with a backslash continuation in a string).
    log.info(("Saved all messages and metadata on {0} to UIDVALIDITY {1} "
              "/ HIGHESTMODSEQ {2}").format(
                  folder_name,
                  crispin_client.selected_uidvalidity,
                  crispin_client.selected_highestmodseq))
def update_saved_g_metadata(crispin_client, db_session, log, folder_name,
                            remote_g_metadata, local_uids, c):
    """
    Bring the saved X-GM-MSGID cache up to date after HIGHESTMODSEQ changed.

    Queries the UIDs that are new or updated since the selected
    HIGHESTMODSEQ, merges metadata for the new ones into
    `remote_g_metadata`, drops entries whose UID is no longer on the remote,
    writes the result to the cache file and finally refreshes metadata for
    the updated UIDs.
    """
    log.info("Updating cache with latest changes")
    # any uids we don't already have will be downloaded correctly
    # as usual, but updated uids need to be updated manually
    # XXX it may actually be faster to just query for X-GM-MSGID for the
    # whole folder rather than getting changed UIDs first; MODSEQ queries
    # are slow on large folders.
    changed_uids = crispin_client.new_and_updated_uids(
        crispin_client.selected_highestmodseq, c)
    new, updated = new_or_updated(changed_uids, local_uids)
    log.info("{0} new and {1} updated UIDs".format(len(new), len(updated)))

    # for new, query metadata and update cache
    fresh_metadata = crispin_client.g_metadata(new, c)
    remote_g_metadata.update(fresh_metadata)

    # filter out messages that have disappeared
    surviving_uids = set(crispin_client.all_uids(c))
    remote_g_metadata = {
        uid: md
        for uid, md in remote_g_metadata.iteritems()
        if uid in surviving_uids
    }
    cache_file = remote_g_metadata_cache_file(crispin_client.account_id,
                                              folder_name)
    set_cache(cache_file, remote_g_metadata)
    log.info("Updated cache with new messages")

    # for updated, it's easier to just update them now
    # bigger chunk because the data being fetched here is very small
    big_chunk_size = 5 * crispin_client.CHUNK_SIZE
    for uid_batch in chunk(updated, big_chunk_size):
        update_metadata(crispin_client, db_session, log, folder_name,
                        uid_batch, c)
    log.info("Updated metadata for modified messages")
def initial_sync_impl(self, crispin_client):
    """
    Run the initial sync for the selected folder.

    Removes locally known UIDs that are gone from the remote, records the
    UID counts, spawns the change poller greenlet and then downloads the
    unknown UIDs in chunks of 1024. For the All Mail folder the UIDs
    labelled "inbox" are placed last in the sorted list, so that the
    reversed iteration fetches them first.
    """
    # The whole body sits in try/finally because greenlets such as
    # change_poller must be killed when this greenlet is interrupted.
    change_poller = None
    try:
        remote_uids = sorted(crispin_client.all_uids(), key=int)
        with self.syncmanager_lock:
            with session_scope(self.namespace_id) as db_session:
                local_uids = common.local_uids(
                    self.account_id, db_session, self.folder_id
                )
            deleted_uids = set(local_uids) - set(remote_uids)
            common.remove_deleted_uids(
                self.account_id, self.folder_id, deleted_uids
            )
            unknown_uids = set(remote_uids) - local_uids
            with session_scope(self.namespace_id) as db_session:
                self.update_uid_counts(
                    db_session,
                    remote_uid_count=len(remote_uids),
                    download_uid_count=len(unknown_uids),
                )

        change_poller = gevent.spawn(self.poll_for_changes)
        bind_context(change_poller, "changepoller", self.account_id,
                     self.folder_id)

        if not self.is_all_mail(crispin_client):
            uids_to_download = sorted(unknown_uids)
        else:
            # Prioritize UIDs for messages in the inbox folder.
            if len(remote_uids) < 1e6:
                criteria = ["X-GM-LABELS", "inbox"]
            else:
                # The unbounded search is really slow (times out) on really
                # large mailboxes, so bound the search to messages within
                # the past month in order to get anywhere.
                since = datetime.utcnow() - timedelta(days=30)
                criteria = ["X-GM-LABELS", "inbox", "SINCE", since]
            inbox_uids = set(crispin_client.search_uids(criteria))

            other_unknown = sorted(unknown_uids - inbox_uids)
            inbox_unknown = sorted(unknown_uids & inbox_uids)
            uids_to_download = other_unknown + inbox_unknown

        for uid_batch in chunk(reversed(uids_to_download), 1024):
            g_metadata = crispin_client.g_metadata(uid_batch)
            # UIDs might have been expunged since sync started, in which
            # case the g_metadata call above will return nothing.
            # They may also have been preemptively downloaded by thread
            # expansion. We can omit such UIDs.
            wanted = [
                u for u in uid_batch
                if u in g_metadata and u not in self.saved_uids
            ]
            self.batch_download_uids(crispin_client, wanted, g_metadata)
    finally:
        if change_poller is not None:
            # schedule change_poller to die
            gevent.kill(change_poller)
def fetch_headers(self, uids):
    """Return the headers fetched for `uids`, merged into one dict.

    The fetch is issued 100 uids at a time because certain providers fail
    with 'Command line too large' when handed too many uids at once.
    """
    fetched = {}
    for batch in chunk(uids, 100):
        response = self.conn.fetch(batch, ['BODY.PEEK[HEADER]'])
        fetched.update(response)
    return fetched
def fetch_headers(self, uids):
    """Fetch message headers for `uids` in batches of 100.

    Batching avoids the 'Command line too large' failure some providers
    raise when too many uids are sent in a single command. The per-batch
    results are accumulated into a single dict, which is returned.
    """
    merged_headers = {}
    for uid_batch in chunk(uids, 100):
        batch_headers = self.conn.fetch(uid_batch, ["BODY.PEEK[HEADER]"])
        merged_headers.update(batch_headers)
    return merged_headers
def update_metadata(crispin_client, db_session, log, folder_name, uids,
                    syncmanager_lock, c):
    """
    Update flags (the only metadata that can change).

    `uids` is processed in chunks of `5 * crispin_client.CHUNK_SIZE`. For
    each chunk the current flags are fetched from the server and handed to
    `account.update_metadata` while holding `syncmanager_lock`; the session
    is committed after every chunk.

    UIDs for which the server returned no flags are skipped instead of
    tripping an assertion: a message can disappear between the moment the
    UID list was built and the flags fetch.
    """
    # bigger chunk because the data being fetched here is very small
    for uid_batch in chunk(uids, 5 * crispin_client.CHUNK_SIZE):
        new_flags = crispin_client.flags(uid_batch, c)
        # Messages can disappear in the meantime; only update the UIDs the
        # server still reported flags for.
        present_uids = [uid for uid in uid_batch if uid in new_flags]
        log.info("new flags: {0}".format(new_flags))
        with syncmanager_lock:
            account.update_metadata(crispin_client.account_id, db_session,
                                    folder_name, present_uids, new_flags)
            db_session.commit()
def initial_sync_impl(self, crispin_client):
    """
    Perform the first full sync of the selected folder.

    Steps, in order: drop local UIDs missing from the remote, store the
    remote/download UID counts, start the change poller greenlet, then
    download every unknown UID in batches of 1024 (iterating the ordered
    list in reverse). The change poller is killed when this method exits,
    whether normally or through an exception.
    """
    # try/finally: greenlets like change_poller need to be killed when this
    # greenlet is interrupted.
    change_poller = None
    try:
        remote_uids = sorted(crispin_client.all_uids(), key=int)
        remote_uid_set = set(remote_uids)
        with self.syncmanager_lock:
            with session_scope(self.namespace_id) as db_session:
                local_uids = common.local_uids(self.account_id, db_session,
                                               self.folder_id)
            common.remove_deleted_uids(self.account_id, self.folder_id,
                                       set(local_uids) - remote_uid_set)
            unknown_uids = remote_uid_set - local_uids
            with session_scope(self.namespace_id) as db_session:
                self.update_uid_counts(
                    db_session,
                    remote_uid_count=len(remote_uids),
                    download_uid_count=len(unknown_uids))

        change_poller = spawn(self.poll_for_changes)
        bind_context(change_poller, 'changepoller', self.account_id,
                     self.folder_id)

        if self.is_all_mail(crispin_client):
            # Prioritize UIDs for messages in the inbox folder.
            search_criteria = ['X-GM-LABELS', 'inbox']
            if not len(remote_uids) < 1e6:
                # The plain label search is really slow (times out) on
                # really large mailboxes, so bound the search to messages
                # within the past month in order to get anywhere.
                since = datetime.utcnow() - timedelta(days=30)
                search_criteria = ['X-GM-LABELS', 'inbox', 'SINCE', since]
            inbox_uids = set(crispin_client.search_uids(search_criteria))

            uids_to_download = sorted(unknown_uids - inbox_uids)
            uids_to_download = uids_to_download + sorted(
                unknown_uids & inbox_uids)
        else:
            uids_to_download = sorted(unknown_uids)

        for batch in chunk(reversed(uids_to_download), 1024):
            g_metadata = crispin_client.g_metadata(batch)
            # UIDs might have been expunged since sync started, in which
            # case the g_metadata call above will return nothing.
            # They may also have been preemptively downloaded by thread
            # expansion. We can omit such UIDs.
            still_needed = [u for u in batch
                            if u in g_metadata and
                            u not in self.saved_uids]
            self.batch_download_uids(crispin_client, still_needed,
                                     g_metadata)
    finally:
        if change_poller is not None:
            # schedule change_poller to die
            kill(change_poller)
def update_metadata(self, crispin_client, updated):
    """
    Refresh the stored flags (the only metadata that can change) for the
    UIDs in `updated`.

    Flags are fetched in chunks of `5 * crispin_client.CHUNK_SIZE`; each
    chunk is written through `common.update_metadata` inside its own
    database session while holding `self.syncmanager_lock`.
    """
    # bigger chunk because the data being fetched here is very small
    batch_size = 5 * crispin_client.CHUNK_SIZE
    for uid_batch in chunk(updated, batch_size):
        new_flags = crispin_client.flags(uid_batch)
        # Messages can disappear in the meantime; we'll update them next
        # sync.
        present_uids = [uid for uid in uid_batch if uid in new_flags]
        with self.syncmanager_lock:
            with mailsync_session_scope() as db_session:
                common.update_metadata(
                    self.account_id,
                    db_session,
                    self.folder_name,
                    self.folder_id,
                    present_uids,
                    new_flags,
                )
                db_session.commit()
def update_metadata(crispin_client, db_session, log, folder_name, uids,
                    syncmanager_lock):
    """
    Refresh flags (the only metadata that can change) for `uids`.

    Works through `uids` in chunks of `5 * crispin_client.CHUNK_SIZE`,
    fetching the flags for each chunk and storing them with
    `account.update_metadata` under `syncmanager_lock`, committing
    `db_session` once per chunk.
    """
    # bigger chunk because the data being fetched here is very small
    batch_size = 5 * crispin_client.CHUNK_SIZE
    for uid_batch in chunk(uids, batch_size):
        new_flags = crispin_client.flags(uid_batch)
        # messages can disappear in the meantime; we'll update them next sync
        present_uids = [uid for uid in uid_batch if uid in new_flags]
        log.info("new flags ", new_flags=new_flags, folder_name=folder_name)
        with syncmanager_lock:
            log.debug("update_metadata acquired syncmanager_lock")
            account.update_metadata(
                crispin_client.account_id,
                db_session,
                folder_name,
                present_uids,
                new_flags,
            )
            db_session.commit()
def update_metadata(crispin_client, log, folder_name, uids,
                    syncmanager_lock):
    """
    Refresh flags (the only metadata that can change) for `uids`.

    For every chunk of `5 * crispin_client.CHUNK_SIZE` UIDs the flags are
    fetched, then saved through `account.update_metadata` in a fresh
    session opened with `ignore_soft_deletes=False`, all while holding
    `syncmanager_lock`.
    """
    # bigger chunk because the data being fetched here is very small
    batch_size = 5 * crispin_client.CHUNK_SIZE
    for uid_batch in chunk(uids, batch_size):
        new_flags = crispin_client.flags(uid_batch)
        # messages can disappear in the meantime; we'll update them next sync
        present_uids = [uid for uid in uid_batch if uid in new_flags]
        log.info("new flags ", new_flags=new_flags, folder_name=folder_name)
        with syncmanager_lock:
            log.debug("update_metadata acquired syncmanager_lock")
            with session_scope(ignore_soft_deletes=False) as db_session:
                account.update_metadata(
                    crispin_client.account_id,
                    db_session,
                    folder_name,
                    present_uids,
                    new_flags,
                )
                db_session.commit()
def __download_thread(self, crispin_client, thread_g_metadata, g_thrid,
                      thread_uids):
    """
    Download every message of the thread identified by `g_thrid`.

    The thread's UIDs are first deduplicated, then downloaded and committed
    in chunks of `crispin_client.CHUNK_SIZE`, in the order the
    deduplication step returned them. Per the original design notes this is
    oldest-first via All Mail, so that a thread started by a message sent
    from the Inbox API can be reconciled with the existing message/thread.

    Returns the number of messages left after deduplication.
    """
    log.debug("downloading thread", g_thrid=g_thrid,
              message_count=len(thread_uids))
    pending_uids = self.__deduplicate_message_download(
        crispin_client, thread_g_metadata, thread_uids)
    log.debug(deduplicated_message_count=len(pending_uids))
    for uid_batch in chunk(pending_uids, crispin_client.CHUNK_SIZE):
        self.download_and_commit_uids(crispin_client, uid_batch)
    return len(pending_uids)
def _run_impl(self):
    """
    Refresh message metadata in every folder after a label rename.

    Holds `self.semaphore` for the whole run. For each folder of the
    account, searches for UIDs carrying `self.label_name`, fetches their
    flags 200 at a time and passes them to `common.update_metadata`,
    committing once per chunk.
    """
    self.log.info("Starting LabelRenameHandler", label_name=self.label_name)
    self.semaphore.acquire(blocking=True)
    try:
        with connection_pool(self.account_id).get() as crispin_client:
            with session_scope(self.account_id) as db_session:
                account_folders = db_session.query(Folder).filter(
                    Folder.account_id == self.account_id)
                folder_names = [f.name for f in account_folders]
                db_session.expunge_all()

            for folder_name in folder_names:
                crispin_client.select_folder(folder_name, uidvalidity_cb)
                label_criteria = ["X-GM-LABELS",
                                  utf7_encode(self.label_name)]
                found_uids = crispin_client.search_uids(label_criteria)

                for uid_batch in chunk(found_uids, 200):
                    flags = crispin_client.flags(uid_batch)
                    self.log.info(
                        "Running metadata update for folder",
                        folder_name=folder_name,
                    )
                    with session_scope(self.account_id) as db_session:
                        folder_query = db_session.query(Folder).options(
                            load_only("id"))
                        fld = folder_query.filter(
                            Folder.account_id == self.account_id,
                            Folder.name == folder_name,
                        ).one()
                        common.update_metadata(
                            self.account_id,
                            fld.id,
                            fld.canonical_name,
                            flags,
                            db_session,
                        )
                        db_session.commit()
    finally:
        self.semaphore.release()
def __update_saved_g_metadata(self, crispin_client, remote_g_metadata,
                              local_uids):
    """
    Refresh the saved X-GM-MSGID cache after HIGHESTMODSEQ has changed.

    Merges metadata for new UIDs into `remote_g_metadata`, removes entries
    for UIDs no longer present on the remote, writes the cache file and
    updates metadata for the modified UIDs.

    Returns the number of updated UIDs (0 when there are none).
    """
    log.info('Updating cache with latest changes')
    # Any uids we don't already have will be downloaded correctly as usual,
    # but updated uids need to be updated manually.
    # XXX it may actually be faster to just query for X-GM-MSGID for the
    # whole folder rather than getting changed UIDs first; MODSEQ queries
    # are slow on large folders.
    modified = crispin_client.new_and_updated_uids(
        crispin_client.selected_highestmodseq)
    log.info(modified_msg_count=len(modified))
    new, updated = new_or_updated(modified, local_uids)
    log.info(new_uid_count=len(new), updated_uid_count=len(updated))

    if not new:
        log.info('No new messages to update metadata for')
    else:
        remote_g_metadata.update(crispin_client.g_metadata(new))
        log.info('Updated cache with new messages')

    # Filter out messages that have disappeared.
    count_before = len(remote_g_metadata)
    current_remote_uids = set(crispin_client.all_uids())
    remote_g_metadata = {
        uid: md
        for uid, md in remote_g_metadata.iteritems()
        if uid in current_remote_uids
    }
    num_removed = count_before - len(remote_g_metadata)
    if num_removed > 0:
        log.info(removed_msg_count=num_removed)

    cache_file = remote_g_metadata_cache_file(self.account_id,
                                              self.folder_name)
    set_cache(cache_file, remote_g_metadata)

    if not updated:
        log.info('No modified messages to update metadata for')
        return 0

    # It's easy and fast to just update these here and now.
    # Bigger chunk because the data being fetched here is very small.
    for uid_batch in chunk(updated, 5 * crispin_client.CHUNK_SIZE):
        self.update_metadata(crispin_client, uid_batch)
    log.info('updated metadata for modified messages',
             msg_count=len(updated))
    return len(updated)
def __update_saved_g_metadata(self, crispin_client, remote_g_metadata,
                              local_uids):
    """
    Update the saved X-GM-MSGID cache with changes made since the selected
    HIGHESTMODSEQ.

    New UIDs get their metadata added to `remote_g_metadata`; entries whose
    UID has vanished from the remote are dropped before the cache is
    written; modified UIDs have their metadata refreshed immediately.

    Returns `len(updated)` when there were modified UIDs, otherwise 0.
    """
    log.info('Updating cache with latest changes')
    # Any uids we don't already have will be downloaded correctly as usual,
    # but updated uids need to be updated manually.
    # XXX it may actually be faster to just query for X-GM-MSGID for the
    # whole folder rather than getting changed UIDs first; MODSEQ queries
    # are slow on large folders.
    saved_modseq = crispin_client.selected_highestmodseq
    modified = crispin_client.new_and_updated_uids(saved_modseq)
    log.info(modified_msg_count=len(modified))
    new, updated = new_or_updated(modified, local_uids)
    log.info(new_uid_count=len(new), updated_uid_count=len(updated))

    if new:
        new_metadata = crispin_client.g_metadata(new)
        remote_g_metadata.update(new_metadata)
        log.info('Updated cache with new messages')
    else:
        log.info('No new messages to update metadata for')

    # Filter out messages that have disappeared.
    old_len = len(remote_g_metadata)
    current_remote_uids = set(crispin_client.all_uids())
    kept = {}
    for uid, md in remote_g_metadata.iteritems():
        if uid in current_remote_uids:
            kept[uid] = md
    remote_g_metadata = kept
    num_removed = old_len - len(remote_g_metadata)
    if num_removed > 0:
        log.info(removed_msg_count=num_removed)

    set_cache(
        remote_g_metadata_cache_file(self.account_id, self.folder_name),
        remote_g_metadata)

    updated_count = 0
    if updated:
        # It's easy and fast to just update these here and now.
        # Bigger chunk because the data being fetched here is very small.
        for uid_batch in chunk(updated, 5 * crispin_client.CHUNK_SIZE):
            self.update_metadata(crispin_client, uid_batch)
        log.info('updated metadata for modified messages',
                 msg_count=len(updated))
        updated_count = len(updated)
    else:
        log.info('No modified messages to update metadata for')
    return updated_count
def download_thread(crispin_client, db_session, log, syncmanager_lock,
                    thread_g_metadata, g_thrid, thread_uids):
    """
    Download all messages of the thread identified by `g_thrid`.

    The thread's UIDs are deduplicated first; the remaining ones are
    iterated in reverse (most-recent-first, per the original design notes)
    and downloaded via All Mail in chunks of `crispin_client.CHUNK_SIZE`,
    which allows fetching the entire thread regardless of which folders it
    is in.
    """
    log.debug("Downloading thread {} with {} messages."
              .format(g_thrid, len(thread_uids)))
    pending_uids = deduplicate_message_download(
        crispin_client, db_session, log, syncmanager_lock,
        thread_g_metadata, thread_uids)
    log.debug("{} deduplicated messages to download."
              .format(len(pending_uids)))

    for uid_batch in chunk(reversed(pending_uids),
                           crispin_client.CHUNK_SIZE):
        gmail_download_and_commit_uids(
            crispin_client,
            db_session,
            log,
            crispin_client.selected_folder_name,
            uid_batch,
            create_gmail_message,
            syncmanager_lock,
        )
def __download_thread(self, crispin_client, thread_g_metadata, g_thrid,
                      thread_uids):
    """
    Download all messages of the thread identified by `g_thrid`.

    After deduplication the remaining UIDs are walked in reverse
    (most-recent-first, per the original design notes) and downloaded via
    All Mail in chunks of `crispin_client.CHUNK_SIZE`, which allows
    fetching the entire thread regardless of which folders it is in.

    Returns the number of messages left after deduplication.
    """
    log.debug('downloading thread', g_thrid=g_thrid,
              message_count=len(thread_uids))
    pending_uids = self.__deduplicate_message_download(
        crispin_client, thread_g_metadata, thread_uids)
    log.debug(deduplicated_message_count=len(pending_uids))

    newest_first = reversed(pending_uids)
    for uid_batch in chunk(newest_first, crispin_client.CHUNK_SIZE):
        self.download_and_commit_uids(
            crispin_client,
            crispin_client.selected_folder_name,
            uid_batch,
        )
    return len(pending_uids)
def chunked_thread_download(crispin_client, db_session, log, folder_name,
                            g_metadata, uids, status_cb, syncmanager_lock,
                            c):
    """
    Expand the threads touched by `uids` and download them 100 at a time.

    UIDs and g_metadata passed in are for the _folder that threads are
    being expanded in_.

    Messages are downloaded by thread, most-recent-thread-first,
    newest-to-oldest in thread. (Threads are expanded to all messages in
    the email archive that belong to the threads corresponding to the
    given uids.)

    `g_metadata` maps a UID to a mapping with at least the keys 'thrid'
    and 'msgid'; only entries whose UID is in `uids` are considered.

    NOTE: this method will leave All Mail selected, since selecting
    folders is expensive and we don't want to assume what the caller needs
    to do next.
    """
    # Hash the UIDs once so the membership test below is O(1) per entry
    # instead of a scan of `uids` for every item of g_metadata.
    wanted_uids = set(uids)
    thrids = set()
    folder_g_msgids = set()
    for uid, msg in g_metadata.items():
        if uid in wanted_uids:
            thrids.add(msg['thrid'])
            folder_g_msgids.add(msg['msgid'])

    # X-GM-THRID is roughly ascending over time, so sort most-recent first
    all_g_thrids = sorted(thrids, reverse=True)
    log.info("{0} threads found".format(len(all_g_thrids)))

    # Flags are fetched while the original folder is still selected.
    flags = crispin_client.flags(uids, c)

    crispin_client.select_folder(
        crispin_client.folder_names(c)['All'],
        uidvalidity_callback(db_session, crispin_client.account_id),
        c)

    log.info("Expanding threads and downloading messages.")

    # We can't determine how many threads we have fully downloaded locally
    # before expansion, so we start from 0 every time and skip
    # already-downloaded messages along the way.
    num_downloaded_threads = 0
    num_total_threads = len(all_g_thrids)
    acc = db_session.query(ImapAccount).join(Namespace).filter_by(
        id=crispin_client.account_id).one()
    for g_thrids in chunk(all_g_thrids, 100):
        # download_threads returns the updated running total.
        num_downloaded_threads = download_threads(
            crispin_client, db_session, log, acc, folder_name, g_thrids,
            flags, folder_g_msgids, num_downloaded_threads,
            num_total_threads, status_cb, syncmanager_lock, c)
def __download_thread(self, crispin_client, thread_g_metadata, g_thrid,
                      thread_uids):
    """
    Fetch and commit every message belonging to thread `g_thrid`.

    UIDs that survive deduplication are processed in the order returned,
    `crispin_client.CHUNK_SIZE` at a time. The original design notes state
    that this is oldest-first via All Mail, so that a thread which started
    with a message sent from the Inbox API can be reconciled with the
    existing message/thread.

    Returns how many messages remained after deduplication.
    """
    log.debug('downloading thread', g_thrid=g_thrid,
              message_count=len(thread_uids))
    remaining = self.__deduplicate_message_download(
        crispin_client,
        thread_g_metadata,
        thread_uids,
    )
    log.debug(deduplicated_message_count=len(remaining))
    chunk_size = crispin_client.CHUNK_SIZE
    for uid_batch in chunk(remaining, chunk_size):
        self.download_and_commit_uids(crispin_client, uid_batch)
    return len(remaining)
def get_accounts_folders(self, account_ids):
    """
    Return a dict mapping `int(account_id)` to that account's
    `zrange(..., withscores=True)` result.

    Account ids are grouped by redis shard so each shard is queried through
    its own client. Within a shard the requests are pipelined, at most
    10000 accounts per pipeline, to reduce the number of round trips.
    """
    # This is where things get interesting --- we need to make queries
    # to multiple shards and return the results to a single caller.
    shard_num = heartbeat_config.account_redis_shard_number
    # groupby only merges adjacent items, hence the sort on the same key.
    # http://stackoverflow.com/questions/8793772/how-to-split-a-sequence-according-to-a-predicate
    ids_sorted_by_shard = sorted(account_ids, key=shard_num)
    account_ids_grouped_by_shards = [
        list(members)
        for _, members in itertools.groupby(ids_sorted_by_shard,
                                            key=shard_num)
    ]

    results = {}
    for account_group in account_ids_grouped_by_shards:
        if not account_group:
            continue
        client = heartbeat_config.get_redis_client(account_group[0])

        # Because of the way pipelining works, redis buffers data.
        # We break our requests in chunk to not have to ask for
        # impossibly big numbers.
        for id_batch in chunk(account_group, 10000):
            pipe = client.pipeline()
            for index in id_batch:
                pipe.zrange(index, 0, -1, withscores=True)
            pipe_results = pipe.execute()

            for position, account_id in enumerate(id_batch):
                results[int(account_id)] = pipe_results[position]
    return results
def _run_impl(self):
    """
    Re-sync message metadata across all folders after a label rename.

    The semaphore is acquired up front and released in `finally`. Folder
    names are loaded once; then, per folder, the UIDs matching
    `self.label_name` are searched and their flags are pushed through
    `common.update_metadata` in chunks of 200, one session per chunk.
    """
    self.log.info('Starting LabelRenameHandler',
                  label_name=self.label_name)
    self.semaphore.acquire(blocking=True)
    try:
        with connection_pool(self.account_id).get() as crispin_client:
            folder_names = []
            with session_scope(self.account_id) as db_session:
                folders = db_session.query(Folder).filter(
                    Folder.account_id == self.account_id)
                for folder in folders:
                    folder_names.append(folder.name)
                db_session.expunge_all()

            for folder_name in folder_names:
                crispin_client.select_folder(folder_name, uidvalidity_cb)
                found_uids = crispin_client.search_uids(
                    ['X-GM-LABELS', utf7_encode(self.label_name)])

                for uid_batch in chunk(found_uids, 200):
                    flags = crispin_client.flags(uid_batch)
                    self.log.info('Running metadata update for folder',
                                  folder_name=folder_name)
                    with session_scope(self.account_id) as db_session:
                        fld = (
                            db_session.query(Folder)
                            .options(load_only("id"))
                            .filter(Folder.account_id == self.account_id,
                                    Folder.name == folder_name)
                            .one()
                        )
                        common.update_metadata(self.account_id, fld.id,
                                               fld.canonical_name, flags,
                                               db_session)
                        db_session.commit()
    finally:
        self.semaphore.release()
def get_accounts_folders(self, account_ids):
    """
    Query redis for several accounts at once.

    This is the preferred way to query multiple accounts: ids are split by
    shard, each shard's client is asked through pipelines of up to 10000
    `zrange(id, 0, -1, withscores=True)` commands, and the answers are
    collected in a dict keyed by `int(account_id)`.
    """
    # This is where things get interesting --- we need to make queries
    # to multiple shards and return the results to a single caller.
    shard_num = heartbeat_config.account_redis_shard_number

    # Group account ids by shard; the input is sorted by shard first
    # because groupby only groups consecutive items.
    # http://stackoverflow.com/questions/8793772/how-to-split-a-sequence-according-to-a-predicate
    account_ids_grouped_by_shards = []
    grouped = itertools.groupby(sorted(account_ids, key=shard_num),
                                key=shard_num)
    for shard_and_members in grouped:
        account_ids_grouped_by_shards.append(list(shard_and_members[1]))

    results = dict()
    for account_group in account_ids_grouped_by_shards:
        if account_group:
            client = heartbeat_config.get_redis_client(account_group[0])

            # Because of the way pipelining works, redis buffers data.
            # We break our requests in chunk to not have to ask for
            # impossibly big numbers.
            for chnk in chunk(account_group, 10000):
                pipe = client.pipeline()
                for index in chnk:
                    pipe.zrange(index, 0, -1, withscores=True)
                pipe_results = pipe.execute()

                for i, raw_account_id in enumerate(chnk):
                    key = int(raw_account_id)
                    results[key] = pipe_results[i]
    return results
def initial_sync_impl(self, crispin_client):
    """
    Resync the selected folder's messages to S3, newest UID first.

    The account's saved `s3_resync_status` decides where to start: a folder
    marked 'done' ends immediately, and a saved `last_synced_uid` resumes
    from that UID. UIDs are then handled in batches of `BATCH_SIZE`; only
    those for which `_message_missing_s3_object` is true are downloaded.
    After each batch the position is saved, the remaining-messages gauge
    is updated and the greenlet sleeps for `S3_RESYNC_FREQUENCY`.

    Always ends by raising `MailsyncDone` once there is nothing left to do.
    """
    assert crispin_client.selected_folder_name == self.folder_name

    remote_uids = crispin_client.all_uids()
    uids = sorted(remote_uids, reverse=True)

    starting_uid = None
    with session_scope(self.namespace_id) as db_session:
        account = db_session.query(Account).get(self.account_id)
        s3_resync_status = account._sync_status.get(
            's3_resync_status', {})

        folder_id = str(self.folder_id)
        if folder_id in s3_resync_status:
            folder_status = s3_resync_status[folder_id]
            resync_status = folder_status.get('status')

            # We've synced everything we had to sync.
            if resync_status == 'done':
                raise MailsyncDone()

            starting_uid = folder_status.get('last_synced_uid')

    if starting_uid is not None:
        # We're not starting from zero
        try:
            i = uids.index(starting_uid)
            uids = uids[i:]
        except ValueError:
            # The saved UID is no longer on the remote; start from the top.
            pass

    # We need the provider and account id to ship per-account
    # data to statsd.
    with session_scope(self.namespace_id) as db_session:
        account = db_session.query(Account).get(self.account_id)
        statsd_prefix = '.'.join(['s3_resync', account.provider,
                                  str(account.id), str(self.folder_id)])

    statsd_client.gauge(statsd_prefix + '.messages_total',
                        len(remote_uids))

    remaining_messages = len(uids)
    statsd_client.gauge(statsd_prefix + '.remaining_messages',
                        remaining_messages)

    if not uids:
        log.info('Done syncing to S3', account_id=self.account_id)
        self._update_uid_resync_status(status='done')
        raise MailsyncDone()

    for chnk in chunk(uids, BATCH_SIZE):
        to_download = [uid for uid in chnk
                       if _message_missing_s3_object(
                           self.account_id, self.folder_id, uid)]
        self.download_and_commit_uids(crispin_client, to_download)

        # FIXME: publish some heartbeats.
        log.info('Resynced another batch of uids. Updating position.',
                 batch_size=BATCH_SIZE, position=chnk[-1])
        self._update_uid_resync_status(uid=chnk[-1])

        # Subtract what was actually processed: the last chunk can be
        # shorter than BATCH_SIZE, and subtracting the full batch size
        # would push the gauge below zero.
        remaining_messages -= len(chnk)
        statsd_client.gauge(statsd_prefix + '.remaining_messages',
                            remaining_messages)

        sleep(S3_RESYNC_FREQUENCY)

    self._update_uid_resync_status(status='done')
    raise MailsyncDone()
def condstore_refresh_flags(self, crispin_client):
    """
    Refresh locally stored flags using CONDSTORE's HIGHESTMODSEQ.

    Does nothing when the server's HIGHESTMODSEQ equals the saved one, and
    only logs a warning when it is lower. Otherwise the changed flags are
    applied in batches ordered by modseq, locally known UIDs that are gone
    from the remote are removed, and the new HIGHESTMODSEQ is saved.
    """
    folder_status = crispin_client.conn.folder_status(
        self.folder_name, ['HIGHESTMODSEQ'])
    new_highestmodseq = folder_status['HIGHESTMODSEQ']

    # Ensure that we have an initial highestmodseq value stored before we
    # begin polling for changes.
    if self.highestmodseq is None:
        self.highestmodseq = new_highestmodseq

    if new_highestmodseq == self.highestmodseq:
        # Don't need to do anything if the highestmodseq hasn't
        # changed.
        return
    if new_highestmodseq < self.highestmodseq:
        # This should really never happen, but if it does, handle it.
        log.warning('got server highestmodseq less than saved '
                    'highestmodseq',
                    new_highestmodseq=new_highestmodseq,
                    saved_highestmodseq=self.highestmodseq)
        return

    log.info('HIGHESTMODSEQ has changed, getting changed UIDs',
             new_highestmodseq=new_highestmodseq,
             saved_highestmodseq=self.highestmodseq)
    crispin_client.select_folder(self.folder_name, self.uidvalidity_cb)
    changed_flags = crispin_client.condstore_changed_flags(
        self.highestmodseq)
    remote_uids = crispin_client.all_uids()

    # In order to be able to sync changes to tens of thousands of flags at
    # once, we commit updates in batches. We do this in ascending order by
    # modseq and periodically "checkpoint" our saved highestmodseq. (It's
    # safe to checkpoint *because* we go in ascending order by modseq.)
    # That way if the process gets restarted halfway through this refresh,
    # we don't have to completely start over. It's also slow to load many
    # objects into the SQLAlchemy session and then issue lots of commits;
    # we avoid that by batching.
    flags_by_modseq = sorted(changed_flags.items(),
                             key=lambda item: item[1].modseq)
    for flag_batch in chunk(flags_by_modseq,
                            CONDSTORE_FLAGS_REFRESH_BATCH_SIZE):
        with session_scope(self.namespace_id) as db_session:
            common.update_metadata(self.account_id, self.folder_id,
                                   self.folder_role, dict(flag_batch),
                                   db_session)
        # Only full batches move the checkpoint forward.
        if len(flag_batch) == CONDSTORE_FLAGS_REFRESH_BATCH_SIZE:
            self.highestmodseq = max(
                flags.modseq for _, flags in flag_batch)

    with session_scope(self.namespace_id) as db_session:
        local_uids = common.local_uids(self.account_id, db_session,
                                       self.folder_id)
        expunged_uids = set(local_uids).difference(remote_uids)

    if expunged_uids:
        # If new UIDs have appeared since we last checked in
        # get_new_uids, save them first. We want to always have the
        # latest UIDs before expunging anything, in order to properly
        # capture draft revisions.
        with session_scope(self.namespace_id) as db_session:
            lastseenuid = common.lastseenuid(self.account_id, db_session,
                                             self.folder_id)
        if remote_uids and lastseenuid < max(remote_uids):
            log.info('Downloading new UIDs before expunging')
            self.get_new_uids(crispin_client)
        common.remove_deleted_uids(self.account_id, self.folder_id,
                                   expunged_uids)
    self.highestmodseq = new_highestmodseq
def condstore_refresh_flags(self, crispin_client):
    """
    Refresh locally stored flags using CONDSTORE's HIGHESTMODSEQ.

    Returns early when the server value equals the saved one (nothing to
    do) or is lower (logged as a warning). Otherwise applies the changed
    flags in modseq-ordered batches, removes local UIDs that are no longer
    on the remote while holding `self.syncmanager_lock`, and stores the new
    HIGHESTMODSEQ.
    """
    folder_status = crispin_client.conn.folder_status(
        self.folder_name, ["HIGHESTMODSEQ"])
    new_highestmodseq = folder_status[b"HIGHESTMODSEQ"]  # type: int

    # Ensure that we have an initial highestmodseq value stored before we
    # begin polling for changes.
    if self.highestmodseq is None:
        self.highestmodseq = new_highestmodseq

    if new_highestmodseq == self.highestmodseq:
        # Don't need to do anything if the highestmodseq hasn't
        # changed.
        return
    if new_highestmodseq < self.highestmodseq:
        # This should really never happen, but if it does, handle it.
        log.warning(
            "got server highestmodseq less than saved " "highestmodseq",
            new_highestmodseq=new_highestmodseq,
            saved_highestmodseq=self.highestmodseq,
        )
        return

    log.debug(
        "HIGHESTMODSEQ has changed, getting changed UIDs",
        new_highestmodseq=new_highestmodseq,
        saved_highestmodseq=self.highestmodseq,
    )
    crispin_client.select_folder(self.folder_name, self.uidvalidity_cb)
    changed_flags = crispin_client.condstore_changed_flags(
        self.highestmodseq)
    remote_uids = crispin_client.all_uids()

    # In order to be able to sync changes to tens of thousands of flags at
    # once, we commit updates in batches. We do this in ascending order by
    # modseq and periodically "checkpoint" our saved highestmodseq. (It's
    # safe to checkpoint *because* we go in ascending order by modseq.)
    # That way if the process gets restarted halfway through this refresh,
    # we don't have to completely start over. It's also slow to load many
    # objects into the SQLAlchemy session and then issue lots of commits;
    # we avoid that by batching.
    flags_by_modseq = sorted(
        changed_flags.items(),
        key=lambda uid_and_flags: uid_and_flags[1].modseq,
    )
    for flag_batch in chunk(flags_by_modseq,
                            CONDSTORE_FLAGS_REFRESH_BATCH_SIZE):
        with session_scope(self.namespace_id) as db_session:
            common.update_metadata(
                self.account_id,
                self.folder_id,
                self.folder_role,
                dict(flag_batch),
                db_session,
            )
        # Only full batches move the checkpoint forward.
        if len(flag_batch) == CONDSTORE_FLAGS_REFRESH_BATCH_SIZE:
            self.highestmodseq = max(
                flags.modseq for _, flags in flag_batch)

    with session_scope(self.namespace_id) as db_session:
        local_uids = common.local_uids(self.account_id, db_session,
                                       self.folder_id)
        expunged_uids = set(local_uids).difference(remote_uids)

    if expunged_uids:
        # If new UIDs have appeared since we last checked in
        # get_new_uids, save them first. We want to always have the
        # latest UIDs before expunging anything, in order to properly
        # capture draft revisions.
        with session_scope(self.namespace_id) as db_session:
            lastseenuid = common.lastseenuid(self.account_id, db_session,
                                             self.folder_id)
        if remote_uids and lastseenuid < max(remote_uids):
            log.info("Downloading new UIDs before expunging")
            self.get_new_uids(crispin_client)
        with self.syncmanager_lock:
            common.remove_deleted_uids(self.account_id, self.folder_id,
                                       expunged_uids)
    self.highestmodseq = new_highestmodseq