Example #1
def chunked_uid_download(crispin_client, db_session, log,
        folder_name, uids, num_local_messages, num_total_messages, status_cb,
        syncmanager_lock, download_commit_fn, msg_create_fn, c):
    log.info("{0} uids left to fetch".format(len(uids)))

    if uids:
        chunk_size = crispin_client.CHUNK_SIZE
        log.info("Starting sync for {0} with chunks of size {1}"\
                .format(folder_name, chunk_size))
        # we prioritize message download by reverse-UID order, which
        # generally puts more recent messages first
        for uids in chunk(reversed(uids), chunk_size):
            num_local_messages += download_commit_fn(crispin_client,
                    db_session, log, folder_name, uids, msg_create_fn,
                    syncmanager_lock, c)

            # Use float math: Python 2 integer division would floor this to 0.
            percent_done = num_local_messages * 100.0 / num_total_messages
            status_cb(crispin_client.account_id,
                    'initial', (folder_name, percent_done))
            log.info("Syncing %s -- %.2f%% (%i/%i)" % (folder_name,
                percent_done, num_local_messages, num_total_messages))
        log.info("Saved all messages and metadata on {0} to UIDVALIDITY {1} \
            / HIGHESTMODSEQ {2}".format(folder_name,
                crispin_client.selected_uidvalidity,
                crispin_client.selected_highestmodseq))
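
All of the snippets on this page revolve around the same `chunk` helper. Its definition isn't shown here; the following is a minimal sketch, assuming only that it yields successive lists of at most `n` items from any iterable (consistent with the examples passing it both lists and `reversed(...)` iterators):

def chunk(iterable, n):
    """Yield successive lists of at most `n` items from `iterable`."""
    batch = []
    for item in iterable:
        batch.append(item)
        if len(batch) == n:
            yield batch
            batch = []
    if batch:
        # Final, possibly short, batch.
        yield batch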
Example #2
def update_saved_g_metadata(crispin_client, db_session, log, folder_name,
        remote_g_metadata, local_uids, c):
    """ If HIGHESTMODSEQ has changed since we saved the X-GM-MSGID cache,
        we need to query for any changes since then and update the saved
        data.
    """
    log.info("Updating cache with latest changes")
    # any uids we don't already have will be downloaded correctly
    # as usual, but updated uids need to be updated manually
    # XXX it may actually be faster to just query for X-GM-MSGID for the
    # whole folder rather than getting changed UIDs first; MODSEQ queries
    # are slow on large folders.
    modified = crispin_client.new_and_updated_uids(
            crispin_client.selected_highestmodseq, c)
    new, updated = new_or_updated(modified, local_uids)
    log.info("{0} new and {1} updated UIDs".format(len(new), len(updated)))
    # for new, query metadata and update cache
    remote_g_metadata.update(crispin_client.g_metadata(new, c))
    # filter out messages that have disappeared
    all_uids = set(crispin_client.all_uids(c))
    remote_g_metadata = dict((uid, md) for uid, md in \
            remote_g_metadata.iteritems() if uid in all_uids)
    set_cache(remote_g_metadata_cache_file(crispin_client.account_id,
        folder_name), remote_g_metadata)
    log.info("Updated cache with new messages")
    # for updated, it's easier to just update them now
    # bigger chunk because the data being fetched here is very small
    for uids in chunk(updated, 5*crispin_client.CHUNK_SIZE):
        update_metadata(crispin_client, db_session, log, folder_name, uids, c)
    log.info("Updated metadata for modified messages")
Example #3
    def initial_sync_impl(self, crispin_client):
        # We wrap the block in a try/finally because the greenlets like
        # change_poller need to be killed when this greenlet is interrupted
        change_poller = None
        try:
            remote_uids = sorted(crispin_client.all_uids(), key=int)
            with self.syncmanager_lock:
                with session_scope(self.namespace_id) as db_session:
                    local_uids = common.local_uids(
                        self.account_id, db_session, self.folder_id
                    )
                common.remove_deleted_uids(
                    self.account_id, self.folder_id, set(local_uids) - set(remote_uids)
                )
                unknown_uids = set(remote_uids) - local_uids
                with session_scope(self.namespace_id) as db_session:
                    self.update_uid_counts(
                        db_session,
                        remote_uid_count=len(remote_uids),
                        download_uid_count=len(unknown_uids),
                    )

            change_poller = gevent.spawn(self.poll_for_changes)
            bind_context(change_poller, "changepoller", self.account_id, self.folder_id)

            if self.is_all_mail(crispin_client):
                # Prioritize UIDs for messages in the inbox folder.
                if len(remote_uids) < 1e6:
                    inbox_uids = set(
                        crispin_client.search_uids(["X-GM-LABELS", "inbox"])
                    )
                else:
                    # The search above is really slow (times out) on really
                    # large mailboxes, so bound the search to messages within
                    # the past month in order to get anywhere.
                    since = datetime.utcnow() - timedelta(days=30)
                    inbox_uids = set(
                        crispin_client.search_uids(
                            ["X-GM-LABELS", "inbox", "SINCE", since]
                        )
                    )

                uids_to_download = sorted(unknown_uids - inbox_uids) + sorted(
                    unknown_uids & inbox_uids
                )
            else:
                uids_to_download = sorted(unknown_uids)

            for uids in chunk(reversed(uids_to_download), 1024):
                g_metadata = crispin_client.g_metadata(uids)
                # UIDs might have been expunged since sync started, in which
                # case the g_metadata call above will return nothing.
                # They may also have been preemptively downloaded by thread
                # expansion. We can omit such UIDs.
                uids = [u for u in uids if u in g_metadata and u not in self.saved_uids]
                self.batch_download_uids(crispin_client, uids, g_metadata)
        finally:
            if change_poller is not None:
                # schedule change_poller to die
                gevent.kill(change_poller)
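
The two-part sort above is what makes inbox messages arrive first: non-inbox UIDs go at the front of the list, inbox UIDs at the back, and the whole list is then consumed in reverse, chunk by chunk. A toy illustration with made-up UIDs (reusing the `chunk` sketch from Example #1):

unknown_uids = {101, 102, 103, 201, 202}
inbox_uids = {201, 202}
uids_to_download = sorted(unknown_uids - inbox_uids) + sorted(unknown_uids & inbox_uids)
# -> [101, 102, 103, 201, 202]
list(chunk(reversed(uids_to_download), 2))
# -> [[202, 201], [103, 102], [101]]: inbox messages first, newest
# (highest UID) first within each priority class.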
Example #4
 def fetch_headers(self, uids):
     """Fetch headers for the given uids. Chunked because certain providers
     fail with 'Command line too large' if you feed them too many uids at
     once."""
     headers = {}
     for uid_chunk in chunk(uids, 100):
         headers.update(self.conn.fetch(uid_chunk, ['BODY.PEEK[HEADER]']))
     return headers
Example #5
 def fetch_headers(self, uids):
     """Fetch headers for the given uids. Chunked because certain providers
     fail with 'Command line too large' if you feed them too many uids at
     once."""
     headers = {}
     for uid_chunk in chunk(uids, 100):
         headers.update(self.conn.fetch(uid_chunk, ["BODY.PEEK[HEADER]"]))
     return headers
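
A hypothetical call site, for illustration (`client` stands in for whatever object carries this method, and the numbers are invented):

headers = client.fetch_headers(list(range(1, 10001)))
# 10,000 UIDs -> 100 FETCH commands of 100 UIDs each, instead of one
# enormous command line that some providers reject.
# `headers` maps each UID to the fetch data returned by the server.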
Example #6
def update_metadata(crispin_client, db_session, log, folder_name, uids, syncmanager_lock, c):
    """ Update flags (the only metadata that can change). """
    # bigger chunk because the data being fetched here is very small
    for uids in chunk(uids, 5 * crispin_client.CHUNK_SIZE):
        new_flags = crispin_client.flags(uids, c)
        assert sorted(uids, key=int) == sorted(new_flags.keys(), key=int), "server uids != local uids"
        log.info("new flags: {0}".format(new_flags))
        with syncmanager_lock:
            account.update_metadata(crispin_client.account_id, db_session, folder_name, uids, new_flags)
            db_session.commit()
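
The assert above is brittle: if a message is expunged between the UID listing and the flags FETCH, the server returns fewer entries than requested and the assertion fires. The later revisions of this function shown below (Examples #8, #10, #11) drop missing UIDs instead; a minimal sketch of that guard:

new_flags = crispin_client.flags(uids, c)
# Messages can disappear in the meantime; keep only the UIDs the server
# still reported instead of asserting equality.
uids = [uid for uid in uids if uid in new_flags]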
Example #7
    def initial_sync_impl(self, crispin_client):
        # We wrap the block in a try/finally because the greenlets like
        # change_poller need to be killed when this greenlet is interrupted
        change_poller = None
        try:
            remote_uids = sorted(crispin_client.all_uids(), key=int)
            with self.syncmanager_lock:
                with session_scope(self.namespace_id) as db_session:
                    local_uids = common.local_uids(self.account_id, db_session,
                                                   self.folder_id)
                common.remove_deleted_uids(
                    self.account_id, self.folder_id,
                    set(local_uids) - set(remote_uids))
                unknown_uids = set(remote_uids) - local_uids
                with session_scope(self.namespace_id) as db_session:
                    self.update_uid_counts(
                        db_session, remote_uid_count=len(remote_uids),
                        download_uid_count=len(unknown_uids))

            change_poller = spawn(self.poll_for_changes)
            bind_context(change_poller, 'changepoller', self.account_id,
                         self.folder_id)

            if self.is_all_mail(crispin_client):
                # Prioritize UIDs for messages in the inbox folder.
                if len(remote_uids) < 1e6:
                    inbox_uids = set(
                        crispin_client.search_uids(['X-GM-LABELS', 'inbox']))
                else:
                    # The search above is really slow (times out) on really
                    # large mailboxes, so bound the search to messages within
                    # the past month in order to get anywhere.
                    since = datetime.utcnow() - timedelta(days=30)
                    inbox_uids = set(crispin_client.search_uids([
                        'X-GM-LABELS', 'inbox',
                        'SINCE', since]))

                uids_to_download = (sorted(unknown_uids - inbox_uids) +
                                    sorted(unknown_uids & inbox_uids))
            else:
                uids_to_download = sorted(unknown_uids)

            for uids in chunk(reversed(uids_to_download), 1024):
                g_metadata = crispin_client.g_metadata(uids)
                # UIDs might have been expunged since sync started, in which
                # case the g_metadata call above will return nothing.
                # They may also have been preemptively downloaded by thread
                # expansion. We can omit such UIDs.
                uids = [u for u in uids if u in g_metadata and u not in
                        self.saved_uids]
                self.batch_download_uids(crispin_client, uids, g_metadata)
        finally:
            if change_poller is not None:
                # schedule change_poller to die
                kill(change_poller)
Example #8
 def update_metadata(self, crispin_client, updated):
     """ Update flags (the only metadata that can change). """
     # bigger chunk because the data being fetched here is very small
     for uids in chunk(updated, 5 * crispin_client.CHUNK_SIZE):
         new_flags = crispin_client.flags(uids)
         # Messages can disappear in the meantime; we'll update them next
         # sync.
         uids = [uid for uid in uids if uid in new_flags]
         with self.syncmanager_lock:
             with mailsync_session_scope() as db_session:
                 common.update_metadata(self.account_id, db_session,
                                        self.folder_name, self.folder_id,
                                        uids, new_flags)
                 db_session.commit()
Example #10
def update_metadata(crispin_client, db_session, log, folder_name, uids,
                    syncmanager_lock):
    """ Update flags (the only metadata that can change). """
    # bigger chunk because the data being fetched here is very small
    for uids in chunk(uids, 5 * crispin_client.CHUNK_SIZE):
        new_flags = crispin_client.flags(uids)
        # messages can disappear in the meantime; we'll update them next sync
        uids = [uid for uid in uids if uid in new_flags]
        log.info("new flags ", new_flags=new_flags, folder_name=folder_name)
        with syncmanager_lock:
            log.debug("update_metadata acquired syncmanager_lock")
            account.update_metadata(crispin_client.account_id, db_session,
                                    folder_name, uids, new_flags)
            db_session.commit()
Example #11
def update_metadata(crispin_client, log, folder_name, uids, syncmanager_lock):
    """ Update flags (the only metadata that can change). """

    # bigger chunk because the data being fetched here is very small
    for uids in chunk(uids, 5 * crispin_client.CHUNK_SIZE):
        new_flags = crispin_client.flags(uids)
        # messages can disappear in the meantime; we'll update them next sync
        uids = [uid for uid in uids if uid in new_flags]
        log.info("new flags ", new_flags=new_flags, folder_name=folder_name)
        with syncmanager_lock:
            log.debug("update_metadata acquired syncmanager_lock")
            with session_scope(ignore_soft_deletes=False) as db_session:
                account.update_metadata(crispin_client.account_id, db_session,
                                        folder_name, uids, new_flags)
                db_session.commit()
Example #12
    def __download_thread(self, crispin_client, thread_g_metadata, g_thrid, thread_uids):
        """
        Download all messages in thread identified by `g_thrid`.

        Messages are downloaded oldest-first via All Mail, which allows us
        to get the entire thread regardless of which folders it's in. We do
        oldest-first so that if the thread started with a message sent from the
        Inbox API, we can reconcile this thread appropriately with the existing
        message/thread.
        """
        log.debug("downloading thread", g_thrid=g_thrid, message_count=len(thread_uids))
        to_download = self.__deduplicate_message_download(crispin_client, thread_g_metadata, thread_uids)
        log.debug(deduplicated_message_count=len(to_download))
        for uids in chunk(to_download, crispin_client.CHUNK_SIZE):
            self.download_and_commit_uids(crispin_client, uids)
        return len(to_download)
Example #13
    def _run_impl(self):
        self.log.info("Starting LabelRenameHandler",
                      label_name=self.label_name)

        self.semaphore.acquire(blocking=True)

        try:
            with connection_pool(self.account_id).get() as crispin_client:
                folder_names = []
                with session_scope(self.account_id) as db_session:
                    folders = db_session.query(Folder).filter(
                        Folder.account_id == self.account_id)

                    folder_names = [folder.name for folder in folders]
                    db_session.expunge_all()

                for folder_name in folder_names:
                    crispin_client.select_folder(folder_name, uidvalidity_cb)

                    found_uids = crispin_client.search_uids(
                        ["X-GM-LABELS",
                         utf7_encode(self.label_name)])

                    for chnk in chunk(found_uids, 200):
                        flags = crispin_client.flags(chnk)

                        self.log.info(
                            "Running metadata update for folder",
                            folder_name=folder_name,
                        )
                        with session_scope(self.account_id) as db_session:
                            fld = (db_session.query(Folder).options(
                                load_only("id")).filter(
                                    Folder.account_id == self.account_id,
                                    Folder.name == folder_name,
                                ).one())

                            common.update_metadata(
                                self.account_id,
                                fld.id,
                                fld.canonical_name,
                                flags,
                                db_session,
                            )
                            db_session.commit()
        finally:
            self.semaphore.release()
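
Label names must be encoded as IMAP modified UTF-7 before they can appear in a SEARCH command, which is why the label passes through `utf7_encode` above. For illustration (an assumption about `utf7_encode`'s behavior, based on the modified UTF-7 rules from RFC 3501):

utf7_encode(u'Entw\xfcrfe')   # German 'Drafts'
# -> 'Entw&APw-rfe': non-ASCII runs are wrapped in &...- and encoded
# as a base64 variant of UTF-16, while ASCII passes through unchanged.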
Example #14
    def __update_saved_g_metadata(self, crispin_client, remote_g_metadata,
                                  local_uids):
        """
        If HIGHESTMODSEQ has changed since we saved the X-GM-MSGID cache,
        we need to query for any changes since then and update the saved
        data.

        """
        log.info('Updating cache with latest changes')
        # Any uids we don't already have will be downloaded correctly as usual,
        # but updated uids need to be updated manually.
        # XXX it may actually be faster to just query for X-GM-MSGID for the
        # whole folder rather than getting changed UIDs first; MODSEQ queries
        # are slow on large folders.
        modified = crispin_client.new_and_updated_uids(
            crispin_client.selected_highestmodseq)
        log.info(modified_msg_count=len(modified))
        new, updated = new_or_updated(modified, local_uids)
        log.info(new_uid_count=len(new), updated_uid_count=len(updated))
        if new:
            remote_g_metadata.update(crispin_client.g_metadata(new))
            log.info('Updated cache with new messages')
        else:
            log.info('No new messages to update metadata for')
        # Filter out messages that have disappeared.
        old_len = len(remote_g_metadata)
        current_remote_uids = set(crispin_client.all_uids())
        remote_g_metadata = dict((uid, md)
                                 for uid, md in remote_g_metadata.iteritems()
                                 if uid in current_remote_uids)
        num_removed = old_len - len(remote_g_metadata)
        if num_removed > 0:
            log.info(removed_msg_count=num_removed)
        set_cache(
            remote_g_metadata_cache_file(self.account_id, self.folder_name),
            remote_g_metadata)
        if updated:
            # It's easy and fast to just update these here and now.
            # Bigger chunk because the data being fetched here is very small.
            for uids in chunk(updated, 5 * crispin_client.CHUNK_SIZE):
                self.update_metadata(crispin_client, uids)
            log.info('updated metadata for modified messages',
                     msg_count=len(updated))
            return len(updated)
        else:
            log.info('No modified messages to update metadata for')
            return 0
Example #15
    def __update_saved_g_metadata(self, crispin_client, remote_g_metadata,
                                  local_uids):
        """
        If HIGHESTMODSEQ has changed since we saved the X-GM-MSGID cache,
        we need to query for any changes since then and update the saved
        data.

        """
        log.info('Updating cache with latest changes')
        # Any uids we don't already have will be downloaded correctly as usual,
        # but updated uids need to be updated manually.
        # XXX it may actually be faster to just query for X-GM-MSGID for the
        # whole folder rather than getting changed UIDs first; MODSEQ queries
        # are slow on large folders.
        modified = crispin_client.new_and_updated_uids(
            crispin_client.selected_highestmodseq)
        log.info(modified_msg_count=len(modified))
        new, updated = new_or_updated(modified, local_uids)
        log.info(new_uid_count=len(new), updated_uid_count=len(updated))
        if new:
            remote_g_metadata.update(crispin_client.g_metadata(new))
            log.info('Updated cache with new messages')
        else:
            log.info('No new messages to update metadata for')
        # Filter out messages that have disappeared.
        old_len = len(remote_g_metadata)
        current_remote_uids = set(crispin_client.all_uids())
        remote_g_metadata = dict((uid, md) for uid, md in
                                 remote_g_metadata.iteritems() if uid in
                                 current_remote_uids)
        num_removed = old_len - len(remote_g_metadata)
        if num_removed > 0:
            log.info(removed_msg_count=num_removed)
        set_cache(remote_g_metadata_cache_file(self.account_id,
                                               self.folder_name),
                  remote_g_metadata)
        if updated:
            # It's easy and fast to just update these here and now.
            # Bigger chunk because the data being fetched here is very small.
            for uids in chunk(updated, 5 * crispin_client.CHUNK_SIZE):
                self.update_metadata(crispin_client, uids)
            log.info('updated metadata for modified messages',
                     msg_count=len(updated))
            return len(updated)
        else:
            log.info('No modified messages to update metadata for')
            return 0
Example #16
def download_thread(crispin_client, db_session, log, syncmanager_lock,
                    thread_g_metadata, g_thrid, thread_uids):
    """ Download all messages in thread identified by `g_thrid`.

    Messages are downloaded most-recent-first via All Mail, which allows us to
    get the entire thread regardless of which folders it's in.
    """
    log.debug("Downloading thread {} with {} messages."
              .format(g_thrid, len(thread_uids)))
    to_download = deduplicate_message_download(crispin_client, db_session, log,
                                               syncmanager_lock,
                                               thread_g_metadata, thread_uids)
    log.debug("{} deduplicated messages to download.".format(len(to_download)))
    for uids in chunk(reversed(to_download), crispin_client.CHUNK_SIZE):
        gmail_download_and_commit_uids(crispin_client, db_session, log,
                                       crispin_client.selected_folder_name,
                                       uids, create_gmail_message,
                                       syncmanager_lock)
Example #17
    def __download_thread(self, crispin_client, thread_g_metadata, g_thrid,
                          thread_uids):
        """
        Download all messages in thread identified by `g_thrid`.

        Messages are downloaded most-recent-first via All Mail, which allows us
        to get the entire thread regardless of which folders it's in.
        """
        log.debug('downloading thread',
                  g_thrid=g_thrid, message_count=len(thread_uids))
        to_download = self.__deduplicate_message_download(
            crispin_client, thread_g_metadata, thread_uids)
        log.debug(deduplicated_message_count=len(to_download))
        for uids in chunk(reversed(to_download), crispin_client.CHUNK_SIZE):
            self.download_and_commit_uids(crispin_client,
                                          crispin_client.selected_folder_name,
                                          uids)
        return len(to_download)
Example #19
    def __download_thread(self, crispin_client, thread_g_metadata, g_thrid,
                          thread_uids):
        """
        Download all messages in thread identified by `g_thrid`.

        Messages are downloaded most-recent-first via All Mail, which allows us
        to get the entire thread regardless of which folders it's in.
        """
        log.debug('downloading thread',
                  g_thrid=g_thrid,
                  message_count=len(thread_uids))
        to_download = self.__deduplicate_message_download(
            crispin_client, thread_g_metadata, thread_uids)
        log.debug(deduplicated_message_count=len(to_download))
        for uids in chunk(reversed(to_download), crispin_client.CHUNK_SIZE):
            self.download_and_commit_uids(crispin_client,
                                          crispin_client.selected_folder_name,
                                          uids)
        return len(to_download)
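
Note the ordering contrast with Examples #12 and #21: there the thread is walked oldest-first with `chunk(to_download, ...)`, while this revision walks it most-recent-first with `chunk(reversed(to_download), ...)`, matching each version's docstring. With the `chunk` sketch from Example #1:

to_download = [10, 20, 30, 40]         # thread UIDs, oldest first
list(chunk(to_download, 2))            # -> [[10, 20], [30, 40]] (oldest-first)
list(chunk(reversed(to_download), 2))  # -> [[40, 30], [20, 10]] (newest-first)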
Example #20
def chunked_thread_download(crispin_client, db_session, log, folder_name,
        g_metadata, uids, status_cb, syncmanager_lock, c):
    """ UIDs and g_metadata passed in are for the _folder that threads are
        being expanded in_.

        Messages are downloaded by thread, most-recent-thread-first,
        newest-to-oldest in thread. (Threads are expanded to all messages in
        the email archive that belong to the threads corresponding to the
        given uids.)

        NOTE: this method will leave All Mail selected, since selecting
        folders is expensive and we don't want to assume what the caller
        needs to do next.
    """
    # X-GM-THRID is roughly ascending over time, so sort most-recent first
    all_g_thrids = sorted(set([msg['thrid'] for uid, msg in \
            g_metadata.iteritems() if uid in uids]), reverse=True)
    folder_g_msgids = set([msg['msgid'] for uid, msg in \
            g_metadata.items() if uid in uids])
    log.info("{0} threads found".format(len(all_g_thrids)))

    flags = crispin_client.flags(uids, c)

    crispin_client.select_folder(
            crispin_client.folder_names(c)['All'],
            uidvalidity_callback(db_session,
                crispin_client.account_id), c)

    log.info("Expanding threads and downloading messages.")

    # We can't determine how many threads we have fully downloaded locally
    # before expansion, so we start from 0 every time and skip
    # already-downloaded messages along the way.
    num_downloaded_threads = 0
    num_total_threads = len(all_g_thrids)
    acc = db_session.query(ImapAccount).join(Namespace).filter_by(
            id=crispin_client.account_id).one()
    for g_thrids in chunk(all_g_thrids, 100):
        num_downloaded_threads = download_threads(crispin_client, db_session,
                log, acc, folder_name, g_thrids, flags, folder_g_msgids,
                num_downloaded_threads, num_total_threads, status_cb,
                syncmanager_lock, c)
Example #21
    def __download_thread(self, crispin_client, thread_g_metadata, g_thrid,
                          thread_uids):
        """
        Download all messages in thread identified by `g_thrid`.

        Messages are downloaded oldest-first via All Mail, which allows us
        to get the entire thread regardless of which folders it's in. We do
        oldest-first so that if the thread started with a message sent from the
        Inbox API, we can reconcile this thread appropriately with the existing
        message/thread.
        """
        log.debug('downloading thread',
                  g_thrid=g_thrid,
                  message_count=len(thread_uids))
        to_download = self.__deduplicate_message_download(
            crispin_client, thread_g_metadata, thread_uids)
        log.debug(deduplicated_message_count=len(to_download))
        for uids in chunk(to_download, crispin_client.CHUNK_SIZE):
            self.download_and_commit_uids(crispin_client, uids)
        return len(to_download)
Example #22
    def get_accounts_folders(self, account_ids):
        # This is where things get interesting --- we need to make queries
        # to multiple shards and return the results to a single caller.
        # Preferred method of querying for multiple accounts. Uses pipelining
        # to reduce the number of requests to redis.
        account_ids_grouped_by_shards = []

        # A magic one-liner to group account ids by shard.
        # http://stackoverflow.com/questions/8793772/how-to-split-a-sequence-according-to-a-predicate
        shard_num = heartbeat_config.account_redis_shard_number
        account_ids_grouped_by_shards = [
            list(v[1])
            for v in itertools.groupby(
                sorted(account_ids, key=shard_num), key=shard_num
            )
        ]

        results = dict()
        for account_group in account_ids_grouped_by_shards:
            if not account_group:
                continue

            client = heartbeat_config.get_redis_client(account_group[0])

            # Because of the way pipelining works, redis buffers data.
            # We break our requests into chunks so we don't ask for
            # impossibly large batches at once.
            for chnk in chunk(account_group, 10000):
                pipe = client.pipeline()
                for index in chnk:
                    pipe.zrange(index, 0, -1, withscores=True)

                pipe_results = pipe.execute()

                for i, account_id in enumerate(chnk):
                    account_id = int(account_id)
                    results[account_id] = pipe_results[i]

        return results
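
The groupby one-liner only works because the account ids are sorted by shard first; itertools.groupby only merges adjacent items with equal keys. A toy illustration with a hypothetical two-shard layout where `shard_num` is just `account_id % 2`:

import itertools

shard_num = lambda account_id: account_id % 2
account_ids = [3, 1, 4, 2, 5]
groups = [list(v[1]) for v in itertools.groupby(
    sorted(account_ids, key=shard_num), key=shard_num)]
# -> [[4, 2], [3, 1, 5]]: one group per shard, ready for one pipeline each.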
Example #23
    def _run_impl(self):
        self.log.info('Starting LabelRenameHandler',
                      label_name=self.label_name)

        self.semaphore.acquire(blocking=True)

        try:
            with connection_pool(self.account_id).get() as crispin_client:
                folder_names = []
                with session_scope(self.account_id) as db_session:
                    folders = db_session.query(Folder).filter(
                        Folder.account_id == self.account_id)

                    folder_names = [folder.name for folder in folders]
                    db_session.expunge_all()

                for folder_name in folder_names:
                    crispin_client.select_folder(folder_name, uidvalidity_cb)

                    found_uids = crispin_client.search_uids(['X-GM-LABELS',
                                                             utf7_encode(self.label_name)])

                    for chnk in chunk(found_uids, 200):
                        flags = crispin_client.flags(chnk)

                        self.log.info('Running metadata update for folder',
                                      folder_name=folder_name)
                        with session_scope(self.account_id) as db_session:
                            fld = db_session.query(Folder).options(load_only("id"))\
                                .filter(Folder.account_id == self.account_id,
                                        Folder.name == folder_name).one()

                            common.update_metadata(self.account_id, fld.id,
                                                   fld.canonical_name, flags,
                                                   db_session)
                            db_session.commit()
        finally:
            self.semaphore.release()
Example #24
    def get_accounts_folders(self, account_ids):
        # This is where things get interesting --- we need to make queries
        # to multiple shards and return the results to a single caller.
        # Preferred method of querying for multiple accounts. Uses pipelining
        # to reduce the number of requests to redis.
        account_ids_grouped_by_shards = []

        # A magic one-liner to group account ids by shard.
        # http://stackoverflow.com/questions/8793772/how-to-split-a-sequence-according-to-a-predicate
        shard_num = heartbeat_config.account_redis_shard_number
        account_ids_grouped_by_shards = [list(v[1]) for v in
                                         itertools.groupby(
                                            sorted(account_ids, key=shard_num),
                                            key=shard_num)]

        results = dict()
        for account_group in account_ids_grouped_by_shards:
            if not account_group:
                continue

            client = heartbeat_config.get_redis_client(account_group[0])

            # Because of the way pipelining works, redis buffers data.
            # We break our requests into chunks so we don't ask for
            # impossibly large batches at once.
            for chnk in chunk(account_group, 10000):
                pipe = client.pipeline()
                for index in chnk:
                    pipe.zrange(index, 0, -1, withscores=True)

                pipe_results = pipe.execute()

                for i, account_id in enumerate(chnk):
                    account_id = int(account_id)
                    results[account_id] = pipe_results[i]

        return results
Example #25
    def initial_sync_impl(self, crispin_client):
        assert crispin_client.selected_folder_name == self.folder_name
        remote_uids = crispin_client.all_uids()
        uids = sorted(remote_uids, reverse=True)

        starting_uid = None
        with session_scope(self.namespace_id) as db_session:
            account = db_session.query(Account).get(self.account_id)
            s3_resync_status = account._sync_status.get(
                's3_resync_status', {})

            folder_id = str(self.folder_id)
            if folder_id in s3_resync_status:
                folder_status = s3_resync_status[folder_id]
                resync_status = folder_status.get('status')

                # We've synced everything we had to sync.
                if resync_status == 'done':
                    raise MailsyncDone()

                starting_uid = s3_resync_status[folder_id].get(
                    'last_synced_uid')

        if starting_uid is not None:
            # We're not starting from zero
            try:
                i = uids.index(starting_uid)
                uids = uids[i:]
            except ValueError:
                pass

        # We need the provider and account id to ship per-account
        # data to statsd.
        with session_scope(self.namespace_id) as db_session:
            account = db_session.query(Account).get(self.account_id)
            statsd_prefix = '.'.join(['s3_resync', account.provider, str(account.id), str(self.folder_id)])

        statsd_client.gauge(statsd_prefix + '.messages_total', len(remote_uids))

        remaining_messages = len(uids)
        statsd_client.gauge(statsd_prefix + '.remaining_messages', remaining_messages)

        if len(uids) == 0:
            log.info('Done syncing to S3', account_id=self.account_id)
            self._update_uid_resync_status(status='done')
            raise MailsyncDone()

        for chnk in chunk(uids, BATCH_SIZE):
            to_download = [uid for uid in chnk if _message_missing_s3_object(
                            self.account_id, self.folder_id, uid)]
            self.download_and_commit_uids(crispin_client, to_download)

            # FIXME: publish some heartbeats.

            log.info('Resynced another batch of uids. Updating position.',
                     batch_size=BATCH_SIZE, position=chnk[-1])
            self._update_uid_resync_status(uid=chnk[-1])

            remaining_messages -= BATCH_SIZE
            statsd_client.gauge(statsd_prefix + '.remaining_messages',
                                remaining_messages)

            sleep(S3_RESYNC_FREQUENCY)

        self._update_uid_resync_status(status='done')
        raise MailsyncDone()
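
The resume logic above depends on `uids` being sorted in descending order: slicing from the index of the last synced UID keeps that UID and everything below it, so the sync continues downward. A toy illustration (hypothetical values):

uids = [50, 40, 30, 20, 10]   # remote UIDs, newest first
starting_uid = 30             # last_synced_uid recorded by a previous run
i = uids.index(starting_uid)
uids = uids[i:]               # -> [30, 20, 10]: resume downward from 30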
Example #26
    def initial_sync_impl(self, crispin_client):
        assert crispin_client.selected_folder_name == self.folder_name
        remote_uids = crispin_client.all_uids()
        uids = sorted(remote_uids, reverse=True)

        starting_uid = None
        with session_scope(self.namespace_id) as db_session:
            account = db_session.query(Account).get(self.account_id)
            s3_resync_status = account._sync_status.get(
                's3_resync_status', {})

            folder_id = str(self.folder_id)
            if folder_id in s3_resync_status:
                folder_status = s3_resync_status[folder_id]
                resync_status = folder_status.get('status')

                # We've synced everything we had to sync.
                if resync_status == 'done':
                    raise MailsyncDone()

                starting_uid = s3_resync_status[folder_id].get(
                    'last_synced_uid')

        if starting_uid is not None:
            # We're not starting from zero
            try:
                i = uids.index(starting_uid)
                uids = uids[i:]
            except ValueError:
                pass

        # We need the provider and account id to ship per-account
        # data to statsd.
        with session_scope(self.namespace_id) as db_session:
            account = db_session.query(Account).get(self.account_id)
            statsd_prefix = '.'.join(['s3_resync', account.provider,
                                      str(account.id), str(self.folder_id)])

        statsd_client.gauge(statsd_prefix + '.messages_total', len(remote_uids))

        remaining_messages = len(uids)
        statsd_client.gauge(statsd_prefix + '.remaining_messages',
                            remaining_messages)

        if len(uids) == 0:
            log.info('Done syncing to S3', account_id=self.account_id)
            self._update_uid_resync_status(status='done')
            raise MailsyncDone()

        for chnk in chunk(uids, BATCH_SIZE):
            to_download = [uid for uid in chnk if _message_missing_s3_object(
                            self.account_id, self.folder_id, uid)]
            self.download_and_commit_uids(crispin_client, to_download)

            # FIXME: publish some heartbeats.

            log.info('Resynced another batch of uids. Updating position.',
                     batch_size=BATCH_SIZE, position=chnk[-1])
            self._update_uid_resync_status(uid=chnk[-1])

            remaining_messages -= BATCH_SIZE
            statsd_client.gauge(statsd_prefix + '.remaining_messages',
                                remaining_messages)

            sleep(S3_RESYNC_FREQUENCY)

        self._update_uid_resync_status(status='done')
        raise MailsyncDone()
Example #27
    def condstore_refresh_flags(self, crispin_client):
        new_highestmodseq = crispin_client.conn.folder_status(
            self.folder_name, ['HIGHESTMODSEQ'])['HIGHESTMODSEQ']
        # Ensure that we have an initial highestmodseq value stored before we
        # begin polling for changes.
        if self.highestmodseq is None:
            self.highestmodseq = new_highestmodseq

        if new_highestmodseq == self.highestmodseq:
            # Don't need to do anything if the highestmodseq hasn't
            # changed.
            return
        elif new_highestmodseq < self.highestmodseq:
            # This should really never happen, but if it does, handle it.
            log.warning('got server highestmodseq less than saved '
                        'highestmodseq',
                        new_highestmodseq=new_highestmodseq,
                        saved_highestmodseq=self.highestmodseq)
            return

        log.info('HIGHESTMODSEQ has changed, getting changed UIDs',
                 new_highestmodseq=new_highestmodseq,
                 saved_highestmodseq=self.highestmodseq)
        crispin_client.select_folder(self.folder_name, self.uidvalidity_cb)
        changed_flags = crispin_client.condstore_changed_flags(
            self.highestmodseq)
        remote_uids = crispin_client.all_uids()

        # In order to be able to sync changes to tens of thousands of flags at
        # once, we commit updates in batches. We do this in ascending order by
        # modseq and periodically "checkpoint" our saved highestmodseq. (It's
        # safe to checkpoint *because* we go in ascending order by modseq.)
        # That way if the process gets restarted halfway through this refresh,
        # we don't have to completely start over. It's also slow to load many
        # objects into the SQLAlchemy session and then issue lots of commits;
        # we avoid that by batching.
        flag_batches = chunk(
            sorted(changed_flags.items(), key=lambda (k, v): v.modseq),
            CONDSTORE_FLAGS_REFRESH_BATCH_SIZE)
        for flag_batch in flag_batches:
            with session_scope(self.namespace_id) as db_session:
                common.update_metadata(self.account_id, self.folder_id,
                                       self.folder_role, dict(flag_batch),
                                       db_session)
            if len(flag_batch) == CONDSTORE_FLAGS_REFRESH_BATCH_SIZE:
                interim_highestmodseq = max(v.modseq for k, v in flag_batch)
                self.highestmodseq = interim_highestmodseq

        with session_scope(self.namespace_id) as db_session:
            local_uids = common.local_uids(self.account_id, db_session,
                                           self.folder_id)
            expunged_uids = set(local_uids).difference(remote_uids)

        if expunged_uids:
            # If new UIDs have appeared since we last checked in
            # get_new_uids, save them first. We want to always have the
            # latest UIDs before expunging anything, in order to properly
            # capture draft revisions.
            with session_scope(self.namespace_id) as db_session:
                lastseenuid = common.lastseenuid(self.account_id, db_session,
                                                 self.folder_id)
            if remote_uids and lastseenuid < max(remote_uids):
                log.info('Downloading new UIDs before expunging')
                self.get_new_uids(crispin_client)
            common.remove_deleted_uids(self.account_id, self.folder_id,
                                       expunged_uids)
        self.highestmodseq = new_highestmodseq
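
Two things are worth flagging here. First, `lambda (k, v): v.modseq` uses Python 2 tuple-parameter unpacking, which was removed in Python 3 (PEP 3113); Example #28 below is the same code with the key function spelled without it. Second, the checkpointing scheme can be illustrated with made-up modseqs (reusing the `chunk` sketch from Example #1):

# Hypothetical uid -> modseq values, batch size 2:
changed = {101: 7, 102: 5, 103: 9}
batches = list(chunk(sorted(changed.items(), key=lambda kv: kv[1]), 2))
# -> [[(102, 5), (101, 7)], [(103, 9)]]
# After the first (full) batch commits, highestmodseq can safely be
# checkpointed at 7: every change with modseq <= 7 is already persisted.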
Example #28
    def condstore_refresh_flags(self, crispin_client):
        new_highestmodseq = crispin_client.conn.folder_status(
            self.folder_name, ["HIGHESTMODSEQ"])[b"HIGHESTMODSEQ"]  # type: int
        # Ensure that we have an initial highestmodseq value stored before we
        # begin polling for changes.
        if self.highestmodseq is None:
            self.highestmodseq = new_highestmodseq

        if new_highestmodseq == self.highestmodseq:
            # Don't need to do anything if the highestmodseq hasn't
            # changed.
            return
        elif new_highestmodseq < self.highestmodseq:
            # This should really never happen, but if it does, handle it.
            log.warning(
                "got server highestmodseq less than saved "
                "highestmodseq",
                new_highestmodseq=new_highestmodseq,
                saved_highestmodseq=self.highestmodseq,
            )
            return

        log.debug(
            "HIGHESTMODSEQ has changed, getting changed UIDs",
            new_highestmodseq=new_highestmodseq,
            saved_highestmodseq=self.highestmodseq,
        )
        crispin_client.select_folder(self.folder_name, self.uidvalidity_cb)
        changed_flags = crispin_client.condstore_changed_flags(
            self.highestmodseq)
        remote_uids = crispin_client.all_uids()

        # In order to be able to sync changes to tens of thousands of flags at
        # once, we commit updates in batches. We do this in ascending order by
        # modseq and periodically "checkpoint" our saved highestmodseq. (It's
        # safe to checkpoint *because* we go in ascending order by modseq.)
        # That way if the process gets restarted halfway through this refresh,
        # we don't have to completely start over. It's also slow to load many
        # objects into the SQLAlchemy session and then issue lots of commits;
        # we avoid that by batching.
        flag_batches = chunk(
            sorted(changed_flags.items(),
                   key=lambda key_and_value: key_and_value[1].modseq),
            CONDSTORE_FLAGS_REFRESH_BATCH_SIZE,
        )
        for flag_batch in flag_batches:
            with session_scope(self.namespace_id) as db_session:
                common.update_metadata(
                    self.account_id,
                    self.folder_id,
                    self.folder_role,
                    dict(flag_batch),
                    db_session,
                )
            if len(flag_batch) == CONDSTORE_FLAGS_REFRESH_BATCH_SIZE:
                interim_highestmodseq = max(v.modseq for k, v in flag_batch)
                self.highestmodseq = interim_highestmodseq

        with session_scope(self.namespace_id) as db_session:
            local_uids = common.local_uids(self.account_id, db_session,
                                           self.folder_id)
            expunged_uids = set(local_uids).difference(remote_uids)

        if expunged_uids:
            # If new UIDs have appeared since we last checked in
            # get_new_uids, save them first. We want to always have the
            # latest UIDs before expunging anything, in order to properly
            # capture draft revisions.
            with session_scope(self.namespace_id) as db_session:
                lastseenuid = common.lastseenuid(self.account_id, db_session,
                                                 self.folder_id)
            if remote_uids and lastseenuid < max(remote_uids):
                log.info("Downloading new UIDs before expunging")
                self.get_new_uids(crispin_client)
            with self.syncmanager_lock:
                common.remove_deleted_uids(self.account_id, self.folder_id,
                                           expunged_uids)
        self.highestmodseq = new_highestmodseq