Example #1
    def __init__(self, account_id, folder_name, folder_id, email_address,
                 provider_name, poll_frequency, syncmanager_lock,
                 refresh_flags_max, retry_fail_classes):
        bind_context(self, 'foldersyncengine', account_id, folder_id)
        self.account_id = account_id
        self.folder_name = folder_name
        self.folder_id = folder_id
        self.poll_frequency = poll_frequency
        self.syncmanager_lock = syncmanager_lock
        self.refresh_flags_max = refresh_flags_max
        self.retry_fail_classes = retry_fail_classes
        self.state = None
        self.provider_name = provider_name

        with mailsync_session_scope() as db_session:
            account = db_session.query(Account).get(self.account_id)
            self.throttled = account.throttled
            self.namespace_id = account.namespace.id
            assert self.namespace_id is not None, "namespace_id is None"

        self.state_handlers = {
            'initial': self.initial_sync,
            'initial uidinvalid': self.resync_uids,
            'poll': self.poll,
            'poll uidinvalid': self.resync_uids,
            'finish': lambda: 'finish',
        }

        Greenlet.__init__(self)

        self.heartbeat_status = HeartbeatStatusProxy(self.account_id,
                                                     self.folder_id,
                                                     self.folder_name,
                                                     email_address,
                                                     self.provider_name)
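
A note on structure: every constructor on this page builds a state_handlers dict mapping state names to handler methods, which a run loop drives by calling the current handler and storing the state it returns. A minimal, self-contained sketch of that pattern, assuming only gevent (the class name and states are illustrative):

from gevent import Greenlet


class MiniSyncEngine(Greenlet):
    def __init__(self):
        Greenlet.__init__(self)
        self.state = 'initial'
        # Each handler returns the name of the next state.
        self.state_handlers = {
            'initial': self.initial_sync,
            'poll': self.poll,
            'finish': lambda: 'finish',
        }

    def initial_sync(self):
        return 'poll'

    def poll(self):
        return 'finish'

    def _run(self):
        while self.state != 'finish':
            self.state = self.state_handlers[self.state]()


engine = MiniSyncEngine()
engine.start()
engine.join()  # returns once the engine reaches the 'finish' state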
Example #2
    def __init__(self, account_id, namespace_id, folder_name,
                 email_address, provider_name, syncmanager_lock):

        with session_scope(namespace_id) as db_session:
            try:
                folder = db_session.query(Folder). \
                    filter(Folder.name == folder_name,
                           Folder.account_id == account_id).one()
            except NoResultFound:
                raise MailsyncError(u"Missing Folder '{}' on account {}"
                                    .format(folder_name, account_id))

            self.folder_id = folder.id
            self.folder_role = folder.canonical_name
            # Metric flags for sync performance
            self.is_initial_sync = folder.initial_sync_end is None
            self.is_first_sync = folder.initial_sync_start is None
            self.is_first_message = self.is_first_sync

        bind_context(self, 'foldersyncengine', account_id, self.folder_id)
        self.account_id = account_id
        self.namespace_id = namespace_id
        self.folder_name = folder_name
        if self.folder_name.lower() == 'inbox':
            self.poll_frequency = INBOX_POLL_FREQUENCY
        else:
            self.poll_frequency = DEFAULT_POLL_FREQUENCY
        self.syncmanager_lock = syncmanager_lock
        self.state = None
        self.provider_name = provider_name
        self.last_fast_refresh = None
        self.flags_fetch_results = {}
        self.conn_pool = connection_pool(self.account_id)

        self.state_handlers = {
            'initial': self.initial_sync,
            'initial uidinvalid': self.resync_uids,
            'poll': self.poll,
            'poll uidinvalid': self.resync_uids,
        }

        Greenlet.__init__(self)

        self.heartbeat_status = HeartbeatStatusProxy(self.account_id,
                                                     self.folder_id,
                                                     self.folder_name,
                                                     email_address,
                                                     self.provider_name)

        # Some generic IMAP servers keep throwing UIDVALIDITY
        # errors forever. Instead of resyncing those servers
        # indefinitely, we track the number of consecutive
        # times we got such an error and bail out once it exceeds
        # MAX_UIDINVALID_RESYNCS.
        self.uidinvalid_count = 0
Example #3
    def __init__(self, account_id, namespace_id, folder_name,
                 email_address, provider_name, syncmanager_lock):

        with session_scope(namespace_id) as db_session:
            try:
                folder = db_session.query(Folder). \
                    filter(Folder.name == folder_name,
                           Folder.account_id == account_id).one()
            except NoResultFound:
                raise MailsyncError(u"Missing Folder '{}' on account {}"
                                    .format(folder_name, account_id))

            self.folder_id = folder.id
            self.folder_role = folder.canonical_name
            # Metric flags for sync performance
            self.is_initial_sync = folder.initial_sync_end is None
            self.is_first_sync = folder.initial_sync_start is None
            self.is_first_message = self.is_first_sync

        bind_context(self, 'foldersyncengine', account_id, self.folder_id)
        self.account_id = account_id
        self.namespace_id = namespace_id
        self.folder_name = folder_name
        if self.folder_name.lower() == 'inbox':
            self.poll_frequency = INBOX_POLL_FREQUENCY
        else:
            self.poll_frequency = DEFAULT_POLL_FREQUENCY
        self.syncmanager_lock = syncmanager_lock
        self.state = None
        self.provider_name = provider_name
        self.last_fast_refresh = None
        self.conn_pool = connection_pool(self.account_id)

        self.state_handlers = {
            'initial': self.initial_sync,
            'initial uidinvalid': self.resync_uids,
            'poll': self.poll,
            'poll uidinvalid': self.resync_uids,
        }

        Greenlet.__init__(self)

        self.heartbeat_status = HeartbeatStatusProxy(self.account_id,
                                                     self.folder_id,
                                                     self.folder_name,
                                                     email_address,
                                                     self.provider_name)
Example #4
    def __init__(self, account_id, folder_name, folder_id, email_address,
                 provider_name, syncmanager_lock):
        bind_context(self, 'foldersyncengine', account_id, folder_id)
        self.account_id = account_id
        self.folder_name = folder_name
        self.folder_id = folder_id
        if self.folder_name.lower() == 'inbox':
            self.poll_frequency = INBOX_POLL_FREQUENCY
        else:
            self.poll_frequency = DEFAULT_POLL_FREQUENCY
        self.syncmanager_lock = syncmanager_lock
        self.state = None
        self.provider_name = provider_name
        self.last_fast_refresh = None
        self.conn_pool = connection_pool(self.account_id)

        # Metric flags for sync performance
        self.is_initial_sync = False
        self.is_first_sync = False
        self.is_first_message = False

        with session_scope() as db_session:
            account = Account.get(self.account_id, db_session)
            self.namespace_id = account.namespace.id
            assert self.namespace_id is not None, "namespace_id is None"

            folder = Folder.get(self.folder_id, db_session)
            if folder:
                self.is_initial_sync = folder.initial_sync_end is None
                self.is_first_sync = folder.initial_sync_start is None
                self.is_first_message = self.is_first_sync

        self.state_handlers = {
            'initial': self.initial_sync,
            'initial uidinvalid': self.resync_uids,
            'poll': self.poll,
            'poll uidinvalid': self.resync_uids,
        }

        Greenlet.__init__(self)

        self.heartbeat_status = HeartbeatStatusProxy(self.account_id,
                                                     self.folder_id,
                                                     self.folder_name,
                                                     email_address,
                                                     self.provider_name)
Example #5
    def __init__(self, account_id, namespace_id, email_address, folder_id,
                 folder_name, provider_name, poll_frequency=1,
                 scope=None):

        self.account_id = account_id
        self.namespace_id = namespace_id
        self.poll_frequency = poll_frequency
        self.scope = scope

        self.log = logger.new(account_id=account_id)

        self.shutdown = event.Event()
        self.heartbeat_status = HeartbeatStatusProxy(self.account_id,
                                                     folder_id,
                                                     folder_name,
                                                     email_address,
                                                     provider_name)
        Greenlet.__init__(self)
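
Example #5 also creates a shutdown event, but the snippet ends before showing how it is consumed. A common gevent idiom, sketched below under that assumption, is to poll the event in the run loop so another greenlet can stop the monitor cooperatively:

from gevent import Greenlet, sleep
from gevent.event import Event


class MiniMonitor(Greenlet):
    def __init__(self, poll_frequency=1):
        Greenlet.__init__(self)
        self.poll_frequency = poll_frequency
        self.shutdown = Event()

    def _run(self):
        # Loop until some other greenlet calls self.shutdown.set().
        while not self.shutdown.is_set():
            # ... one sync pass would go here ...
            sleep(self.poll_frequency)


monitor = MiniMonitor()
monitor.start()
sleep(2)
monitor.shutdown.set()  # request a cooperative stop
monitor.join()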
Example #6
class FolderSyncEngine(Greenlet):
    """Base class for a per-folder IMAP sync engine."""

    def __init__(self, account_id, namespace_id, folder_name, folder_id,
                 email_address, provider_name, syncmanager_lock):
        bind_context(self, 'foldersyncengine', account_id, folder_id)
        self.account_id = account_id
        self.namespace_id = namespace_id
        self.folder_name = folder_name
        self.folder_id = folder_id
        if self.folder_name.lower() == 'inbox':
            self.poll_frequency = INBOX_POLL_FREQUENCY
        else:
            self.poll_frequency = DEFAULT_POLL_FREQUENCY
        self.syncmanager_lock = syncmanager_lock
        self.state = None
        self.provider_name = provider_name
        self.last_fast_refresh = None
        self.conn_pool = connection_pool(self.account_id)

        # Metric flags for sync performance
        self.is_initial_sync = False
        self.is_first_sync = False
        self.is_first_message = False

        with session_scope(self.namespace_id) as db_session:
            folder = Folder.get(self.folder_id, db_session)
            if folder:
                self.is_initial_sync = folder.initial_sync_end is None
                self.is_first_sync = folder.initial_sync_start is None
                self.is_first_message = self.is_first_sync

        self.state_handlers = {
            'initial': self.initial_sync,
            'initial uidinvalid': self.resync_uids,
            'poll': self.poll,
            'poll uidinvalid': self.resync_uids,
        }

        Greenlet.__init__(self)

        self.heartbeat_status = HeartbeatStatusProxy(self.account_id,
                                                     self.folder_id,
                                                     self.folder_name,
                                                     email_address,
                                                     self.provider_name)

    def _run(self):
        # Bind greenlet-local logging context.
        self.log = log.new(account_id=self.account_id, folder=self.folder_name,
                           provider=self.provider_name)
        # eagerly signal the sync status
        self.heartbeat_status.publish()
        return retry_with_logging(self._run_impl, account_id=self.account_id,
                                  provider=self.provider_name, logger=log)

    def _run_impl(self):
        try:
            saved_folder_status = self._load_state()
        except IntegrityError:
            # The state insert failed because the folder ID ForeignKey
            # was no longer valid, i.e. the folder for this engine was deleted
            # while we were starting up.
            # Exit the sync and let the monitor sort things out.
            log.info("Folder state loading failed due to IntegrityError",
                     folder_id=self.folder_id, account_id=self.account_id)
            raise MailsyncDone()

        # NOTE: The parent ImapSyncMonitor handler could kill us at any
        # time if it receives a shutdown command. The shutdown command is
        # equivalent to ctrl-c.
        while True:
            old_state = self.state
            try:
                self.state = self.state_handlers[old_state]()
                self.heartbeat_status.publish(state=self.state)
            except UidInvalid:
                self.state = self.state + ' uidinvalid'
                self.heartbeat_status.publish(state=self.state)
            except FolderMissingError:
                # Folder was deleted by monitor while its sync was running.
                # TODO: Monitor should handle shutting down the folder engine.
                log.info('Folder disappeared. Stopping sync.',
                         account_id=self.account_id,
                         folder_name=self.folder_name,
                         folder_id=self.folder_id)
                raise MailsyncDone()
            except ValidationError as exc:
                log.error('Error authenticating; stopping sync', exc_info=True,
                          account_id=self.account_id, folder_id=self.folder_id,
                          logstash_tag='mark_invalid')
                with session_scope(self.namespace_id) as db_session:
                    account = db_session.query(Account).get(self.account_id)
                    account.mark_invalid()
                    account.update_sync_error(str(exc))
                raise MailsyncDone()

            # State handlers are idempotent, so it's okay if we're
            # killed between the end of the handler and the commit.
            if self.state != old_state:
                # Don't need to re-query, will auto refresh on re-associate.
                with session_scope(self.namespace_id) as db_session:
                    db_session.add(saved_folder_status)
                    saved_folder_status.state = self.state
                    db_session.commit()

    def _load_state(self):
        with session_scope(self.namespace_id) as db_session:
            try:
                state = ImapFolderSyncStatus.state
                saved_folder_status = db_session.query(ImapFolderSyncStatus)\
                    .filter_by(account_id=self.account_id,
                               folder_id=self.folder_id).options(
                        load_only(state)).one()
            except NoResultFound:
                saved_folder_status = ImapFolderSyncStatus(
                    account_id=self.account_id, folder_id=self.folder_id)
                db_session.add(saved_folder_status)

            saved_folder_status.start_sync()
            db_session.commit()
            self.state = saved_folder_status.state
            return saved_folder_status

    def set_stopped(self, db_session):
        saved_folder_status = db_session.query(ImapFolderSyncStatus)\
            .filter_by(account_id=self.account_id,
                       folder_id=self.folder_id).one()

        saved_folder_status.stop_sync()
        self.state = saved_folder_status.state

    def _report_initial_sync_start(self):
        with session_scope(self.namespace_id) as db_session:
            q = db_session.query(Folder).get(self.folder_id)
            q.initial_sync_start = datetime.utcnow()

    def _report_initial_sync_end(self):
        with session_scope(self.namespace_id) as db_session:
            q = db_session.query(Folder).get(self.folder_id)
            q.initial_sync_end = datetime.utcnow()

    @retry_crispin
    def initial_sync(self):
        log.bind(state='initial')
        log.info('starting initial sync')

        if self.is_first_sync:
            self._report_initial_sync_start()
            self.is_first_sync = False

        with self.conn_pool.get() as crispin_client:
            crispin_client.select_folder(self.folder_name, uidvalidity_cb)
            # Ensure we have an ImapFolderInfo row created prior to sync start.
            with session_scope(self.namespace_id) as db_session:
                try:
                    db_session.query(ImapFolderInfo). \
                        filter(ImapFolderInfo.account_id == self.account_id,
                               ImapFolderInfo.folder_id == self.folder_id). \
                        one()
                except NoResultFound:
                    imapfolderinfo = ImapFolderInfo(
                        account_id=self.account_id, folder_id=self.folder_id,
                        uidvalidity=crispin_client.selected_uidvalidity,
                        uidnext=crispin_client.selected_uidnext)
                    db_session.add(imapfolderinfo)
                db_session.commit()

            self.initial_sync_impl(crispin_client)

        if self.is_initial_sync:
            self._report_initial_sync_end()
            self.is_initial_sync = False

        return 'poll'

    @retry_crispin
    def poll(self):
        log.bind(state='poll')
        log.info('polling')
        self.poll_impl()
        return 'poll'

    @retry_crispin
    def resync_uids(self):
        log.bind(state=self.state)
        log.warning('UIDVALIDITY changed; initiating resync')
        self.resync_uids_impl()
        return 'initial'

    def initial_sync_impl(self, crispin_client):
        # We wrap the block in a try/finally because the change_poller greenlet
        # needs to be killed when this greenlet is interrupted
        change_poller = None
        try:
            assert crispin_client.selected_folder_name == self.folder_name
            remote_uids = crispin_client.all_uids()
            with self.syncmanager_lock:
                with session_scope(self.namespace_id) as db_session:
                    local_uids = common.local_uids(self.account_id, db_session,
                                                   self.folder_id)
                    common.remove_deleted_uids(
                        self.account_id, self.folder_id,
                        set(local_uids).difference(remote_uids),
                        db_session)

            new_uids = set(remote_uids).difference(local_uids)
            with session_scope(self.namespace_id) as db_session:
                account = db_session.query(Account).get(self.account_id)
                throttled = account.throttled
                self.update_uid_counts(
                    db_session,
                    remote_uid_count=len(remote_uids),
                    # This is the initial number of UIDs to download.
                    download_uid_count=len(new_uids))

            change_poller = spawn(self.poll_for_changes)
            bind_context(change_poller, 'changepoller', self.account_id,
                         self.folder_id)
            uids = sorted(new_uids, reverse=True)
            count = 0
            for uid in uids:
                # The speedup from batching appears to be less clear for
                # non-Gmail accounts, so for now just download one-at-a-time.
                self.download_and_commit_uids(crispin_client, [uid])
                self.heartbeat_status.publish()
                count += 1
                if throttled and count >= THROTTLE_COUNT:
                    # Throttled accounts' folders sync at a rate of
                    # 1 message/minute after approximately the first
                    # THROTTLE_COUNT messages per folder have synced.
                    # Note this is an approximate limit, since we count
                    # UIDs rather than messages.
                    sleep(THROTTLE_WAIT)
        finally:
            if change_poller is not None:
                # schedule change_poller to die
                kill(change_poller)

    def should_idle(self, crispin_client):
        if not hasattr(self, '_should_idle'):
            self._should_idle = (
                crispin_client.idle_supported() and self.folder_name in
                crispin_client.folder_names()['inbox']
            )
        return self._should_idle

    def poll_impl(self):
        with self.conn_pool.get() as crispin_client:
            self.check_uid_changes(crispin_client)
            if self.should_idle(crispin_client):
                crispin_client.select_folder(self.folder_name,
                                             self.uidvalidity_cb)
                idling = True
                try:
                    crispin_client.idle(IDLE_WAIT)
                except Exception as exc:
                    # With some servers we get e.g.
                    # 'Unexpected IDLE response: * FLAGS  (...)'
                    if isinstance(exc, imaplib.IMAP4.error) and \
                            exc.message.startswith('Unexpected IDLE response'):
                        log.info('Error initiating IDLE, not idling',
                                 error=exc)
                        try:
                            # Still have to take the connection out of IDLE
                            # mode to reuse it though.
                            crispin_client.conn.idle_done()
                        except AttributeError:
                            pass
                        idling = False
                    else:
                        raise
            else:
                idling = False
        # Close IMAP connection before sleeping
        if not idling:
            sleep(self.poll_frequency)

    def resync_uids_impl(self):
        # First, check whether the UIDVALIDITY change was spurious; if
        # it was, just discard it and move on.
        with self.conn_pool.get() as crispin_client:
            crispin_client.select_folder(self.folder_name, lambda *args: True)
            remote_uidvalidity = crispin_client.selected_uidvalidity
            remote_uidnext = crispin_client.selected_uidnext
            if remote_uidvalidity <= self.uidvalidity:
                log.debug('UIDVALIDITY unchanged')
                return
        # Otherwise, if the UIDVALIDITY really has changed, discard all saved
        # UIDs for the folder, mark associated messages for garbage-collection,
        # and return to the 'initial' state to resync.
        # This will cause messages and threads to be deleted and recreated,
        # but UIDVALIDITY changes are sufficiently rare that the tradeoff is
        # acceptable.
        with session_scope(self.namespace_id) as db_session:
            invalid_uids = {
                uid for uid, in db_session.query(ImapUid.msg_uid).
                filter_by(account_id=self.account_id,
                          folder_id=self.folder_id)
            }
            common.remove_deleted_uids(self.account_id, self.folder_id,
                                       invalid_uids, db_session)
        self.uidvalidity = remote_uidvalidity
        self.highestmodseq = None
        self.uidnext = remote_uidnext

    @retry_crispin
    def poll_for_changes(self):
        log.new(account_id=self.account_id, folder=self.folder_name)
        while True:
            log.info('polling for changes')
            self.poll_impl()

    def create_message(self, db_session, acct, folder, msg):
        assert acct is not None and acct.namespace is not None

        # Check if we somehow already saved the imapuid (shouldn't happen, but
        # possible due to a race condition). If so, don't commit changes.
        existing_imapuid = db_session.query(ImapUid).filter(
            ImapUid.account_id == acct.id, ImapUid.folder_id == folder.id,
            ImapUid.msg_uid == msg.uid).first()
        if existing_imapuid is not None:
            log.error('Expected to create imapuid, but existing row found',
                      remote_msg_uid=msg.uid,
                      existing_imapuid=existing_imapuid.id)
            return None

        new_uid = common.create_imap_message(db_session, acct, folder, msg)
        self.add_message_to_thread(db_session, new_uid.message, msg)

        db_session.flush()

        # We're calling import_attached_events here instead of in some more
        # obvious place (like Message.create_from_synced) because the function
        # requires new_uid.message to have been flushed; the flush is needed
        # because import_attached_events does db lookups.
        if new_uid.message.has_attached_events:
            with db_session.no_autoflush:
                import_attached_events(db_session, acct, new_uid.message)

        # If we're in the polling state, report the latency between when the
        # message was received remotely and when it was created locally.
        if self.state == 'poll':
            latency_millis = (
                datetime.utcnow() - new_uid.message.received_date) \
                .total_seconds() * 1000
            metrics = [
                '.'.join(['accounts', 'overall', 'message_latency']),
                '.'.join(['accounts', str(acct.id), 'message_latency']),
                '.'.join(['providers', self.provider_name, 'message_latency']),
            ]
            for metric in metrics:
                statsd_client.timing(metric, latency_millis)

        return new_uid

    def _count_thread_messages(self, thread_id, db_session):
        count, = db_session.query(func.count(Message.id)). \
            filter(Message.thread_id == thread_id).one()
        return count

    def add_message_to_thread(self, db_session, message_obj, raw_message):
        """Associate message_obj to the right Thread object, creating a new
        thread if necessary."""
        with db_session.no_autoflush:
            # Disable autoflush so we don't try to flush a message with null
            # thread_id.
            parent_thread = fetch_corresponding_thread(
                db_session, self.namespace_id, message_obj)
            construct_new_thread = True

            if parent_thread:
                # If there's a parent thread that isn't too long already,
                # add to it. Otherwise create a new thread.
                parent_message_count = self._count_thread_messages(
                    parent_thread.id, db_session)
                if parent_message_count < MAX_THREAD_LENGTH:
                    construct_new_thread = False

            if construct_new_thread:
                message_obj.thread = ImapThread.from_imap_message(
                    db_session, self.namespace_id, message_obj)
            else:
                parent_thread.messages.append(message_obj)

    def download_and_commit_uids(self, crispin_client, uids):
        start = datetime.utcnow()
        raw_messages = crispin_client.uids(uids)
        if not raw_messages:
            return 0

        new_uids = set()
        with self.syncmanager_lock:
            with session_scope(self.namespace_id) as db_session:
                account = Account.get(self.account_id, db_session)
                folder = Folder.get(self.folder_id, db_session)
                for msg in raw_messages:
                    uid = self.create_message(db_session, account,
                                              folder, msg)
                    if uid is not None:
                        db_session.add(uid)
                        db_session.flush()
                        new_uids.add(uid)
                db_session.commit()

        log.info('Committed new UIDs',
                 new_committed_message_count=len(new_uids))
        # If we downloaded uids, record message velocity (#uid / latency)
        if self.state == 'initial' and len(new_uids):
            self._report_message_velocity(datetime.utcnow() - start,
                                          len(new_uids))
        if self.is_first_message:
            self._report_first_message()
            self.is_first_message = False

        return len(new_uids)

    def _report_first_message(self):
        now = datetime.utcnow()

        with session_scope(self.namespace_id) as db_session:
            account = db_session.query(Account).get(self.account_id)
            account_created = account.created_at

        latency = (now - account_created).total_seconds() * 1000
        metrics = [
            '.'.join(['providers', self.provider_name, 'first_message']),
            '.'.join(['providers', 'overall', 'first_message'])
        ]

        for metric in metrics:
            statsd_client.timing(metric, latency)

    def _report_message_velocity(self, timedelta, num_uids):
        latency = (timedelta).total_seconds() * 1000
        latency_per_uid = float(latency) / num_uids
        metrics = [
            '.'.join(['providers', self.provider_name,
                      'message_velocity']),
            '.'.join(['providers', 'overall', 'message_velocity'])
        ]
        for metric in metrics:
            statsd_client.timing(metric, latency_per_uid)

    def update_uid_counts(self, db_session, **kwargs):
        saved_status = db_session.query(ImapFolderSyncStatus).join(Folder). \
            filter(ImapFolderSyncStatus.account_id == self.account_id,
                   Folder.name == self.folder_name).one()
        # If we're not updating the current_remote_count metric,
        # don't update uid_checked_timestamp either.
        if kwargs.get('remote_uid_count') is None:
            saved_status.update_metrics(kwargs)
        else:
            metrics = dict(uid_checked_timestamp=datetime.utcnow())
            metrics.update(kwargs)
            saved_status.update_metrics(metrics)

    def get_new_uids(self, crispin_client):
        try:
            remote_uidnext = crispin_client.conn.folder_status(
                self.folder_name, ['UIDNEXT']).get('UIDNEXT')
        except ValueError:
            # Work around issue where ValueError is raised on parsing STATUS
            # response.
            log.warning('Error getting UIDNEXT', exc_info=True)
            remote_uidnext = None
        except imaplib.IMAP4.error as e:
            if '[NONEXISTENT]' in e.message:
                raise FolderMissingError()
            else:
                raise e
        if remote_uidnext is not None and remote_uidnext == self.uidnext:
            return
        log.info('UIDNEXT changed, checking for new UIDs',
                 remote_uidnext=remote_uidnext, saved_uidnext=self.uidnext)

        crispin_client.select_folder(self.folder_name, self.uidvalidity_cb)
        with session_scope(self.namespace_id) as db_session:
            lastseenuid = common.lastseenuid(self.account_id, db_session,
                                             self.folder_id)
        latest_uids = crispin_client.conn.fetch('{}:*'.format(lastseenuid + 1),
                                                ['UID']).keys()
        new_uids = set(latest_uids) - {lastseenuid}
        if new_uids:
            for uid in sorted(new_uids):
                self.download_and_commit_uids(crispin_client, [uid])
        self.uidnext = remote_uidnext

    def condstore_refresh_flags(self, crispin_client):
        new_highestmodseq = crispin_client.conn.folder_status(
            self.folder_name, ['HIGHESTMODSEQ'])['HIGHESTMODSEQ']
        # Ensure that we have an initial highestmodseq value stored before we
        # begin polling for changes.
        if self.highestmodseq is None:
            self.highestmodseq = new_highestmodseq

        if new_highestmodseq == self.highestmodseq:
            # Don't need to do anything if the highestmodseq hasn't
            # changed.
            return
        elif new_highestmodseq < self.highestmodseq:
            # This should really never happen, but if it does, handle it.
            log.warning('got server highestmodseq less than saved '
                        'highestmodseq',
                        new_highestmodseq=new_highestmodseq,
                        saved_highestmodseq=self.highestmodseq)
            return

        # Highestmodseq has changed, update accordingly.
        crispin_client.select_folder(self.folder_name, self.uidvalidity_cb)
        changed_flags = crispin_client.condstore_changed_flags(
            self.highestmodseq)
        remote_uids = crispin_client.all_uids()
        with session_scope(self.namespace_id) as db_session:
            common.update_metadata(self.account_id, self.folder_id,
                                   changed_flags, db_session)
            local_uids = common.local_uids(self.account_id, db_session,
                                           self.folder_id)
            expunged_uids = set(local_uids).difference(remote_uids)

        if expunged_uids:
            # If new UIDs have appeared since we last checked in
            # get_new_uids, save them first. We want to always have the
            # latest UIDs before expunging anything, in order to properly
            # capture draft revisions.
            with session_scope(self.namespace_id) as db_session:
                lastseenuid = common.lastseenuid(self.account_id, db_session,
                                                 self.folder_id)
            if remote_uids and lastseenuid < max(remote_uids):
                log.info('Downloading new UIDs before expunging')
                self.get_new_uids(crispin_client)
            with session_scope(self.namespace_id) as db_session:
                common.remove_deleted_uids(self.account_id, self.folder_id,
                                           expunged_uids, db_session)
                db_session.commit()
        self.highestmodseq = new_highestmodseq

    def generic_refresh_flags(self, crispin_client):
        now = datetime.utcnow()
        slow_refresh_due = (
            self.last_slow_refresh is None or
            now > self.last_slow_refresh + SLOW_REFRESH_INTERVAL
        )
        fast_refresh_due = (
            self.last_fast_refresh is None or
            now > self.last_fast_refresh + FAST_REFRESH_INTERVAL
        )
        if slow_refresh_due:
            self.refresh_flags_impl(crispin_client, SLOW_FLAGS_REFRESH_LIMIT)
            self.last_slow_refresh = datetime.utcnow()
        elif fast_refresh_due:
            self.refresh_flags_impl(crispin_client, FAST_FLAGS_REFRESH_LIMIT)
            self.last_fast_refresh = datetime.utcnow()

    def refresh_flags_impl(self, crispin_client, max_uids):
        crispin_client.select_folder(self.folder_name, self.uidvalidity_cb)
        with session_scope(self.namespace_id) as db_session:
            local_uids = common.local_uids(account_id=self.account_id,
                                           session=db_session,
                                           folder_id=self.folder_id,
                                           limit=max_uids)

        flags = crispin_client.flags(local_uids)
        expunged_uids = set(local_uids).difference(flags.keys())
        with session_scope(self.namespace_id) as db_session:
            common.remove_deleted_uids(self.account_id, self.folder_id,
                                       expunged_uids, db_session)
            common.update_metadata(self.account_id, self.folder_id,
                                   flags, db_session)

    def check_uid_changes(self, crispin_client):
        self.get_new_uids(crispin_client)
        if crispin_client.condstore_supported():
            self.condstore_refresh_flags(crispin_client)
        else:
            self.generic_refresh_flags(crispin_client)

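    # The properties below lazily load their values from the persisted
    # ImapFolderInfo row on first access and write through to the database
    # on assignment (via _update_imap_folder_info).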
    @property
    def uidvalidity(self):
        if not hasattr(self, '_uidvalidity'):
            self._uidvalidity = self._load_imap_folder_info().uidvalidity
        return self._uidvalidity

    @uidvalidity.setter
    def uidvalidity(self, value):
        self._update_imap_folder_info('uidvalidity', value)
        self._uidvalidity = value

    @property
    def uidnext(self):
        if not hasattr(self, '_uidnext'):
            self._uidnext = self._load_imap_folder_info().uidnext
        return self._uidnext

    @uidnext.setter
    def uidnext(self, value):
        self._update_imap_folder_info('uidnext', value)
        self._uidnext = value

    @property
    def last_slow_refresh(self):
        # We persist the last_slow_refresh timestamp so that we don't end up
        # doing a (potentially expensive) full flags refresh for every account
        # on every process restart.
        if not hasattr(self, '_last_slow_refresh'):
            self._last_slow_refresh = self._load_imap_folder_info(). \
                last_slow_refresh
        return self._last_slow_refresh

    @last_slow_refresh.setter
    def last_slow_refresh(self, value):
        self._update_imap_folder_info('last_slow_refresh', value)
        self._last_slow_refresh = value

    @property
    def highestmodseq(self):
        if not hasattr(self, '_highestmodseq'):
            self._highestmodseq = self._load_imap_folder_info().highestmodseq
        return self._highestmodseq

    @highestmodseq.setter
    def highestmodseq(self, value):
        self._highestmodseq = value
        self._update_imap_folder_info('highestmodseq', value)

    def _load_imap_folder_info(self):
        with session_scope(self.namespace_id) as db_session:
            imapfolderinfo = db_session.query(ImapFolderInfo). \
                filter(ImapFolderInfo.account_id == self.account_id,
                       ImapFolderInfo.folder_id == self.folder_id). \
                one()
            db_session.expunge(imapfolderinfo)
            return imapfolderinfo

    def _update_imap_folder_info(self, attrname, value):
        with session_scope(self.namespace_id) as db_session:
            imapfolderinfo = db_session.query(ImapFolderInfo). \
                filter(ImapFolderInfo.account_id == self.account_id,
                       ImapFolderInfo.folder_id == self.folder_id). \
                one()
            setattr(imapfolderinfo, attrname, value)
            db_session.commit()

    def uidvalidity_cb(self, account_id, folder_name, select_info):
        assert folder_name == self.folder_name
        assert account_id == self.account_id
        selected_uidvalidity = select_info['UIDVALIDITY']
        is_valid = (self.uidvalidity is None or
                    selected_uidvalidity <= self.uidvalidity)
        if not is_valid:
            raise UidInvalid(
                'folder: {}, remote uidvalidity: {}, '
                'cached uidvalidity: {}'.format(folder_name.encode('utf-8'),
                                                selected_uidvalidity,
                                                self.uidvalidity))
        return select_info
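
For context, wiring the Example #6 engine up might look like the following. The IDs, address, provider name, and lock type are placeholder assumptions; in the real service the parent account-level sync monitor supplies them:

from gevent.lock import BoundedSemaphore

engine = FolderSyncEngine(account_id=1,
                          namespace_id=1,
                          folder_name='INBOX',
                          folder_id=7,
                          email_address='user@example.com',
                          provider_name='custom',
                          syncmanager_lock=BoundedSemaphore(1))
engine.start()  # spawns _run() in its own greenlet
engine.join()   # block until the sync loop exits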
Example #7
class FolderSyncEngine(Greenlet):
    """Base class for a per-folder IMAP sync engine."""

    def __init__(self, account_id, folder_name, folder_id, email_address,
                 provider_name, poll_frequency, syncmanager_lock,
                 refresh_flags_max, retry_fail_classes):
        bind_context(self, 'foldersyncengine', account_id, folder_id)
        self.account_id = account_id
        self.folder_name = folder_name
        self.folder_id = folder_id
        self.poll_frequency = poll_frequency
        self.syncmanager_lock = syncmanager_lock
        self.refresh_flags_max = refresh_flags_max
        self.retry_fail_classes = retry_fail_classes
        self.state = None
        self.provider_name = provider_name
        self.is_initial_sync = False
        self.is_first_sync = False

        with mailsync_session_scope() as db_session:
            account = db_session.query(Account).get(self.account_id)
            self.throttled = account.throttled
            self.namespace_id = account.namespace.id
            assert self.namespace_id is not None, "namespace_id is None"

            folder = db_session.query(Folder).get(self.folder_id)
            if folder:
                self.is_initial_sync = folder.initial_sync_end is None
                self.is_first_sync = folder.initial_sync_start is None

        self.state_handlers = {
            'initial': self.initial_sync,
            'initial uidinvalid': self.resync_uids,
            'poll': self.poll,
            'poll uidinvalid': self.resync_uids,
            'finish': lambda: 'finish',
        }

        Greenlet.__init__(self)

        self.heartbeat_status = HeartbeatStatusProxy(self.account_id,
                                                     self.folder_id,
                                                     self.folder_name,
                                                     email_address,
                                                     self.provider_name)

    def _run(self):
        # Bind greenlet-local logging context.
        self.log = log.new(account_id=self.account_id, folder=self.folder_name)
        # eagerly signal the sync status
        self.heartbeat_status.publish()
        return retry_and_report_killed(self._run_impl,
                                       account_id=self.account_id,
                                       folder_name=self.folder_name,
                                       logger=log,
                                       fail_classes=self.retry_fail_classes)

    def _run_impl(self):
        # We defer initializing the pool to here so that we'll retry if there
        # are any errors (remote server 503s or similar) when initializing it.
        self.conn_pool = connection_pool(self.account_id)
        try:
            saved_folder_status = self._load_state()
        except IntegrityError:
            # The state insert failed because the folder ID ForeignKey
            # was no longer valid, i.e. the folder for this engine was deleted
            # while we were starting up.
            # Exit the sync and let the monitor sort things out.
            log.info("Folder state loading failed due to IntegrityError",
                     folder_id=self.folder_id, account_id=self.account_id)
            raise MailsyncDone()

        # NOTE: The parent ImapSyncMonitor handler could kill us at any
        # time if it receives a shutdown command. The shutdown command is
        # equivalent to ctrl-c.
        while True:
            old_state = self.state
            try:
                self.state = self.state_handlers[old_state]()
                self.heartbeat_status.publish(state=self.state)
            except UidInvalid:
                self.state = self.state + ' uidinvalid'
                self.heartbeat_status.publish(state=self.state)
            except FolderMissingError:
                # Folder was deleted by monitor while its sync was running.
                # TODO: Monitor should handle shutting down the folder engine.
                log.info('Folder disappeared: {}. Stopping sync.'.format(
                    self.folder_name), account_id=self.account_id,
                    folder_id=self.folder_id)
                raise MailsyncDone()
            except ValidationError:
                log.error('Error authenticating; stopping sync', exc_info=True,
                          account_id=self.account_id, folder_id=self.folder_id)
                raise MailsyncDone()

            # State handlers are idempotent, so it's okay if we're
            # killed between the end of the handler and the commit.
            if self.state != old_state:
                # Don't need to re-query, will auto refresh on re-associate.
                with mailsync_session_scope() as db_session:
                    db_session.add(saved_folder_status)
                    saved_folder_status.state = self.state
                    db_session.commit()
            if self.state == 'finish':
                return

    def _load_state(self):
        with mailsync_session_scope() as db_session:
            try:
                state = ImapFolderSyncStatus.state
                saved_folder_status = db_session.query(ImapFolderSyncStatus)\
                    .filter_by(account_id=self.account_id,
                               folder_id=self.folder_id).options(
                        load_only(state)).one()
            except NoResultFound:
                saved_folder_status = ImapFolderSyncStatus(
                    account_id=self.account_id, folder_id=self.folder_id)
                db_session.add(saved_folder_status)

            saved_folder_status.start_sync()
            db_session.commit()
            self.state = saved_folder_status.state
            return saved_folder_status

    def set_stopped(self, db_session):
        saved_folder_status = db_session.query(ImapFolderSyncStatus)\
            .filter_by(account_id=self.account_id,
                       folder_id=self.folder_id).one()

        saved_folder_status.stop_sync()
        self.state = saved_folder_status.state

    def _report_initial_sync_start(self):
        with mailsync_session_scope() as db_session:
            q = db_session.query(Folder).get(self.folder_id)
            q.initial_sync_start = datetime.utcnow()

    def _report_initial_sync_end(self):
        with mailsync_session_scope() as db_session:
            q = db_session.query(Folder).get(self.folder_id)
            q.initial_sync_end = datetime.utcnow()

    @retry_crispin
    def initial_sync(self):
        log.bind(state='initial')
        log.info('starting initial sync')

        if self.is_first_sync:
            self._report_initial_sync_start()
            self.is_first_sync = False

        with self.conn_pool.get() as crispin_client:
            crispin_client.select_folder(self.folder_name, uidvalidity_cb)
            self.initial_sync_impl(crispin_client)

        if self.is_initial_sync:
            self._report_initial_sync_end()
            self.is_initial_sync = False

        return 'poll'

    @retry_crispin
    def poll(self):
        log.bind(state='poll')
        log.info('polling')
        self.poll_impl()
        return 'poll'

    @retry_crispin
    def resync_uids(self):
        log.bind(state=self.state)
        log.info('UIDVALIDITY changed')
        self.resync_uids_impl()
        return 'initial'

    def initial_sync_impl(self, crispin_client):
        # We wrap the block in a try/finally because the change_poller greenlet
        # needs to be killed when this greenlet is interrupted
        change_poller = None
        try:
            assert crispin_client.selected_folder_name == self.folder_name
            remote_uids = crispin_client.all_uids()
            with self.syncmanager_lock:
                with mailsync_session_scope() as db_session:
                    local_uids = common.all_uids(self.account_id, db_session,
                                                 self.folder_id)
                    self.remove_deleted_uids(db_session, local_uids,
                                             remote_uids)

            new_uids = set(remote_uids) - local_uids
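            # UIDStack: internal helper holding UIDs pending download
            # (it supports put/peekitem/pop/keys/empty, as used below).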
            download_stack = UIDStack()
            for uid in sorted(new_uids):
                download_stack.put(uid, GenericUIDMetadata(self.throttled))

            with mailsync_session_scope() as db_session:
                self.update_uid_counts(
                    db_session,
                    remote_uid_count=len(remote_uids),
                    # This is the initial size of our download_stack
                    download_uid_count=len(new_uids))

            change_poller = spawn(self.poll_for_changes, download_stack)
            bind_context(change_poller, 'changepoller', self.account_id,
                         self.folder_id)
            self.download_uids(crispin_client, download_stack)

        finally:
            if change_poller is not None:
                # schedule change_poller to die
                kill(change_poller)

    def poll_impl(self):
        with self.conn_pool.get() as crispin_client:
            crispin_client.select_folder(self.folder_name, uidvalidity_cb)
            download_stack = UIDStack()
            self.check_uid_changes(crispin_client, download_stack,
                                   async_download=False)
        sleep(self.poll_frequency)

    def resync_uids_impl(self):
        # NOTE: First, check whether the UIDVALIDITY change was spurious; if
        # it was, just discard it and move on. If it wasn't, drop the relevant
        # entries (filtered by account and folder IDs) from the imapuid table,
        # download messages, update UIDs where necessary (in case a message's
        # UID has changed), and discard orphaned messages. -siro
        with mailsync_session_scope() as db_session:
            folder_info = db_session.query(ImapFolderInfo). \
                filter_by(account_id=self.account_id,
                          folder_id=self.folder_id).one()
            cached_uidvalidity = folder_info.uidvalidity
            with self.conn_pool.get() as crispin_client:
                crispin_client.select_folder(self.folder_name,
                                             lambda *args: True)
                uidvalidity = crispin_client.selected_uidvalidity
                if uidvalidity <= cached_uidvalidity:
                    log.debug('UIDVALIDITY unchanged')
                    return
                invalid_uids = db_session.query(ImapUid). \
                    filter_by(account_id=self.account_id,
                              folder_id=self.folder_id)
                data_sha256_message = {uid.message.data_sha256: uid.message
                                       for uid in invalid_uids}
                for uid in invalid_uids:
                    db_session.delete(uid)
                # NOTE: this is necessary (and OK, since it doesn't persist
                # any data) to preserve the ordering between UID deletion and
                # insertion. Without it, I was seeing constraint violations
                # on the imapuid table. -siro
                db_session.flush()
                remote_uids = crispin_client.all_uids()
                for remote_uid in remote_uids:
                    raw_message = crispin_client.uids([remote_uid])[0]
                    data_sha256 = sha256(raw_message.body).hexdigest()
                    if data_sha256 in data_sha256_message:
                        message = data_sha256_message[data_sha256]

                        # Create a new imapuid
                        uid = ImapUid(msg_uid=raw_message.uid,
                                      message_id=message.id,
                                      account_id=self.account_id,
                                      folder_id=self.folder_id)
                        uid.update_flags(raw_message.flags)
                        db_session.add(uid)

                        # Update the existing message's metadata too
                        common.update_message_metadata(db_session, uid)

                        del data_sha256_message[data_sha256]
                    else:
                        self.download_and_commit_uids(crispin_client,
                                                      [remote_uid])
                    self.heartbeat_status.publish()
                    # FIXME: do we want to throttle the account when recovering
                    # from UIDVALIDITY changes? -siro
            for message in data_sha256_message.itervalues():
                db_session.delete(message)
            folder_info.uidvalidity = uidvalidity
            folder_info.highestmodseq = None

    @retry_crispin
    def poll_for_changes(self, download_stack):
        while True:
            with self.conn_pool.get() as crispin_client:
                crispin_client.select_folder(self.folder_name, uidvalidity_cb)
                self.check_uid_changes(crispin_client, download_stack,
                                       async_download=True)
            sleep(self.poll_frequency)

    def download_uids(self, crispin_client, download_stack):
        while not download_stack.empty():
            uid, metadata = download_stack.peekitem()
            self.download_and_commit_uids(crispin_client, [uid])
            download_stack.pop(uid)
            report_progress(self.account_id, self.folder_name, 1,
                            len(download_stack))
            self.heartbeat_status.publish()
            if self.throttled and metadata is not None and metadata.throttled:
                # Check to see if the account's throttled state has been
                # modified. If so, immediately accelerate.
                with mailsync_session_scope() as db_session:
                    acc = db_session.query(Account).get(self.account_id)
                    self.throttled = acc.throttled
                if self.throttled:
                    log.debug('throttled; sleeping')
                    sleep(THROTTLE_WAIT)

    def create_message(self, db_session, acct, folder, msg):
        assert acct is not None and acct.namespace is not None

        # Check if we somehow already saved the imapuid (shouldn't happen, but
        # possible due to a race condition). If so, don't commit changes.
        existing_imapuid = db_session.query(ImapUid).filter(
            ImapUid.account_id == acct.id, ImapUid.folder_id == folder.id,
            ImapUid.msg_uid == msg.uid).first()
        if existing_imapuid is not None:
            log.error('Expected to create imapuid, but existing row found',
                      remote_msg_uid=msg.uid,
                      existing_imapuid=existing_imapuid.id)
            return None

        new_uid = common.create_imap_message(db_session, log, acct, folder,
                                             msg)
        new_uid = self.add_message_attrs(db_session, new_uid, msg)

        # We're calling import_attached_events here instead of in some more
        # obvious place (like Message.create_from_synced) because the function
        # requires new_uid.message to have been flushed; the flush is needed
        # because import_attached_events does db lookups.
        if new_uid.message.has_attached_events:
            with db_session.no_autoflush:
                import_attached_events(db_session, acct, new_uid.message)

        # If we're in the polling state, report the latency between when the
        # message was received remotely and when it was created locally.
        if self.state == 'poll':
            latency_millis = (
                datetime.utcnow() - new_uid.message.received_date) \
                .total_seconds() * 1000
            statsd_client.timing(
                '.'.join(['accounts', 'overall', 'message_latency']),
                latency_millis)
            statsd_client.timing(
                '.'.join(['accounts', str(acct.id), 'message_latency']),
                latency_millis)

        return new_uid

    def _count_thread_messages(self, thread_id, db_session):
        count, = db_session.query(func.count(Message.id)). \
            filter(Message.thread_id == thread_id).one()
        return count

    def add_message_attrs(self, db_session, new_uid, msg):
        """ Post-create-message bits."""
        with db_session.no_autoflush:
            parent_thread = fetch_corresponding_thread(
                db_session, self.namespace_id, new_uid.message)
            construct_new_thread = True

            if parent_thread:
                # If there's a parent thread that isn't too long already,
                # add to it. Otherwise create a new thread.
                parent_message_count = self._count_thread_messages(
                    parent_thread.id, db_session)
                if parent_message_count < MAX_THREAD_LENGTH:
                    construct_new_thread = False

            if construct_new_thread:
                new_uid.message.thread = ImapThread.from_imap_message(
                    db_session, new_uid.account.namespace, new_uid.message)
            else:
                parent_thread.messages.append(new_uid.message)

        db_session.flush()
        return new_uid

    def remove_deleted_uids(self, db_session, local_uids, remote_uids):
        """
        Remove imapuid entries that no longer exist on the remote.

        Works as follows:
            1. Do a LIST on the current folder to see what messages are on
               the server.
            2. Compare to message uids stored locally.
            3. Purge uids we have locally but not on the server. Ignore
               remote uids that aren't saved locally.

        Make SURE to be holding `syncmanager_lock` when calling this function;
        we do not grab it here to allow callers to lock higher level
        functionality.

        """
        to_delete = set(local_uids) - set(remote_uids)
        common.remove_deleted_uids(self.account_id, db_session, to_delete,
                                   self.folder_id)

    def download_and_commit_uids(self, crispin_client, uids):
        raw_messages = crispin_client.uids(uids)
        if not raw_messages:
            return 0

        new_uids = set()
        with self.syncmanager_lock:
            # there is the possibility that another green thread has already
            # downloaded some message(s) from this batch... check within the
            # lock
            with mailsync_session_scope() as db_session:
                account = db_session.query(Account).get(self.account_id)
                folder = db_session.query(Folder).get(self.folder_id)
                for msg in raw_messages:
                    uid = self.create_message(db_session, account, folder,
                                              msg)
                    if uid is not None:
                        db_session.add(uid)
                        db_session.flush()
                        new_uids.add(uid)
                db_session.commit()

        return len(new_uids)

    def update_metadata(self, crispin_client, updated):
        """ Update flags (the only metadata that can change). """
        # bigger chunk because the data being fetched here is very small
        for uids in chunk(updated, 5 * crispin_client.CHUNK_SIZE):
            new_flags = crispin_client.flags(uids)
            # Messages can disappear in the meantime; we'll update them next
            # sync.
            uids = [uid for uid in uids if uid in new_flags]
            with self.syncmanager_lock:
                with mailsync_session_scope() as db_session:
                    common.update_metadata(self.account_id, db_session,
                                           self.folder_name, self.folder_id,
                                           uids, new_flags)
                    db_session.commit()

    def update_uid_counts(self, db_session, **kwargs):
        saved_status = db_session.query(ImapFolderSyncStatus).join(Folder). \
            filter(ImapFolderSyncStatus.account_id == self.account_id,
                   Folder.name == self.folder_name).one()
        # We're not updating the current_remote_count metric
        # so don't update uid_checked_timestamp.
        if kwargs.get('remote_uid_count') is None:
            saved_status.update_metrics(kwargs)
        else:
            metrics = dict(uid_checked_timestamp=datetime.utcnow())
            metrics.update(kwargs)
            saved_status.update_metrics(metrics)

    def check_uid_changes(self, crispin_client, download_stack,
                          async_download):
        remote_uids = set(crispin_client.all_uids())
        with self.syncmanager_lock:
            with mailsync_session_scope() as db_session:
                local_uids = common.all_uids(self.account_id, db_session,
                                             self.folder_id)
                # Download new UIDs.
                stack_uids = set(download_stack.keys())
                local_with_pending_uids = local_uids | stack_uids
                for uid in sorted(remote_uids):
                    if uid not in local_with_pending_uids:
                        download_stack.put(uid, None)
                self.remove_deleted_uids(db_session, local_uids, remote_uids)
        if not async_download:
            self.download_uids(crispin_client, download_stack)
            with mailsync_session_scope() as db_session:
                self.update_uid_counts(
                    db_session,
                    remote_uid_count=len(remote_uids),
                    download_uid_count=len(download_stack))
        to_refresh = sorted(remote_uids &
                            local_uids)[-self.refresh_flags_max:]
        self.update_metadata(crispin_client, to_refresh)
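A quick aside on the bookkeeping above: check_uid_changes and
remove_deleted_uids boil down to plain set arithmetic over UIDs. The sketch
below is a minimal, standalone restatement of that logic (reconcile_uids and
pending_uids are illustrative names, not part of the engine's API): anything
the server reports that is neither stored locally nor already queued gets
downloaded, and anything stored locally that the server no longer reports
gets purged.

def reconcile_uids(remote_uids, local_uids, pending_uids):
    """Return (uids_to_download, uids_to_delete) as sorted lists."""
    remote = set(remote_uids)
    local = set(local_uids)
    to_download = sorted(remote - (local | set(pending_uids)))
    to_delete = sorted(local - remote)
    return to_download, to_delete

# UIDs 5 and 6 are new on the server; UID 1 was expunged remotely.
assert reconcile_uids([2, 3, 4, 5, 6], [1, 2, 3, 4], []) == ([5, 6], [1])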
Example #8
    def setup_heartbeats(self):
        self.heartbeat_status = HeartbeatStatusProxy(self.account_id,
                                                     self.folder_id,
                                                     self.folder_name,
                                                     self.email_address,
                                                     self.provider_name)
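The message-latency metric that create_message reports in the examples here
is just the elapsed time between the message's received date and the moment
the sync engine committed it, in milliseconds. A minimal standalone sketch,
where emit stands in for statsd_client.timing:

from datetime import datetime, timedelta

def message_latency_millis(received_date, now=None):
    now = now or datetime.utcnow()
    return (now - received_date).total_seconds() * 1000

def report_message_latency(emit, account_id, received_date):
    latency = message_latency_millis(received_date)
    for key in ('accounts.overall.message_latency',
                'accounts.%s.message_latency' % account_id):
        emit(key, latency)

# A message received 1.5 seconds ago reports roughly 1500 ms.
received = datetime.utcnow() - timedelta(seconds=1.5)
assert 1400 < message_latency_millis(received) < 1600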
Example #9
class BaseSyncMonitor(Greenlet):
    """
    Abstracted sync monitor, based on BaseMailSyncMonitor but not mail-specific

    Subclasses should run
    bind_context(self, 'mailsyncmonitor', account.id)

    poll_frequency : int
        How often to check for commands.

    """
    def __init__(self, account_id, namespace_id, email_address, folder_id,
                 folder_name, provider_name, poll_frequency=1,
                 scope=None):

        self.account_id = account_id
        self.namespace_id = namespace_id
        self.poll_frequency = poll_frequency
        self.scope = scope

        self.log = logger.new(account_id=account_id)

        self.shutdown = event.Event()
        self.heartbeat_status = HeartbeatStatusProxy(self.account_id,
                                                     folder_id,
                                                     folder_name,
                                                     email_address,
                                                     provider_name)
        Greenlet.__init__(self)

    def _run(self):
        # Bind greenlet-local logging context.
        self.log = self.log.new(account_id=self.account_id)
        return retry_with_logging(self._run_impl, account_id=self.account_id,
                                  logger=self.log)

    def _run_impl(self):
        # Return True/False based on whether the greenlet succeeds or throws
        # an error. Note this is not how the mailsync monitor works.
        try:
            while True:
                # Check to see if this greenlet should exit
                if self.shutdown.is_set():
                    self._cleanup()
                    return False

                try:
                    self.sync()
                    self.heartbeat_status.publish(state='poll')

                # If we get a connection or API permissions error, then sleep
                # 2x poll frequency.
                except ConnectionError:
                    self.log.error('Error while polling', exc_info=True)
                    self.heartbeat_status.publish(state='poll error')
                    sleep(self.poll_frequency)
                sleep(self.poll_frequency)
        except ValidationError:
            # Bad account credentials; exit.
            self.log.error('Error while establishing the connection',
                           account_id=self.account_id,
                           exc_info=True, logstash_tag='mark_invalid')
            self._cleanup()
            with session_scope(self.namespace_id) as db_session:
                account = db_session.query(Account).get(self.account_id)
                account.mark_invalid(scope=self.scope)

            return False

    def sync(self):
        """ Subclasses should override this to do work """
        raise NotImplementedError

    def _cleanup(self):
        self.heartbeat_status.clear()
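BaseSyncMonitor is meant to be subclassed: the base class owns the retry
loop, heartbeat reporting and shutdown handling, and concrete monitors only
provide sync(). A hypothetical minimal subclass, assuming the same imports as
the example above (ExampleSyncMonitor and its poll_frequency value are
illustrative, not part of the codebase):

class ExampleSyncMonitor(BaseSyncMonitor):
    def __init__(self, account_id, namespace_id, email_address,
                 folder_id, folder_name, provider_name):
        BaseSyncMonitor.__init__(
            self, account_id, namespace_id, email_address, folder_id,
            folder_name, provider_name, poll_frequency=120, scope='example')

    def sync(self):
        # Fetch and persist one batch of changes here; _run_impl takes care
        # of sleeping, heartbeats, and error handling around this call.
        pass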
Example #10
class BaseSyncMonitor(Greenlet):
    """
    Abstracted sync monitor, based on BaseMailSyncMonitor but not mail-specific

    Subclasses should run
    bind_context(self, 'mailsyncmonitor', account.id)

    poll_frequency : int
        How often to check for commands.

    """

    def __init__(self, account_id, namespace_id, email_address, folder_id,
                 folder_name, provider_name, poll_frequency=1,
                 scope=None):

        self.account_id = account_id
        self.namespace_id = namespace_id
        self.provider_name = provider_name
        self.poll_frequency = poll_frequency
        self.scope = scope

        self.log = logger.new(account_id=account_id)

        self.shutdown = event.Event()
        self.heartbeat_status = HeartbeatStatusProxy(self.account_id,
                                                     folder_id,
                                                     folder_name,
                                                     email_address,
                                                     provider_name)
        Greenlet.__init__(self)

    def _run(self):
        # Bind greenlet-local logging context.
        self.log = self.log.new(account_id=self.account_id)
        return retry_with_logging(self._run_impl, account_id=self.account_id,
                                  provider=self.provider_name, logger=self.log)

    def _run_impl(self):
        # Return True/False based on whether the greenlet succeeds or throws
        # an error. Note this is not how the mailsync monitor works.
        try:
            while True:
                # Check to see if this greenlet should exit
                if self.shutdown.is_set():
                    self._cleanup()
                    return False

                try:
                    self.sync()
                    self.heartbeat_status.publish(state='poll')

                # If we get a connection or API permissions error, then sleep
                # 2x poll frequency.
                except ConnectionError:
                    self.log.error('Error while polling', exc_info=True)
                    self.heartbeat_status.publish(state='poll error')
                    sleep(self.poll_frequency)
                sleep(self.poll_frequency)
        except ValidationError:
            # Bad account credentials; exit.
            self.log.error('Error while establishing the connection',
                           account_id=self.account_id,
                           exc_info=True, logstash_tag='mark_invalid')
            self._cleanup()
            with session_scope(self.namespace_id) as db_session:
                account = db_session.query(Account).get(self.account_id)
                account.mark_invalid(scope=self.scope)

            return False

    def sync(self):
        """ Subclasses should override this to do work """
        raise NotImplementedError

    def _cleanup(self):
        self.heartbeat_status.clear()
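One detail worth calling out in _run_impl above: a ConnectionError sleeps
once inside the except block and then again at the bottom of the loop, so a
failed cycle waits 2x poll_frequency while a successful one waits 1x. A
standalone sketch of that cadence (poll_once, failing_sync and the local
ConnectionError class are illustrative, so the loop body can be exercised
without gevent):

class ConnectionError(Exception):
    pass

def poll_once(do_sync, sleep, poll_frequency):
    try:
        do_sync()
    except ConnectionError:
        sleep(poll_frequency)  # back off once after the failure...
    sleep(poll_frequency)      # ...plus the regular polling interval

waits = []
def failing_sync():
    raise ConnectionError()

poll_once(failing_sync, waits.append, 30)
assert sum(waits) == 60  # a failed cycle sleeps 2x poll_frequency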
Example #12
class FolderSyncEngine(Greenlet):
    """Base class for a per-folder IMAP sync engine."""

    def __init__(self, account_id, namespace_id, folder_name,
                 email_address, provider_name, syncmanager_lock):

        with session_scope(namespace_id) as db_session:
            try:
                folder = db_session.query(Folder). \
                    filter(Folder.name == folder_name,
                           Folder.account_id == account_id).one()
            except NoResultFound:
                raise MailsyncError(u"Missing Folder '{}' on account {}"
                                    .format(folder_name, account_id))

            self.folder_id = folder.id
            self.folder_role = folder.canonical_name
            # Metric flags for sync performance
            self.is_initial_sync = folder.initial_sync_end is None
            self.is_first_sync = folder.initial_sync_start is None
            self.is_first_message = self.is_first_sync

        bind_context(self, 'foldersyncengine', account_id, self.folder_id)
        self.account_id = account_id
        self.namespace_id = namespace_id
        self.folder_name = folder_name
        self.email_address = email_address

        if self.folder_name.lower() == 'inbox':
            self.poll_frequency = INBOX_POLL_FREQUENCY
        else:
            self.poll_frequency = DEFAULT_POLL_FREQUENCY
        self.syncmanager_lock = syncmanager_lock
        self.state = None
        self.provider_name = provider_name
        self.last_fast_refresh = None
        self.flags_fetch_results = {}
        self.conn_pool = connection_pool(self.account_id)

        self.state_handlers = {
            'initial': self.initial_sync,
            'initial uidinvalid': self.resync_uids,
            'poll': self.poll,
            'poll uidinvalid': self.resync_uids,
        }

        self.setup_heartbeats()
        Greenlet.__init__(self)

        # Some generic IMAP servers throw UIDVALIDITY errors forever. Instead
        # of resyncing those servers indefinitely, we keep track of the number
        # of consecutive times we've hit such an error and bail out once it
        # exceeds MAX_UIDINVALID_RESYNCS.
        self.uidinvalid_count = 0

    def setup_heartbeats(self):
        self.heartbeat_status = HeartbeatStatusProxy(self.account_id,
                                                     self.folder_id,
                                                     self.folder_name,
                                                     self.email_address,
                                                     self.provider_name)

    def _run(self):
        # Bind greenlet-local logging context.
        self.log = log.new(account_id=self.account_id, folder=self.folder_name,
                           provider=self.provider_name)
        # eagerly signal the sync status
        self.heartbeat_status.publish()

        try:
            self.update_folder_sync_status(lambda s: s.start_sync())
        except IntegrityError:
            # The state insert failed because the folder ID ForeignKey
            # was no longer valid, i.e. the folder for this engine was deleted
            # while we were starting up.
            # Exit the sync and let the monitor sort things out.
            log.info("Folder state loading failed due to IntegrityError",
                     folder_id=self.folder_id, account_id=self.account_id)
            raise MailsyncDone()

        # NOTE: The parent ImapSyncMonitor handler could kill us at any
        # time if it receives a shutdown command. The shutdown command is
        # equivalent to ctrl-c.
        while True:
            retry_with_logging(self._run_impl, account_id=self.account_id,
                               provider=self.provider_name, logger=log)

    def _run_impl(self):
        old_state = self.state
        try:
            self.state = self.state_handlers[old_state]()
            self.heartbeat_status.publish(state=self.state)
        except UidInvalid:
            self.state = self.state + ' uidinvalid'
            self.uidinvalid_count += 1
            self.heartbeat_status.publish(state=self.state)

            # Check that we're not stuck in an endless uidinvalidity resync loop.
            if self.uidinvalid_count > MAX_UIDINVALID_RESYNCS:
                log.error('Resynced more than MAX_UIDINVALID_RESYNCS in a'
                          ' row. Stopping sync.')

                with session_scope(self.namespace_id) as db_session:
                    account = db_session.query(Account).get(self.account_id)
                    account.disable_sync('Detected endless uidvalidity '
                                         'resync loop')
                    account.sync_state = 'stopped'
                    db_session.commit()

                raise MailsyncDone()

        except FolderMissingError:
            # Folder was deleted by monitor while its sync was running.
            # TODO: Monitor should handle shutting down the folder engine.
            log.info('Folder disappeared. Stopping sync.',
                     account_id=self.account_id, folder_id=self.folder_id)
            raise MailsyncDone()
        except ValidationError as exc:
            log.error('Error authenticating; stopping sync', exc_info=True,
                      account_id=self.account_id, folder_id=self.folder_id,
                      logstash_tag='mark_invalid')
            with session_scope(self.namespace_id) as db_session:
                account = db_session.query(Account).get(self.account_id)
                account.mark_invalid()
                account.update_sync_error(exc)
            raise MailsyncDone()

        # State handlers are idempotent, so it's okay if we're
        # killed between the end of the handler and the commit.
        if self.state != old_state:
            def update(status):
                status.state = self.state
            self.update_folder_sync_status(update)

        if self.state == old_state and self.state in ['initial', 'poll']:
            # We've been through a normal state transition without raising any
            # error. It's safe to reset the uidinvalid counter.
            self.uidinvalid_count = 0

    def update_folder_sync_status(self, cb):
        # Loads the folder sync status and invokes the provided callback to
        # modify it. Commits any changes and updates `self.state` to ensure
        # they are never out of sync.
        with session_scope(self.namespace_id) as db_session:
            try:
                state = ImapFolderSyncStatus.state
                saved_folder_status = db_session.query(ImapFolderSyncStatus)\
                    .filter_by(account_id=self.account_id, folder_id=self.folder_id)\
                    .options(load_only(state)).one()
            except NoResultFound:
                saved_folder_status = ImapFolderSyncStatus(
                    account_id=self.account_id, folder_id=self.folder_id)
                db_session.add(saved_folder_status)

            cb(saved_folder_status)
            db_session.commit()

            self.state = saved_folder_status.state

    def set_stopped(self, db_session):
        self.update_folder_sync_status(lambda s: s.stop_sync())

    def _report_initial_sync_start(self):
        with session_scope(self.namespace_id) as db_session:
            q = db_session.query(Folder).get(self.folder_id)
            q.initial_sync_start = datetime.utcnow()

    def _report_initial_sync_end(self):
        with session_scope(self.namespace_id) as db_session:
            q = db_session.query(Folder).get(self.folder_id)
            q.initial_sync_end = datetime.utcnow()

    @retry_crispin
    def initial_sync(self):
        log.bind(state='initial')
        log.info('starting initial sync')

        if self.is_first_sync:
            self._report_initial_sync_start()
            self.is_first_sync = False

        with self.conn_pool.get() as crispin_client:
            crispin_client.select_folder(self.folder_name, self.uidvalidity_cb)
            # Ensure we have an ImapFolderInfo row created prior to sync start.
            with session_scope(self.namespace_id) as db_session:
                try:
                    db_session.query(ImapFolderInfo). \
                        filter(ImapFolderInfo.account_id == self.account_id,
                               ImapFolderInfo.folder_id == self.folder_id). \
                        one()
                except NoResultFound:
                    imapfolderinfo = ImapFolderInfo(
                        account_id=self.account_id, folder_id=self.folder_id,
                        uidvalidity=crispin_client.selected_uidvalidity,
                        uidnext=crispin_client.selected_uidnext)
                    db_session.add(imapfolderinfo)
                db_session.commit()

            self.initial_sync_impl(crispin_client)

        if self.is_initial_sync:
            self._report_initial_sync_end()
            self.is_initial_sync = False

        return 'poll'

    @retry_crispin
    def poll(self):
        log.bind(state='poll')
        log.info('polling')
        self.poll_impl()
        return 'poll'

    @retry_crispin
    def resync_uids(self):
        log.bind(state=self.state)
        log.warning('UIDVALIDITY changed; initiating resync')
        self.resync_uids_impl()
        return 'initial'

    def initial_sync_impl(self, crispin_client):
        # We wrap the block in a try/finally because the change_poller greenlet
        # needs to be killed when this greenlet is interrupted
        change_poller = None
        try:
            assert crispin_client.selected_folder_name == self.folder_name
            remote_uids = crispin_client.all_uids()
            with self.syncmanager_lock:
                with session_scope(self.namespace_id) as db_session:
                    local_uids = common.local_uids(self.account_id, db_session,
                                                   self.folder_id)
                common.remove_deleted_uids(
                    self.account_id, self.folder_id,
                    set(local_uids).difference(remote_uids))

            new_uids = set(remote_uids).difference(local_uids)
            with session_scope(self.namespace_id) as db_session:
                account = db_session.query(Account).get(self.account_id)
                throttled = account.throttled
                self.update_uid_counts(
                    db_session,
                    remote_uid_count=len(remote_uids),
                    # This is the initial size of our download_queue
                    download_uid_count=len(new_uids))

            change_poller = gevent.spawn(self.poll_for_changes)
            bind_context(change_poller, 'changepoller', self.account_id,
                         self.folder_id)
            uids = sorted(new_uids, reverse=True)
            count = 0
            for uid in uids:
                # The speedup from batching appears to be less clear for
                # non-Gmail accounts, so for now just download one-at-a-time.
                self.download_and_commit_uids(crispin_client, [uid])
                self.heartbeat_status.publish()
                count += 1
                if throttled and count >= THROTTLE_COUNT:
                    # Throttled accounts' folders sync at a rate of
                    # 1 message/minute after the first approx. THROTTLE_COUNT
                    # messages per folder are synced.
                    # Note this is an approximate limit, since we count
                    # UIDs rather than messages.
                    gevent.sleep(THROTTLE_WAIT)
        finally:
            if change_poller is not None:
                # schedule change_poller to die
                gevent.kill(change_poller)

    def should_idle(self, crispin_client):
        if not hasattr(self, '_should_idle'):
            self._should_idle = (
                crispin_client.idle_supported() and self.folder_name in
                crispin_client.folder_names()['inbox']
            )
        return self._should_idle

    def poll_impl(self):
        with self.conn_pool.get() as crispin_client:
            self.check_uid_changes(crispin_client)
            if self.should_idle(crispin_client):
                crispin_client.select_folder(self.folder_name,
                                             self.uidvalidity_cb)
                idling = True
                try:
                    crispin_client.idle(IDLE_WAIT)
                except Exception as exc:
                    # With some servers we get e.g.
                    # 'Unexpected IDLE response: * FLAGS  (...)'
                    if isinstance(exc, imaplib.IMAP4.error) and \
                            exc.message.startswith('Unexpected IDLE response'):
                        log.info('Error initiating IDLE, not idling',
                                 error=exc)
                        try:
                            # Still have to take the connection out of IDLE
                            # mode to reuse it though.
                            crispin_client.conn.idle_done()
                        except AttributeError:
                            pass
                        idling = False
                    else:
                        raise
            else:
                idling = False
        # Close IMAP connection before sleeping
        if not idling:
            gevent.sleep(self.poll_frequency)

    def resync_uids_impl(self):
        # First, check whether the UIDVALIDITY change was spurious; if it was,
        # just discard it and go on.
        with self.conn_pool.get() as crispin_client:
            crispin_client.select_folder(self.folder_name, lambda *args: True)
            remote_uidvalidity = crispin_client.selected_uidvalidity
            remote_uidnext = crispin_client.selected_uidnext
            if remote_uidvalidity <= self.uidvalidity:
                log.debug('UIDVALIDITY unchanged')
                return
        # Otherwise, if the UIDVALIDITY really has changed, discard all saved
        # UIDs for the folder, mark associated messages for garbage-collection,
        # and return to the 'initial' state to resync.
        # This will cause messages and threads to be deleted and recreated,
        # but UIDVALIDITY invalidation is sufficiently rare that this tradeoff
        # is acceptable.
        with session_scope(self.namespace_id) as db_session:
            invalid_uids = {
                uid for uid, in db_session.query(ImapUid.msg_uid).
                filter_by(account_id=self.account_id,
                          folder_id=self.folder_id)
            }
        common.remove_deleted_uids(self.account_id, self.folder_id,
                                   invalid_uids)
        self.uidvalidity = remote_uidvalidity
        self.highestmodseq = None
        self.uidnext = remote_uidnext

    @retry_crispin
    def poll_for_changes(self):
        log.new(account_id=self.account_id, folder=self.folder_name)
        while True:
            log.info('polling for changes')
            self.poll_impl()

    def create_message(self, db_session, acct, folder, msg):
        assert acct is not None and acct.namespace is not None

        # Check if we somehow already saved the imapuid (shouldn't happen, but
        # possible due to race condition). If so, don't commit changes.
        existing_imapuid = db_session.query(ImapUid).filter(
            ImapUid.account_id == acct.id, ImapUid.folder_id == folder.id,
            ImapUid.msg_uid == msg.uid).first()
        if existing_imapuid is not None:
            log.error('Expected to create imapuid, but existing row found',
                      remote_msg_uid=msg.uid,
                      existing_imapuid=existing_imapuid.id)
            return None

        # Check if the message is valid.
        # https://sentry.nylas.com/sentry/sync-prod/group/3387/
        if msg.body is None:
            log.warning('Server returned a message with an empty body.')
            return None

        new_uid = common.create_imap_message(db_session, acct, folder, msg)
        self.add_message_to_thread(db_session, new_uid.message, msg)

        db_session.flush()

        # We're calling import_attached_events here instead of some more
        # obvious place (like Message.create_from_synced) because the function
        # requires new_uid.message to have been flushed.
        # This is necessary because import_attached_events does db lookups.
        if new_uid.message.has_attached_events:
            with db_session.no_autoflush:
                import_attached_events(db_session, acct, new_uid.message)

        # If we're in the polling state, report the latency between when the
        # message was received and when we created it.
        if self.state == 'poll':
            latency_millis = (
                datetime.utcnow() - new_uid.message.received_date) \
                .total_seconds() * 1000
            metrics = [
                '.'.join(['accounts', 'overall', 'message_latency']),
                '.'.join(['providers', self.provider_name, 'message_latency']),
            ]
            for metric in metrics:
                statsd_client.timing(metric, latency_millis)

        return new_uid

    def _count_thread_messages(self, thread_id, db_session):
        count, = db_session.query(func.count(Message.id)). \
            filter(Message.thread_id == thread_id).one()
        return count

    def add_message_to_thread(self, db_session, message_obj, raw_message):
        """Associate message_obj to the right Thread object, creating a new
        thread if necessary."""
        with db_session.no_autoflush:
            # Disable autoflush so we don't try to flush a message with null
            # thread_id.
            parent_thread = fetch_corresponding_thread(
                db_session, self.namespace_id, message_obj)
            construct_new_thread = True

            if parent_thread:
                # If there's a parent thread that isn't too long already,
                # add to it. Otherwise create a new thread.
                parent_message_count = self._count_thread_messages(
                    parent_thread.id, db_session)
                if parent_message_count < MAX_THREAD_LENGTH:
                    construct_new_thread = False

            if construct_new_thread:
                message_obj.thread = ImapThread.from_imap_message(
                    db_session, self.namespace_id, message_obj)
            else:
                parent_thread.messages.append(message_obj)

    def download_and_commit_uids(self, crispin_client, uids):
        start = datetime.utcnow()
        raw_messages = crispin_client.uids(uids)
        if not raw_messages:
            return 0

        new_uids = set()
        with self.syncmanager_lock:
            with session_scope(self.namespace_id) as db_session:
                account = Account.get(self.account_id, db_session)
                folder = Folder.get(self.folder_id, db_session)
                for msg in raw_messages:
                    uid = self.create_message(db_session, account,
                                              folder, msg)
                    if uid is not None:
                        db_session.add(uid)
                        db_session.flush()
                        new_uids.add(uid)
                db_session.commit()

        log.info('Committed new UIDs',
                 new_committed_message_count=len(new_uids))
        # If we downloaded uids, record message velocity (#uid / latency)
        if self.state == 'initial' and len(new_uids):
            self._report_message_velocity(datetime.utcnow() - start,
                                          len(new_uids))
        if self.is_first_message:
            self._report_first_message()
            self.is_first_message = False

        return len(new_uids)

    def _report_first_message(self):
        now = datetime.utcnow()

        with session_scope(self.namespace_id) as db_session:
            account = db_session.query(Account).get(self.account_id)
            account_created = account.created_at

        latency = (now - account_created).total_seconds() * 1000
        metrics = [
            '.'.join(['providers', self.provider_name, 'first_message']),
            '.'.join(['providers', 'overall', 'first_message'])
        ]

        for metric in metrics:
            statsd_client.timing(metric, latency)

    def _report_message_velocity(self, timedelta, num_uids):
        latency = (timedelta).total_seconds() * 1000
        latency_per_uid = float(latency) / num_uids
        metrics = [
            '.'.join(['providers', self.provider_name,
                      'message_velocity']),
            '.'.join(['providers', 'overall', 'message_velocity'])
        ]
        for metric in metrics:
            statsd_client.timing(metric, latency_per_uid)

    def update_uid_counts(self, db_session, **kwargs):
        saved_status = db_session.query(ImapFolderSyncStatus).join(Folder). \
            filter(ImapFolderSyncStatus.account_id == self.account_id,
                   Folder.name == self.folder_name).one()
        # We're not updating the current_remote_count metric
        # so don't update uid_checked_timestamp.
        if kwargs.get('remote_uid_count') is None:
            saved_status.update_metrics(kwargs)
        else:
            metrics = dict(uid_checked_timestamp=datetime.utcnow())
            metrics.update(kwargs)
            saved_status.update_metrics(metrics)

    def get_new_uids(self, crispin_client):
        try:
            remote_uidnext = crispin_client.conn.folder_status(
                self.folder_name, ['UIDNEXT']).get('UIDNEXT')
        except ValueError:
            # Work around issue where ValueError is raised on parsing STATUS
            # response.
            log.warning('Error getting UIDNEXT', exc_info=True)
            remote_uidnext = None
        except imaplib.IMAP4.error as e:
            if '[NONEXISTENT]' in e.message:
                raise FolderMissingError()
            else:
                raise e
        if remote_uidnext is not None and remote_uidnext == self.uidnext:
            return
        log.info('UIDNEXT changed, checking for new UIDs',
                 remote_uidnext=remote_uidnext, saved_uidnext=self.uidnext)

        crispin_client.select_folder(self.folder_name, self.uidvalidity_cb)
        with session_scope(self.namespace_id) as db_session:
            lastseenuid = common.lastseenuid(self.account_id, db_session,
                                             self.folder_id)
        latest_uids = crispin_client.conn.fetch('{}:*'.format(lastseenuid + 1),
                                                ['UID']).keys()
        new_uids = set(latest_uids) - {lastseenuid}
        if new_uids:
            for uid in sorted(new_uids):
                self.download_and_commit_uids(crispin_client, [uid])
        self.uidnext = remote_uidnext

    def condstore_refresh_flags(self, crispin_client):
        new_highestmodseq = crispin_client.conn.folder_status(
            self.folder_name, ['HIGHESTMODSEQ'])['HIGHESTMODSEQ']
        # Ensure that we have an initial highestmodseq value stored before we
        # begin polling for changes.
        if self.highestmodseq is None:
            self.highestmodseq = new_highestmodseq

        if new_highestmodseq == self.highestmodseq:
            # Don't need to do anything if the highestmodseq hasn't
            # changed.
            return
        elif new_highestmodseq < self.highestmodseq:
            # This should really never happen, but if it does, handle it.
            log.warning('got server highestmodseq less than saved '
                        'highestmodseq',
                        new_highestmodseq=new_highestmodseq,
                        saved_highestmodseq=self.highestmodseq)
            return

        log.info('HIGHESTMODSEQ has changed, getting changed UIDs',
                 new_highestmodseq=new_highestmodseq,
                 saved_highestmodseq=self.highestmodseq)
        crispin_client.select_folder(self.folder_name, self.uidvalidity_cb)
        changed_flags = crispin_client.condstore_changed_flags(
            self.highestmodseq)
        remote_uids = crispin_client.all_uids()

        # In order to be able to sync changes to tens of thousands of flags at
        # once, we commit updates in batches. We do this in ascending order by
        # modseq and periodically "checkpoint" our saved highestmodseq. (It's
        # safe to checkpoint *because* we go in ascending order by modseq.)
        # That way if the process gets restarted halfway through this refresh,
        # we don't have to completely start over. It's also slow to load many
        # objects into the SQLAlchemy session and then issue lots of commits;
        # we avoid that by batching.
        flag_batches = chunk(
            sorted(changed_flags.items(), key=lambda (k, v): v.modseq),
            CONDSTORE_FLAGS_REFRESH_BATCH_SIZE)
        for flag_batch in flag_batches:
            with session_scope(self.namespace_id) as db_session:
                common.update_metadata(self.account_id, self.folder_id,
                                       self.folder_role, dict(flag_batch),
                                       db_session)
            if len(flag_batch) == CONDSTORE_FLAGS_REFRESH_BATCH_SIZE:
                interim_highestmodseq = max(v.modseq for k, v in flag_batch)
                self.highestmodseq = interim_highestmodseq

        with session_scope(self.namespace_id) as db_session:
            local_uids = common.local_uids(self.account_id, db_session,
                                           self.folder_id)
            expunged_uids = set(local_uids).difference(remote_uids)

        if expunged_uids:
            # If new UIDs have appeared since we last checked in
            # get_new_uids, save them first. We want to always have the
            # latest UIDs before expunging anything, in order to properly
            # capture draft revisions.
            with session_scope(self.namespace_id) as db_session:
                lastseenuid = common.lastseenuid(self.account_id, db_session,
                                                 self.folder_id)
            if remote_uids and lastseenuid < max(remote_uids):
                log.info('Downloading new UIDs before expunging')
                self.get_new_uids(crispin_client)
            common.remove_deleted_uids(self.account_id, self.folder_id,
                                       expunged_uids)
        self.highestmodseq = new_highestmodseq

    def generic_refresh_flags(self, crispin_client):
        now = datetime.utcnow()
        slow_refresh_due = (
            self.last_slow_refresh is None or
            now > self.last_slow_refresh + SLOW_REFRESH_INTERVAL
        )
        fast_refresh_due = (
            self.last_fast_refresh is None or
            now > self.last_fast_refresh + FAST_REFRESH_INTERVAL
        )
        if slow_refresh_due:
            self.refresh_flags_impl(crispin_client, SLOW_FLAGS_REFRESH_LIMIT)
            self.last_slow_refresh = datetime.utcnow()
        elif fast_refresh_due:
            self.refresh_flags_impl(crispin_client, FAST_FLAGS_REFRESH_LIMIT)
            self.last_fast_refresh = datetime.utcnow()

    def refresh_flags_impl(self, crispin_client, max_uids):
        crispin_client.select_folder(self.folder_name, self.uidvalidity_cb)
        with session_scope(self.namespace_id) as db_session:
            local_uids = common.local_uids(account_id=self.account_id,
                                           session=db_session,
                                           folder_id=self.folder_id,
                                           limit=max_uids)

        flags = crispin_client.flags(local_uids)
        if (max_uids in self.flags_fetch_results and
                self.flags_fetch_results[max_uids] == (local_uids, flags)):
            # If the flags fetch response is exactly the same as the last one
            # we got, then we don't need to persist any changes.
            log.debug('Unchanged flags refresh response, '
                      'not persisting changes', max_uids=max_uids)
            return
        log.debug('Changed flags refresh response, persisting changes',
                  max_uids=max_uids)
        expunged_uids = set(local_uids).difference(flags.keys())
        common.remove_deleted_uids(self.account_id, self.folder_id,
                                   expunged_uids)
        with session_scope(self.namespace_id) as db_session:
            common.update_metadata(self.account_id, self.folder_id,
                                   self.folder_role, flags, db_session)
        self.flags_fetch_results[max_uids] = (local_uids, flags)

    def check_uid_changes(self, crispin_client):
        self.get_new_uids(crispin_client)
        if crispin_client.condstore_supported():
            self.condstore_refresh_flags(crispin_client)
        else:
            self.generic_refresh_flags(crispin_client)

    @property
    def uidvalidity(self):
        if not hasattr(self, '_uidvalidity'):
            self._uidvalidity = self._load_imap_folder_info().uidvalidity
        return self._uidvalidity

    @uidvalidity.setter
    def uidvalidity(self, value):
        self._update_imap_folder_info('uidvalidity', value)
        self._uidvalidity = value

    @property
    def uidnext(self):
        if not hasattr(self, '_uidnext'):
            self._uidnext = self._load_imap_folder_info().uidnext
        return self._uidnext

    @uidnext.setter
    def uidnext(self, value):
        self._update_imap_folder_info('uidnext', value)
        self._uidnext = value

    @property
    def last_slow_refresh(self):
        # We persist the last_slow_refresh timestamp so that we don't end up
        # doing a (potentially expensive) full flags refresh for every account
        # on every process restart.
        if not hasattr(self, '_last_slow_refresh'):
            self._last_slow_refresh = self._load_imap_folder_info(). \
                last_slow_refresh
        return self._last_slow_refresh

    @last_slow_refresh.setter
    def last_slow_refresh(self, value):
        self._update_imap_folder_info('last_slow_refresh', value)
        self._last_slow_refresh = value

    @property
    def highestmodseq(self):
        if not hasattr(self, '_highestmodseq'):
            self._highestmodseq = self._load_imap_folder_info().highestmodseq
        return self._highestmodseq

    @highestmodseq.setter
    def highestmodseq(self, value):
        self._highestmodseq = value
        self._update_imap_folder_info('highestmodseq', value)

    def _load_imap_folder_info(self):
        with session_scope(self.namespace_id) as db_session:
            imapfolderinfo = db_session.query(ImapFolderInfo). \
                filter(ImapFolderInfo.account_id == self.account_id,
                       ImapFolderInfo.folder_id == self.folder_id). \
                one()
            db_session.expunge(imapfolderinfo)
            return imapfolderinfo

    def _update_imap_folder_info(self, attrname, value):
        with session_scope(self.namespace_id) as db_session:
            imapfolderinfo = db_session.query(ImapFolderInfo). \
                filter(ImapFolderInfo.account_id == self.account_id,
                       ImapFolderInfo.folder_id == self.folder_id). \
                one()
            setattr(imapfolderinfo, attrname, value)
            db_session.commit()

    def uidvalidity_cb(self, account_id, folder_name, select_info):
        assert folder_name == self.folder_name
        assert account_id == self.account_id
        selected_uidvalidity = select_info['UIDVALIDITY']
        is_valid = (self.uidvalidity is None or
                    selected_uidvalidity <= self.uidvalidity)
        if not is_valid:
            raise UidInvalid(
                'folder: {}, remote uidvalidity: {}, '
                'cached uidvalidity: {}'.format(folder_name.encode('utf-8'),
                                                selected_uidvalidity,
                                                self.uidvalidity))
        return select_info
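The batching comment in condstore_refresh_flags above is the crux of that
method: because updates are committed in ascending modseq order, the saved
highestmodseq can be advanced after every full batch, and a restarted process
resumes from the last checkpoint instead of refreshing everything again. A
minimal sketch of the pattern, with apply_flags and persist_checkpoint as
illustrative stand-ins for the session/commit machinery:

def process_changed_flags(changed_flags, batch_size, apply_flags,
                          persist_checkpoint):
    # changed_flags maps uid -> a flags object carrying a .modseq attribute.
    items = sorted(changed_flags.items(), key=lambda item: item[1].modseq)
    for start in range(0, len(items), batch_size):
        batch = items[start:start + batch_size]
        apply_flags(dict(batch))
        if len(batch) == batch_size:
            # Safe to checkpoint because commits go in ascending modseq order.
            persist_checkpoint(max(flags.modseq for _, flags in batch))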
Example #13
class FolderSyncEngine(Greenlet):
    """Base class for a per-folder IMAP sync engine."""
    def __init__(self, account_id, folder_name, folder_id, email_address,
                 provider_name, poll_frequency, syncmanager_lock,
                 refresh_flags_max, retry_fail_classes):
        bind_context(self, 'foldersyncengine', account_id, folder_id)
        self.account_id = account_id
        self.folder_name = folder_name
        self.folder_id = folder_id
        self.poll_frequency = poll_frequency
        self.syncmanager_lock = syncmanager_lock
        self.refresh_flags_max = refresh_flags_max
        self.retry_fail_classes = retry_fail_classes
        self.state = None
        self.provider_name = provider_name

        # Metric flags for sync performance
        self.is_initial_sync = False
        self.is_first_sync = False
        self.is_first_message = False

        with mailsync_session_scope() as db_session:
            account = db_session.query(Account).get(self.account_id)
            self.throttled = account.throttled
            self.namespace_id = account.namespace.id
            assert self.namespace_id is not None, "namespace_id is None"

            folder = db_session.query(Folder).get(self.folder_id)
            if folder:
                self.is_initial_sync = folder.initial_sync_end is None
                self.is_first_sync = folder.initial_sync_start is None
                self.is_first_message = self.is_first_sync

        self.state_handlers = {
            'initial': self.initial_sync,
            'initial uidinvalid': self.resync_uids,
            'poll': self.poll,
            'poll uidinvalid': self.resync_uids,
            'finish': lambda self: 'finish',
        }

        Greenlet.__init__(self)

        self.heartbeat_status = HeartbeatStatusProxy(self.account_id,
                                                     self.folder_id,
                                                     self.folder_name,
                                                     email_address,
                                                     self.provider_name)

    def _run(self):
        # Bind greenlet-local logging context.
        self.log = log.new(account_id=self.account_id, folder=self.folder_name)
        # eagerly signal the sync status
        self.heartbeat_status.publish()
        return retry_and_report_killed(self._run_impl,
                                       account_id=self.account_id,
                                       folder_name=self.folder_name,
                                       logger=log,
                                       fail_classes=self.retry_fail_classes)

    def _run_impl(self):
        # We defer initializing the pool to here so that we'll retry if there
        # are any errors (remote server 503s or similar) when initializing it.
        self.conn_pool = connection_pool(self.account_id)
        try:
            saved_folder_status = self._load_state()
        except IntegrityError:
            # The state insert failed because the folder ID ForeignKey
            # was no longer valid, i.e. the folder for this engine was deleted
            # while we were starting up.
            # Exit the sync and let the monitor sort things out.
            log.info("Folder state loading failed due to IntegrityError",
                     folder_id=self.folder_id,
                     account_id=self.account_id)
            raise MailsyncDone()

        # NOTE: The parent ImapSyncMonitor handler could kill us at any
        # time if it receives a shutdown command. The shutdown command is
        # equivalent to ctrl-c.
        while True:
            old_state = self.state
            try:
                self.state = self.state_handlers[old_state]()
                self.heartbeat_status.publish(state=self.state)
            except UidInvalid:
                self.state = self.state + ' uidinvalid'
                self.heartbeat_status.publish(state=self.state)
            except FolderMissingError:
                # Folder was deleted by monitor while its sync was running.
                # TODO: Monitor should handle shutting down the folder engine.
                log.info('Folder disappeared. Stopping sync.',
                         account_id=self.account_id,
                         folder_name=self.folder_name,
                         folder_id=self.folder_id)
                raise MailsyncDone()
            except ValidationError as exc:
                log.error('Error authenticating; stopping sync',
                          exc_info=True,
                          account_id=self.account_id,
                          folder_id=self.folder_id,
                          logstash_tag='mark_invalid')
                with mailsync_session_scope() as db_session:
                    account = db_session.query(Account).get(self.account_id)
                    account.mark_invalid()
                    account.update_sync_error(str(exc))
                raise MailsyncDone()

            # State handlers are idempotent, so it's okay if we're
            # killed between the end of the handler and the commit.
            if self.state != old_state:
                # Don't need to re-query, will auto refresh on re-associate.
                with mailsync_session_scope() as db_session:
                    db_session.add(saved_folder_status)
                    saved_folder_status.state = self.state
                    db_session.commit()
            if self.state == 'finish':
                return

    def _load_state(self):
        with mailsync_session_scope() as db_session:
            try:
                state = ImapFolderSyncStatus.state
                saved_folder_status = db_session.query(ImapFolderSyncStatus)\
                    .filter_by(account_id=self.account_id,
                               folder_id=self.folder_id).options(
                        load_only(state)).one()
            except NoResultFound:
                saved_folder_status = ImapFolderSyncStatus(
                    account_id=self.account_id, folder_id=self.folder_id)
                db_session.add(saved_folder_status)

            saved_folder_status.start_sync()
            db_session.commit()
            self.state = saved_folder_status.state
            return saved_folder_status

    def set_stopped(self, db_session):
        saved_folder_status = db_session.query(ImapFolderSyncStatus)\
            .filter_by(account_id=self.account_id,
                       folder_id=self.folder_id).one()

        saved_folder_status.stop_sync()
        self.state = saved_folder_status.state

    def _report_initial_sync_start(self):
        with mailsync_session_scope() as db_session:
            q = db_session.query(Folder).get(self.folder_id)
            q.initial_sync_start = datetime.utcnow()

    def _report_initial_sync_end(self):
        with mailsync_session_scope() as db_session:
            q = db_session.query(Folder).get(self.folder_id)
            q.initial_sync_end = datetime.utcnow()

    @retry_crispin
    def initial_sync(self):
        log.bind(state='initial')
        log.info('starting initial sync')

        if self.is_first_sync:
            self._report_initial_sync_start()
            self.is_first_sync = False

        with self.conn_pool.get() as crispin_client:
            crispin_client.select_folder(self.folder_name, uidvalidity_cb)
            self.initial_sync_impl(crispin_client)

        if self.is_initial_sync:
            self._report_initial_sync_end()
            self.is_initial_sync = False

        return 'poll'

    @retry_crispin
    def poll(self):
        log.bind(state='poll')
        log.info('polling')
        self.poll_impl()
        return 'poll'

    @retry_crispin
    def resync_uids(self):
        log.bind(state=self.state)
        log.info('UIDVALIDITY changed')
        self.resync_uids_impl()
        return 'initial'

    def initial_sync_impl(self, crispin_client):
        # We wrap the block in a try/finally because the change_poller greenlet
        # needs to be killed when this greenlet is interrupted
        change_poller = None
        try:
            assert crispin_client.selected_folder_name == self.folder_name
            remote_uids = crispin_client.all_uids()
            with self.syncmanager_lock:
                with mailsync_session_scope() as db_session:
                    local_uids = common.all_uids(self.account_id, db_session,
                                                 self.folder_id)
                    self.remove_deleted_uids(db_session, local_uids,
                                             remote_uids)

            new_uids = set(remote_uids) - local_uids
            download_stack = UIDStack()
            for uid in sorted(new_uids):
                download_stack.put(uid, GenericUIDMetadata(self.throttled))

            with mailsync_session_scope() as db_session:
                self.update_uid_counts(
                    db_session,
                    remote_uid_count=len(remote_uids),
                    # This is the initial size of our download_queue
                    download_uid_count=len(new_uids))

            change_poller = spawn(self.poll_for_changes, download_stack)
            bind_context(change_poller, 'changepoller', self.account_id,
                         self.folder_id)
            self.download_uids(crispin_client, download_stack)

        finally:
            if change_poller is not None:
                # schedule change_poller to die
                kill(change_poller)

    def poll_impl(self):
        with self.conn_pool.get() as crispin_client:
            crispin_client.select_folder(self.folder_name, uidvalidity_cb)
            download_stack = UIDStack()
            self.check_uid_changes(crispin_client,
                                   download_stack,
                                   async_download=False)
        sleep(self.poll_frequency)

    def resync_uids_impl(self):
        # NOTE: First, check whether the UIDVALIDITY change was spurious; if
        # it was, just discard it and go on. If it wasn't, drop the relevant
        # entries (filtering by account and folder IDs) from the imapuid
        # table, download messages if necessary - in case a message has
        # changed UID - update UIDs, and discard orphaned messages. -siro
        with mailsync_session_scope() as db_session:
            account = db_session.query(Account).get(self.account_id)
            folder_info = db_session.query(ImapFolderInfo). \
                filter_by(account_id=self.account_id,
                          folder_id=self.folder_id).one()
            cached_uidvalidity = folder_info.uidvalidity
            with self.conn_pool.get() as crispin_client:
                crispin_client.select_folder(self.folder_name,
                                             lambda *args: True)
                uidvalidity = crispin_client.selected_uidvalidity
                if uidvalidity <= cached_uidvalidity:
                    log.debug('UIDVALIDITY unchanged')
                    return
                invalid_uids = db_session.query(ImapUid). \
                    filter_by(account_id=self.account_id,
                              folder_id=self.folder_id)
                data_sha256_message = {
                    uid.message.data_sha256: uid.message
                    for uid in invalid_uids
                }
                for uid in invalid_uids:
                    db_session.delete(uid)
                # NOTE: this is necessary (and OK since it doesn't persist any
                # data) to maintain the order between UID deletion and
                # insertion. Without this, I was seeing constraint violations
                # on the imapuid table. -siro
                db_session.flush()
                remote_uids = crispin_client.all_uids()
                for remote_uid in remote_uids:
                    raw_message = crispin_client.uids([remote_uid])[0]
                    data_sha256 = sha256(raw_message.body).hexdigest()
                    if data_sha256 in data_sha256_message:
                        message = data_sha256_message[data_sha256]

                        # Create a new imapuid
                        uid = ImapUid(msg_uid=raw_message.uid,
                                      message=message,
                                      account_id=self.account_id,
                                      folder_id=self.folder_id)
                        uid.update_flags(raw_message.flags)
                        db_session.add(uid)

                        # Update the existing message's metadata too
                        common.update_message_metadata(db_session, account,
                                                       message, uid.is_draft)

                        del data_sha256_message[data_sha256]
                    else:
                        self.download_and_commit_uids(crispin_client,
                                                      [remote_uid])
                    self.heartbeat_status.publish()
                    # FIXME: do we want to throttle the account when recovering
                    # from UIDVALIDITY changes? -siro
            for message in data_sha256_message.itervalues():
                db_session.delete(message)
            folder_info.uidvalidity = uidvalidity
            folder_info.highestmodseq = None

    @retry_crispin
    def poll_for_changes(self, download_stack):
        while True:
            with self.conn_pool.get() as crispin_client:
                crispin_client.select_folder(self.folder_name, uidvalidity_cb)
                self.check_uid_changes(crispin_client,
                                       download_stack,
                                       async_download=True)
            sleep(self.poll_frequency)

    def download_uids(self, crispin_client, download_stack):
        while not download_stack.empty():
            uid, metadata = download_stack.peekitem()
            self.download_and_commit_uids(crispin_client, [uid])
            download_stack.pop(uid)
            report_progress(self.account_id, self.folder_name, 1,
                            len(download_stack))
            self.heartbeat_status.publish()
            if self.throttled and metadata is not None and metadata.throttled:
                # Check to see if the account's throttled state has been
                # modified. If so, immediately accelerate.
                with mailsync_session_scope() as db_session:
                    acc = db_session.query(Account).get(self.account_id)
                    self.throttled = acc.throttled
                if self.throttled:
                    log.debug('throttled; sleeping')
                    sleep(THROTTLE_WAIT)

    def create_message(self, db_session, acct, folder, msg):
        assert acct is not None and acct.namespace is not None

        # Check if we somehow already saved the imapuid (shouldn't happen, but
        # possible due to a race condition). If so, don't commit changes.
        existing_imapuid = db_session.query(ImapUid).filter(
            ImapUid.account_id == acct.id, ImapUid.folder_id == folder.id,
            ImapUid.msg_uid == msg.uid).first()
        if existing_imapuid is not None:
            log.error('Expected to create imapuid, but existing row found',
                      remote_msg_uid=msg.uid,
                      existing_imapuid=existing_imapuid.id)
            return None

        new_uid = common.create_imap_message(db_session, log, acct, folder,
                                             msg)
        new_uid = self.add_message_attrs(db_session, new_uid, msg)

        # We're calling import_attached_events here instead of some more
        # obvious place (like Message.create_from_synced) because the function
        # does db lookups and therefore requires new_uid.message to have
        # already been flushed.
        if new_uid.message.has_attached_events:
            with db_session.no_autoflush:
                import_attached_events(db_session, acct, new_uid.message)

        # If we're in the polling state, report the latency between when the
        # message was received by the provider and when we created it.
        if self.state == 'poll':
            latency_millis = (
                datetime.utcnow() - new_uid.message.received_date) \
                .total_seconds() * 1000
            metrics = [
                '.'.join(['accounts', 'overall', 'message_latency']),
                '.'.join(['accounts',
                          str(acct.id), 'message_latency']),
                '.'.join(['providers', self.provider_name, 'message_latency']),
            ]
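            # Illustrative values only: for account id 42 on Gmail these
            # resolve to 'accounts.overall.message_latency',
            # 'accounts.42.message_latency' and
            # 'providers.gmail.message_latency'.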
            for metric in metrics:
                statsd_client.timing(metric, latency_millis)

        return new_uid

    def _count_thread_messages(self, thread_id, db_session):
        count, = db_session.query(func.count(Message.id)). \
            filter(Message.thread_id == thread_id).one()
        return count

    def add_message_attrs(self, db_session, new_uid, msg):
        """ Post-create-message bits."""
        with db_session.no_autoflush:
            parent_thread = fetch_corresponding_thread(db_session,
                                                       self.namespace_id,
                                                       new_uid.message)
            construct_new_thread = True

            if parent_thread:
                # If there's a parent thread that isn't too long already,
                # add to it. Otherwise create a new thread.
                parent_message_count = self._count_thread_messages(
                    parent_thread.id, db_session)
                if parent_message_count < MAX_THREAD_LENGTH:
                    construct_new_thread = False
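                # Illustratively, if MAX_THREAD_LENGTH were 500, reply
                # number 501 would start a fresh thread rather than extend
                # the parent.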

            if construct_new_thread:
                new_uid.message.thread = ImapThread.from_imap_message(
                    db_session, new_uid.account.namespace, new_uid.message)
            else:
                parent_thread.messages.append(new_uid.message)

        db_session.flush()
        return new_uid

    def remove_deleted_uids(self, db_session, local_uids, remote_uids):
        """
        Remove imapuid entries that no longer exist on the remote.

        Works as follows:
            1. Caller fetches the set of UIDs currently on the server
               (`remote_uids`).
            2. Compare to the message UIDs stored locally (`local_uids`).
            3. Purge UIDs we have locally but not on the server. Ignore
               remote UIDs that aren't saved locally.

        Make SURE to be holding `syncmanager_lock` when calling this function;
        we do not grab it here to allow callers to lock higher level
        functionality.

        """
        to_delete = set(local_uids) - set(remote_uids)
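        # Worked example: local_uids {1, 2, 3, 5} vs remote_uids {2, 3, 4}
        # gives to_delete == {1, 5}; the remote-only UID 4 is ignored here
        # and handled by the download path instead.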
        common.remove_deleted_uids(self.account_id, db_session, to_delete,
                                   self.folder_id)

    def download_and_commit_uids(self, crispin_client, uids):
        start = datetime.utcnow()
        raw_messages = crispin_client.uids(uids)
        if not raw_messages:
            return 0

        new_uids = set()
        with self.syncmanager_lock:
            # There is a possibility that another green thread has already
            # downloaded some message(s) from this batch; check within the
            # lock.
            with mailsync_session_scope() as db_session:
                account = db_session.query(Account).get(self.account_id)
                folder = db_session.query(Folder).get(self.folder_id)
                for msg in raw_messages:
                    uid = self.create_message(db_session, account, folder, msg)
                    if uid is not None:
                        db_session.add(uid)
                        db_session.flush()
                        new_uids.add(uid)
                db_session.commit()

        # If we downloaded uids, record message velocity (time per uid)
        if self.state == 'initial' and new_uids:
            self._report_message_velocity(datetime.utcnow() - start,
                                          len(new_uids))
        if self.is_first_message:
            self._report_first_message()
            self.is_first_message = False

        return len(new_uids)

    def _report_first_message(self):
        now = datetime.utcnow()

        with mailsync_session_scope() as db_session:
            account = db_session.query(Account).get(self.account_id)
            account_created = account.created_at

        latency = (now - account_created).total_seconds() * 1000
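        # Worked example: an account created 90 seconds before its first
        # synced message reports 90 * 1000 = 90000 ms here.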
        metrics = [
            '.'.join(['providers', self.provider_name, 'first_message']),
            '.'.join(['providers', 'overall', 'first_message'])
        ]

        for metric in metrics:
            statsd_client.timing(metric, latency)

    def _report_message_velocity(self, timedelta, num_uids):
        latency = timedelta.total_seconds() * 1000
        latency_per_uid = latency / num_uids
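        # Worked example: 200 UIDs downloaded in 4 s gives 4000 ms / 200
        # = 20.0 ms per UID reported to statsd.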
        metrics = [
            '.'.join(['providers', self.provider_name, 'message_velocity']),
            '.'.join(['providers', 'overall', 'message_velocity'])
        ]
        for metric in metrics:
            statsd_client.timing(metric, latency_per_uid)

    def update_metadata(self, crispin_client, updated):
        """ Update flags (the only metadata that can change). """
        # Use bigger chunks here because the data being fetched (just flags)
        # is very small.
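        # (Illustrative only: if CHUNK_SIZE were 100, each flags() call
        # below would cover 500 UIDs.)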
        for uids in chunk(updated, 5 * crispin_client.CHUNK_SIZE):
            new_flags = crispin_client.flags(uids)
            # Messages can disappear in the meantime; we'll update them next
            # sync.
            uids = [uid for uid in uids if uid in new_flags]
            with self.syncmanager_lock:
                with mailsync_session_scope() as db_session:
                    common.update_metadata(self.account_id, db_session,
                                           self.folder_name, self.folder_id,
                                           uids, new_flags)
                    db_session.commit()

    def update_uid_counts(self, db_session, **kwargs):
        saved_status = db_session.query(ImapFolderSyncStatus).join(Folder). \
            filter(ImapFolderSyncStatus.account_id == self.account_id,
                   Folder.name == self.folder_name).one()
        # We're not updating the current_remote_count metric
        # so don't update uid_checked_timestamp.
        if kwargs.get('remote_uid_count') is None:
            saved_status.update_metrics(kwargs)
        else:
            metrics = dict(uid_checked_timestamp=datetime.utcnow())
            metrics.update(kwargs)
            saved_status.update_metrics(metrics)

    def check_uid_changes(self, crispin_client, download_stack,
                          async_download):
        remote_uids = set(crispin_client.all_uids())
        with self.syncmanager_lock:
            with mailsync_session_scope() as db_session:
                local_uids = common.all_uids(self.account_id, db_session,
                                             self.folder_id)
                # Download new UIDs.
                stack_uids = set(download_stack.keys())
                local_with_pending_uids = local_uids | stack_uids
                for uid in sorted(remote_uids):
                    if uid not in local_with_pending_uids:
                        download_stack.put(uid, None)
                self.remove_deleted_uids(db_session, local_uids, remote_uids)
        if not async_download:
            self.download_uids(crispin_client, download_stack)
            with mailsync_session_scope() as db_session:
                self.update_uid_counts(db_session,
                                       remote_uid_count=len(remote_uids),
                                       download_uid_count=len(download_stack))
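        # The slice below keeps the refresh_flags_max highest (most recent)
        # UIDs present on both sides: e.g. with refresh_flags_max=100 and
        # 10000 shared UIDs, only the newest 100 have their flags re-checked
        # on each pass.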
        to_refresh = sorted(remote_uids & local_uids)[-self.refresh_flags_max:]
        self.update_metadata(crispin_client, to_refresh)