Exemple #1
0
 def command(self):
     """Write out the whole metadata index, then optimize the posting lists.

     The optimizer is called with force=True when 'harder' is among the
     command arguments.  Returns True when both steps finish and False if
     the user interrupts them from the keyboard.
     """
     try:
         self._idx().save(self.session)
         GlobalPostingList.Optimize(self.session, self._idx(),
                                    force=('harder' in self.args))
     except KeyboardInterrupt:
         self.session.ui.mark(_('Aborted'))
         return False
     else:
         return True
Exemple #2
0
    def _rescan_mailboxes(self, session, config):
        """Scan every configured mailbox for new mail and update the index.

        If `config.prefs.rescan_command` is set it is run first.  Each
        mailbox from `config.get_mailboxes()` is then handed to
        `idx.scan_mailbox()`; mailboxes that raise ValueError are reported
        and counted as having no new messages.

        On KeyboardInterrupt or a failing pre-command, returns a dict with
        the keys 'aborted', 'messages' and 'mailboxes'.
        NOTE(review): no return for the normal path is visible in this
        excerpt, and `rv` is assigned but never used here.
        """
        idx = self._idx()
        msg_count = 0
        mbox_count = 0
        rv = True
        try:
            pre_command = config.prefs.rescan_command
            if pre_command:
                session.ui.mark(_('Running: %s') % pre_command)
                # NOTE(review): the command string is executed through the
                # shell; it must only ever come from trusted configuration.
                subprocess.check_call(pre_command, shell=True)
            # The counter is offset by one while scanning and corrected again
            # after the loop.  NOTE(review): if the scan is aborted mid-loop
            # the dict returned below still includes this offset - confirm
            # that is intended.
            msg_count = 1
            for fid, fpath in config.get_mailboxes():
                if fpath == '/dev/null':
                    continue
                # Stop early when the application is shutting down.
                if mailpile.util.QUITTING:
                    break
                try:
                    count = idx.scan_mailbox(session, fid, fpath,
                                             config.open_mailbox)
                except ValueError:
                    session.ui.warning(_('Failed to rescan: %s') % fpath)
                    count = 0

                if count:
                    msg_count += count
                    mbox_count += 1
                config.clear_mbox_cache()
                session.ui.mark('\n')
            msg_count -= 1
            if msg_count:
                # Something changed: refresh the sort orders and optimize the
                # posting lists, unless we are quitting.
                if not mailpile.util.QUITTING:
                    idx.cache_sort_orders(session)
                if not mailpile.util.QUITTING:
                    GlobalPostingList.Optimize(session, idx, quick=True)
            else:
                session.ui.mark(_('Nothing changed'))
        except (KeyboardInterrupt, subprocess.CalledProcessError), e:
            session.ui.mark(_('Aborted: %s') % e)
            self._ignore_exception()
            return {
                'aborted': True,
                'messages': msg_count,
                'mailboxes': mbox_count
            }
Exemple #3
0
    def _rescan_mailboxes(self, session, config):
        """Scan every configured mailbox for new mail and update the index.

        Returns True immediately if `config._running` already contains a
        'rescan' entry.  Otherwise the optional `rescan_command` is run and
        each mailbox from `config.get_mailboxes()` is passed to
        `idx.scan_mailbox()`.

        Returns False on KeyboardInterrupt or a failing pre-command.
        NOTE(review): no return for the normal path is visible in this
        excerpt, and `rv` is assigned but never used here.
        """
        # FIXME: Need a lock here?
        if 'rescan' in config._running:
            return True
        # NOTE(review): nothing in the visible code removes this flag again;
        # confirm it is cleared elsewhere, or later rescans will be skipped.
        config._running['rescan'] = True

        idx = self._idx()
        msg_count = 0
        mbox_count = 0
        rv = True
        try:
            pre_command = config.prefs.rescan_command
            if pre_command:
                session.ui.mark('Running: %s' % pre_command)
                # NOTE(review): the command string is executed through the
                # shell; it must only ever come from trusted configuration.
                subprocess.check_call(pre_command, shell=True)
            # The counter is offset by one while scanning and corrected again
            # after the loop.
            msg_count = 1
            for fid, fpath in config.get_mailboxes():
                if fpath == '/dev/null':
                    continue
                # Stop early when the application is shutting down.
                if mailpile.util.QUITTING:
                    break
                count = idx.scan_mailbox(session, fid, fpath,
                                         config.open_mailbox)
                if count:
                    msg_count += count
                    mbox_count += 1
                config.clear_mbox_cache()
                session.ui.mark('\n')
            msg_count -= 1
            if msg_count:
                # Something changed: refresh sort orders, then optimize the
                # posting lists unless we are quitting.
                idx.cache_sort_orders(session)
                if not mailpile.util.QUITTING:
                    GlobalPostingList.Optimize(session, idx, quick=True)
            else:
                session.ui.mark('Nothing changed')
        except (KeyboardInterrupt, subprocess.CalledProcessError), e:
            session.ui.mark('Aborted: %s' % e)
            self._ignore_exception()
            return False
Exemple #4
0
class MailIndex:
    """This is a lazily parsing object representing a mailpile index."""

    MSG_MID = 0
    MSG_PTRS = 1
    MSG_ID = 2
    MSG_DATE = 3
    MSG_FROM = 4
    MSG_TO = 5
    MSG_SUBJECT = 6
    MSG_SNIPPET = 7
    MSG_TAGS = 8
    MSG_REPLIES = 9
    MSG_CONV_MID = 10

    def __init__(self, config):
        self.config = config
        self.STATS = {}
        self.INDEX = []
        self.INDEX_SORT = {}
        self.INDEX_CONV = []
        self.PTRS = {}
        self.MSGIDS = {}
        self.EMAILS = []
        self.EMAIL_IDS = {}
        self.CACHE = {}
        self.MODIFIED = set()
        self.EMAILS_SAVED = 0

    def l2m(self, line):
        """Split one UTF-8 encoded index line into its tab-separated fields."""
        decoded = line.decode('utf-8')
        return decoded.split(u'\t')

    # A translation table for message parts stored in the index, consists of
    # a mapping from unicode ordinals to either another unicode ordinal or
    # None, to remove a character. By default it removes the ASCII control
    # characters and replaces tabs and newlines with spaces.
    NORM_TABLE = dict(
        [(i, None) for i in range(0, 0x20)], **{
            ord(u'\t'): ord(u' '),
            ord(u'\r'): ord(u' '),
            ord(u'\n'): ord(u' '),
            0x7F: None
        })

    def m2l(self, message):
        # Normalize the message before saving it so we can be sure that we will
        # be able to read it back later.
        parts = [unicode(p).translate(self.NORM_TABLE) for p in message]
        return (u'\t'.join(parts)).encode('utf-8')

    def load(self, session=None):
        """Reset the in-memory index and reload it from the index file.

        Lines starting with '#' are comments, lines starting with '@' fill
        the e-mail address table, and all other non-empty lines are message
        metadata.  Sections of the file starting with GPG_BEGIN_MESSAGE are
        passed through decrypt_gpg() before being processed.  Lines which
        fail to parse with a ValueError are silently skipped.

        If `session` is given, progress and a warning about an unreadable
        index file are reported through `session.ui`.
        """
        self.INDEX = []
        self.CACHE = {}
        self.PTRS = {}
        self.MSGIDS = {}
        self.EMAILS = []
        self.EMAIL_IDS = {}
        CachedSearchResultSet.DropCaches()

        def process_line(line):
            try:
                line = line.strip()
                if line.startswith('#'):
                    pass
                elif line.startswith('@'):
                    pos, quoted = line[1:].split('\t', 1)
                    pos = int(pos, 36)
                    while len(self.EMAILS) < pos + 1:
                        self.EMAILS.append('')
                    address = unquote(quoted)
                    self.EMAILS[pos] = address
                    self.EMAIL_IDS[address.lower()] = pos
                elif line:
                    words = line.split('\t')
                    # FIXME: Delete this old crap.
                    if len(words) == 10:
                        # This is an old index file, reorder it!
                        pos, p, unused, msgid, d, f, s, t, r, c = words
                        ptrs = ','.join(['0' + ptr for ptr in p.split(',')])
                        line = '\t'.join(
                            [pos, ptrs, msgid, d, f, '', s, '', t, r, c])
                    else:
                        pos, ptrs, msgid = words[:3]
                    pos = int(pos, 36)
                    # Pad the index so the line lands at its own position.
                    while len(self.INDEX) < pos + 1:
                        self.INDEX.append('')
                    self.INDEX[pos] = line
                    self.MSGIDS[msgid] = pos
                    for msg_ptr in ptrs.split(','):
                        self.PTRS[msg_ptr] = pos
            except ValueError:
                # Deliberate best effort: malformed lines are skipped.
                pass

        if session:
            session.ui.mark('Loading metadata index...')
        try:
            fd = open(self.config.mailindex_file(), 'r')
            try:
                for line in fd:
                    if line.startswith(GPG_BEGIN_MESSAGE):
                        for line in decrypt_gpg([line], fd):
                            process_line(line)
                    else:
                        process_line(line)
            finally:
                # Always release the file handle, even if reading,
                # decrypting or processing a line fails.
                fd.close()
        except IOError:
            if session:
                session.ui.warning(('Metadata index not found: %s') %
                                   self.config.mailindex_file())
        self.cache_sort_orders(session)
        if session:
            session.ui.mark('Loaded metadata, %d messages' % len(self.INDEX))
        self.EMAILS_SAVED = len(self.EMAILS)

    def save_changes(self, session=None):
        """Append unsaved addresses and modified entries to the index file.

        Does nothing when no index positions are marked as modified and no
        e-mail addresses were added since the last save.  The set of
        modified positions is reset as soon as this method is called.
        """
        mods, self.MODIFIED = self.MODIFIED, set()
        if not mods and len(self.EMAILS) <= self.EMAILS_SAVED:
            return

        if session:
            session.ui.mark("Saving metadata index changes...")
        out = gpg_open(self.config.mailindex_file(),
                       self.config.prefs.gpg_recipient, 'a')
        # New addresses first, then the changed metadata lines.
        for eid in range(self.EMAILS_SAVED, len(self.EMAILS)):
            address_line = '@%s\t%s\n' % (b36(eid), quote(self.EMAILS[eid]))
            out.write(address_line)
        for pos in mods:
            out.write(self.INDEX[pos] + '\n')
        out.close()
        flush_append_cache()
        if session:
            session.ui.mark("Saved metadata index changes")
        self.EMAILS_SAVED = len(self.EMAILS)

    def save(self, session=None):
        """Rewrite the whole index file from the in-memory state.

        Writes a two-line comment header, every e-mail address and then
        every metadata line, and clears the set of modified positions.
        """
        self.MODIFIED = set()
        if session:
            session.ui.mark("Saving metadata index...")

        out = gpg_open(self.config.mailindex_file(),
                       self.config.prefs.gpg_recipient, 'w')
        out.write('# This is the mailpile.py index file.\n')
        out.write('# We have %d messages!\n' % len(self.INDEX))
        for eid in range(0, len(self.EMAILS)):
            address_line = '@%s\t%s\n' % (b36(eid), quote(self.EMAILS[eid]))
            out.write(address_line)
        for entry in self.INDEX:
            out.write(entry + '\n')
        out.close()
        flush_append_cache()

        if session:
            session.ui.mark("Saved metadata index")

    def update_ptrs_and_msgids(self, session):
        session.ui.mark('Updating high level indexes')
        for offset in range(0, len(self.INDEX)):
            message = self.l2m(self.INDEX[offset])
            if len(message) > self.MSG_CONV_MID:
                self.MSGIDS[message[self.MSG_ID]] = offset
                for msg_ptr in message[self.MSG_PTRS].split(','):
                    self.PTRS[msg_ptr] = offset
            else:
                session.ui.warning('Bogus line: %s' % line)

    def try_decode(self, text, charset):
        """Decode `text`, trying `charset`, then iso-8859-1, then utf-8.

        Empty/None charsets are skipped.  If every attempt fails, the text
        is returned with all characters of ordinal 128 or above removed.
        """
        for candidate in (charset, 'iso-8859-1', 'utf-8'):
            if not candidate:
                continue
            try:
                return text.decode(candidate)
            except (UnicodeEncodeError, UnicodeDecodeError, LookupError):
                # Unknown codec or undecodable data: try the next one.
                continue
        return "".join(ch for ch in text if ord(ch) < 128)

    def hdr(self, msg, name, value=None):
        """Return header `name` of `msg` decoded into a single line of text.

        If `value` is given it is decoded instead of looking the header up
        in `msg`.  Double quotes are removed, and carriage returns, tabs and
        newlines in the result are replaced by spaces.  Returns '' if the
        header cannot be parsed.
        """
        try:
            if value is None and msg:
                # Security: RFC822 headers are not allowed to have (unencoded)
                # non-ascii characters in them, so we just strip them all out
                # before parsing.
                # FIXME: This is "safe", but can we be smarter/gentler?
                value = CleanText(msg[name], replace='_').clean
            # Note: decode_header does the wrong thing with "quoted" data.
            unquoted = (value or '').replace('"', '')
            chunks = email.header.decode_header(unquoted)
            text = ' '.join(
                [self.try_decode(raw, charset) for raw, charset in chunks])
            for whitespace in ('\r', '\t', '\n'):
                text = text.replace(whitespace, ' ')
            return text
        except email.errors.HeaderParseError:
            return ''

    def update_location(self, session, msg_idx_pos, msg_ptr):
        """Record that the message at `msg_idx_pos` is stored at `msg_ptr`.

        A pointer already recorded with the same first MBX_ID_LEN characters
        is replaced; otherwise `msg_ptr` is appended to the message's list
        of pointers.  The PTRS lookup is updated as well.
        """
        msg_info = self.get_msg_at_idx_pos(msg_idx_pos)
        known_ptrs = msg_info[self.MSG_PTRS].split(',')
        self.PTRS[msg_ptr] = msg_idx_pos

        mailbox_id = msg_ptr[:MBX_ID_LEN]
        for slot, known in enumerate(known_ptrs):
            if known[:MBX_ID_LEN] == mailbox_id:
                # If message was seen in this mailbox before, update the
                # location in place.
                known_ptrs[slot] = msg_ptr
                break
        else:
            # Otherwise, this is a new mailbox, record this sighting as well!
            if msg_ptr:
                known_ptrs.append(msg_ptr)

        msg_info[self.MSG_PTRS] = ','.join(known_ptrs)
        self.set_msg_at_idx_pos(msg_idx_pos, msg_info)

    def _parse_date(self, date_hdr):
        """Parse a Date: or Received: header into a unix timestamp."""
        try:
            if ';' in date_hdr:
                date_hdr = date_hdr.split(';')[-1].strip()
            msg_ts = long(rfc822.mktime_tz(rfc822.parsedate_tz(date_hdr)))
            if (msg_ts > (time.time() + 24 * 3600)) or (msg_ts < 1):
                return None
            else:
                return msg_ts
        except (ValueError, TypeError, OverflowError):
            return None

    def _extract_date_ts(self, session, msg_mid, msg_id, msg, last_date):
        """Extract a date, sanity checking against the Received: headers.

        The Date: header and all Received: headers are parsed.  The Date:
        timestamp is returned if it lies within 31 days of the median of all
        parseable timestamps; otherwise the median is returned and a warning
        is issued.  If nothing can be parsed, `last_date + 1` is returned.
        """
        hdrs = [self.hdr(msg, 'date')] + (msg.get_all('received') or [])
        dates = [self._parse_date(date_hdr) for date_hdr in hdrs]
        msg_ts = dates[0]
        nz_dates = sorted([d for d in dates if d])

        if not nz_dates:
            # If the above fails, we assume the messages in the mailbox are in
            # chronological order and just add 1 second to the date of the last
            # message if date parsing fails for some reason.
            session.ui.warning('=%s/%s has a bogus date' % (msg_mid, msg_id))
            return last_date + 1

        # Explicit floor division: the result is used as a list index.
        median = nz_dates[len(nz_dates) // 2]
        if msg_ts and abs(msg_ts - median) < 31 * 24 * 3600:
            return msg_ts

        session.ui.warning(
            ('=%s/%s using Received: instead of Date:') %
            (msg_mid, msg_id))
        return median

    def scan_mailbox(self, session, mailbox_idx, mailbox_fn, mailbox_opener):
        """Index the not-yet-parsed messages of one mailbox.

        The mailbox is opened with `mailbox_opener(session, mailbox_idx)`.
        Mailboxes flagged as editable, and mailboxes which fail to open, are
        skipped.  Messages whose pointer is already known are passed over;
        messages whose id is already known only get their location updated;
        all others are parsed, indexed and added to the metadata index.

        Returns the number of messages added or relocated (0 if skipped).
        """
        try:
            mbox = mailbox_opener(session, mailbox_idx)
            if mbox.editable:
                session.ui.mark('%s: Skipped: %s' % (mailbox_idx, mailbox_fn))
                return 0
            session.ui.mark('%s: Checking: %s' % (mailbox_idx, mailbox_fn))
        except (IOError, OSError, NoSuchMailboxError) as e:
            session.ui.mark(
                ('%s: Error opening: %s (%s)') % (mailbox_idx, mailbox_fn, e))
            return 0

        unparsed = mbox.unparsed()
        if not unparsed:
            return 0

        # Make sure the lookup tables are populated before consulting them.
        if not self.PTRS:
            self.update_ptrs_and_msgids(session)

        snippet_max = session.config.sys.snippet_max
        added = 0
        msg_ts = int(time.time())
        for ui in range(len(unparsed)):
            if mailpile.util.QUITTING:
                break

            i = unparsed[ui]
            parse_status = ('%s: Reading your mail: %d%% (%d/%d messages)') % (
                mailbox_idx, 100 * ui / len(unparsed), ui, len(unparsed))

            msg_ptr = mbox.get_msg_ptr(mailbox_idx, i)
            already_indexed = msg_ptr in self.PTRS
            # Known messages only refresh the progress line now and then.
            if not already_indexed or (ui % 317) == 0:
                session.ui.mark(parse_status)
            if already_indexed:
                continue

            # Message new or modified, let's parse it.
            msg = ParseMessage(mbox.get_file(i), pgpmime=False)
            raw_id = (self.hdr(msg, 'message-id') or msg_ptr).strip()
            msg_id = b64c(sha1b64(raw_id))
            if msg_id in self.MSGIDS:
                # Same message seen before: just record where it lives now.
                self.update_location(session, self.MSGIDS[msg_id], msg_ptr)
                added += 1
                continue

            # Add new message!
            msg_mid = b36(len(self.INDEX))

            msg_ts = self._extract_date_ts(session, msg_mid, msg_id, msg,
                                           msg_ts)

            keywords, snippet = self.index_message(
                session,
                msg_mid,
                msg_id,
                msg,
                msg_ts,
                mailbox=mailbox_idx,
                compact=False,
                filter_hooks=[self.filter_keywords])

            msg_subject = self.hdr(msg, 'subject')
            snippet_room = max(0, snippet_max - len(msg_subject))
            msg_snippet = snippet[:snippet_room]

            tags = [
                k.split(':')[0] for k in keywords if k.endswith(':tag')
            ]

            msg_to = (ExtractEmails(self.hdr(msg, 'to')) +
                      ExtractEmails(self.hdr(msg, 'cc')) +
                      ExtractEmails(self.hdr(msg, 'bcc')))

            msg_idx_pos, msg_info = self.add_new_msg(
                msg_ptr, msg_id, msg_ts, self.hdr(msg, 'from'), msg_to,
                msg_subject, msg_snippet, tags)
            self.set_conversation_ids(msg_info[self.MSG_MID], msg)
            mbox.mark_parsed(i)

            added += 1
            if (added % 1000) == 0:
                GlobalPostingList.Optimize(session, self, quick=True)

        if added:
            mbox.save(session)
        session.ui.mark('%s: Indexed mailbox: %s' % (mailbox_idx, mailbox_fn))
        return added
Exemple #5
0
class MailIndex:
    """This is a lazily parsing object representing a mailpile index."""

    MSG_MID = 0
    MSG_PTRS = 1
    MSG_ID = 2
    MSG_DATE = 3
    MSG_FROM = 4
    MSG_TO = 5
    MSG_CC = 6
    MSG_KB = 7
    MSG_SUBJECT = 8
    MSG_BODY = 9
    MSG_TAGS = 10
    MSG_REPLIES = 11
    MSG_THREAD_MID = 12

    MSG_FIELDS_V1 = 11
    MSG_FIELDS_V2 = 13

    BOGUS_METADATA = [
        None, '', None, '0', '(no sender)', '', '', '0', '(not in index)', '',
        '', '', '-1'
    ]

    MAX_INCREMENTAL_SAVES = 25

    def __init__(self, config):
        self.config = config
        self.INDEX = []
        self.INDEX_SORT = {}
        self.INDEX_THR = []
        self.PTRS = {}
        self.TAGS = {}
        self.MSGIDS = {}
        self.EMAILS = []
        self.EMAIL_IDS = {}
        self.CACHE = {}
        self.MODIFIED = set()
        self.EMAILS_SAVED = 0
        self._saved_changes = 0
        self._lock = threading.Lock()

    def l2m(self, line):
        """Split one UTF-8 encoded index line into its tab-separated fields."""
        decoded = line.decode('utf-8')
        return decoded.split(u'\t')

    # A translation table for message parts stored in the index, consists of
    # a mapping from unicode ordinals to either another unicode ordinal or
    # None, to remove a character. By default it removes the ASCII control
    # characters and replaces tabs and newlines with spaces.
    NORM_TABLE = dict(
        [(i, None) for i in range(0, 0x20)], **{
            ord(u'\t'): ord(u' '),
            ord(u'\r'): ord(u' '),
            ord(u'\n'): ord(u' '),
            0x7F: None
        })

    def m2l(self, message):
        # Normalize the message before saving it so we can be sure that we will
        # be able to read it back later.
        parts = [unicode(p).translate(self.NORM_TABLE) for p in message]
        return (u'\t'.join(parts)).encode('utf-8')

    def load(self, session=None):
        """Reset the in-memory index and reload it from the index file.

        Lines starting with '#' are comments, lines starting with '@' fill
        the e-mail address table, and all other non-empty lines are message
        metadata.  Metadata with MSG_FIELDS_V1 fields is upgraded to the
        MSG_FIELDS_V2 layout on the fly.  Sections of the file starting with
        GPG_BEGIN_MESSAGE are passed through decrypt_gpg() first.  Lines
        which fail to parse with a ValueError are silently skipped.

        The index lock is held while the file is read.

        Raises:
            Exception: if a metadata line has an unsupported field count.
        """
        self.INDEX = []
        self.CACHE = {}
        self.PTRS = {}
        self.MSGIDS = {}
        self.EMAILS = []
        self.EMAIL_IDS = {}
        CachedSearchResultSet.DropCaches()

        def process_line(line):
            try:
                line = line.strip()
                if line.startswith('#'):
                    pass
                elif line.startswith('@'):
                    pos, quoted = line[1:].split('\t', 1)
                    pos = int(pos, 36)
                    while len(self.EMAILS) < pos + 1:
                        self.EMAILS.append('')
                    unquoted_email = unquote(quoted).decode('utf-8')
                    self.EMAILS[pos] = unquoted_email
                    self.EMAIL_IDS[unquoted_email.split()[0].lower()] = pos
                elif line:
                    words = line.split('\t')

                    # Migration: converting old metadata into new!
                    if len(words) != self.MSG_FIELDS_V2:

                        # V1 -> V2 adds MSG_CC and MSG_KB
                        if len(words) == self.MSG_FIELDS_V1:
                            words[self.MSG_CC:self.MSG_CC] = ['']
                            words[self.MSG_KB:self.MSG_KB] = ['0']

                        # Add V2 -> V3 here, etc. etc.

                        if len(words) == self.MSG_FIELDS_V2:
                            line = '\t'.join(words)
                        else:
                            raise Exception(
                                _('Your metadata index is either '
                                  'too old, too new or corrupt!'))

                    pos = int(words[self.MSG_MID], 36)
                    # Pad the index so the line lands at its own position.
                    while len(self.INDEX) < pos + 1:
                        self.INDEX.append('')

                    self.INDEX[pos] = line
                    self.MSGIDS[words[self.MSG_ID]] = pos
                    self.update_msg_tags(pos, words)
                    for msg_ptr in words[self.MSG_PTRS].split(','):
                        self.PTRS[msg_ptr] = pos

            except ValueError:
                # Deliberate best effort: malformed lines are skipped.
                pass

        if session:
            session.ui.mark(_('Loading metadata index...'))
        # Acquire before entering the try block, so that the finally clause
        # never releases a lock which was not actually taken.
        self._lock.acquire()
        try:
            fd = open(self.config.mailindex_file(), 'r')
            try:
                for line in fd:
                    if line.startswith(GPG_BEGIN_MESSAGE):
                        for line in decrypt_gpg([line], fd):
                            process_line(line)
                    else:
                        process_line(line)
            finally:
                # Always release the file handle, even when a line is
                # rejected with an exception.
                fd.close()
        except IOError:
            if session:
                session.ui.warning(
                    _('Metadata index not found: %s') %
                    self.config.mailindex_file())
        finally:
            self._lock.release()

        self.cache_sort_orders(session)
        if session:
            session.ui.mark(
                _('Loaded metadata, %d messages') % len(self.INDEX))
        self.EMAILS_SAVED = len(self.EMAILS)

    def update_msg_tags(self, msg_idx_pos, msg_info):
        tags = set([t for t in msg_info[self.MSG_TAGS].split(',') if t])
        for tid in (set(self.TAGS.keys()) - tags):
            self.TAGS[tid] -= set([msg_idx_pos])
        for tid in tags:
            if tid not in self.TAGS:
                self.TAGS[tid] = set()
            self.TAGS[tid].add(msg_idx_pos)

    def save_changes(self, session=None):
        """Append unsaved addresses and modified entries to the index file.

        Does nothing when no index positions are marked as modified and no
        e-mail addresses were added since the last save.  After
        MAX_INCREMENTAL_SAVES incremental saves a full save() is done
        instead.  The set of modified positions is reset as soon as this
        method is called.
        """
        mods, self.MODIFIED = self.MODIFIED, set()
        if not mods and len(self.EMAILS) <= self.EMAILS_SAVED:
            return

        if self._saved_changes >= self.MAX_INCREMENTAL_SAVES:
            return self.save(session=session)

        try:
            self._lock.acquire()
            if session:
                session.ui.mark(_("Saving metadata index changes..."))
            out = gpg_open(self.config.mailindex_file(),
                           self.config.prefs.gpg_recipient, 'a')
            # New addresses first, then the changed metadata lines.
            for eid in range(self.EMAILS_SAVED, len(self.EMAILS)):
                quoted_email = quote(self.EMAILS[eid].encode('utf-8'))
                out.write('@%s\t%s\n' % (b36(eid), quoted_email))
            for pos in mods:
                out.write(self.INDEX[pos] + '\n')
            out.close()
            flush_append_cache()
            if session:
                session.ui.mark(_("Saved metadata index changes"))
            self.EMAILS_SAVED = len(self.EMAILS)
            self._saved_changes += 1
        finally:
            self._lock.release()

    def save(self, session=None):
        """Rewrite the whole index file from the in-memory state.

        The data is written to '<index file>.new' first, the current index
        file is passed to backup_file(), and the new file is then renamed
        over it.  Resets the set of modified positions and the count of
        incremental saves.  The index lock is held throughout.
        """
        try:
            self._lock.acquire()
            self.MODIFIED = set()
            if session:
                session.ui.mark(_("Saving metadata index..."))

            live_path = self.config.mailindex_file()
            temp_path = '%s.new' % live_path

            out = gpg_open(temp_path, self.config.prefs.gpg_recipient, 'w')
            out.write('# This is the mailpile.py index file.\n')
            out.write('# We have %d messages!\n' % len(self.INDEX))
            for eid in range(0, len(self.EMAILS)):
                quoted_email = quote(self.EMAILS[eid].encode('utf-8'))
                out.write('@%s\t%s\n' % (b36(eid), quoted_email))
            for entry in self.INDEX:
                out.write(entry + '\n')
            out.close()

            # Keep the last 5 index files around... just in case.
            backup_file(live_path, backups=5, min_age_delta=10)
            os.rename(temp_path, live_path)

            flush_append_cache()
            self._saved_changes = 0
            if session:
                session.ui.mark(_("Saved metadata index"))
        finally:
            self._lock.release()

    def update_ptrs_and_msgids(self, session):
        """Rebuild the MSGIDS and PTRS lookups from the INDEX lines.

        Every index line with exactly MSG_FIELDS_V2 fields has its message
        id and each of its comma-separated pointers mapped to the line's
        position.  Other lines are reported as warnings through
        `session.ui` and otherwise ignored.
        """
        session.ui.mark(_('Updating high level indexes'))
        for offset, raw_line in enumerate(self.INDEX):
            message = self.l2m(raw_line)
            if len(message) == self.MSG_FIELDS_V2:
                self.MSGIDS[message[self.MSG_ID]] = offset
                for msg_ptr in message[self.MSG_PTRS].split(','):
                    self.PTRS[msg_ptr] = offset
            else:
                # Report the offending entry itself; this used to reference
                # an undefined name and crash with a NameError.
                session.ui.warning(_('Bogus line: %s') % raw_line)

    def try_decode(self, text, charset):
        """Decode `text`, trying `charset`, then iso-8859-1, then utf-8.

        Empty/None charsets are skipped.  If every attempt fails, the text
        is returned with all characters of ordinal 128 or above removed.
        """
        for candidate in (charset, 'iso-8859-1', 'utf-8'):
            if not candidate:
                continue
            try:
                return text.decode(candidate)
            except (UnicodeEncodeError, UnicodeDecodeError, LookupError):
                # Unknown codec or undecodable data: try the next one.
                continue
        return "".join(ch for ch in text if ord(ch) < 128)

    def hdr(self, msg, name, value=None):
        """Return header `name` of `msg` decoded into a single line of text.

        If `value` is given it is decoded instead of looking the header up
        in `msg`.  Double quotes are removed, and carriage returns, tabs and
        newlines in the result are replaced by spaces.  Returns '' if the
        header cannot be parsed.
        """
        try:
            if value is None and msg:
                # Security: RFC822 headers are not allowed to have (unencoded)
                # non-ascii characters in them, so we just strip them all out
                # before parsing.
                # FIXME: This is "safe", but can we be smarter/gentler?
                value = CleanText(msg[name], replace='_').clean
            # Note: decode_header does the wrong thing with "quoted" data.
            unquoted = (value or '').replace('"', '')
            chunks = email.header.decode_header(unquoted)
            text = ' '.join(
                [self.try_decode(raw, charset) for raw, charset in chunks])
            for whitespace in ('\r', '\t', '\n'):
                text = text.replace(whitespace, ' ')
            return text
        except email.errors.HeaderParseError:
            return ''

    def update_location(self, session, msg_idx_pos, msg_ptr):
        """Record that the message at `msg_idx_pos` is stored at `msg_ptr`.

        A pointer already recorded with the same first MBX_ID_LEN characters
        is replaced; otherwise `msg_ptr` is appended to the message's list
        of pointers.  The PTRS lookup is updated as well.
        """
        msg_info = self.get_msg_at_idx_pos(msg_idx_pos)
        known_ptrs = msg_info[self.MSG_PTRS].split(',')
        self.PTRS[msg_ptr] = msg_idx_pos

        mailbox_id = msg_ptr[:MBX_ID_LEN]
        for slot, known in enumerate(known_ptrs):
            if known[:MBX_ID_LEN] == mailbox_id:
                # If message was seen in this mailbox before, update the
                # location in place.
                known_ptrs[slot] = msg_ptr
                break
        else:
            # Otherwise, this is a new mailbox, record this sighting as well!
            if msg_ptr:
                known_ptrs.append(msg_ptr)

        msg_info[self.MSG_PTRS] = ','.join(known_ptrs)
        self.set_msg_at_idx_pos(msg_idx_pos, msg_info)

    def _parse_date(self, date_hdr):
        """Parse a Date: or Received: header into a unix timestamp."""
        try:
            if ';' in date_hdr:
                date_hdr = date_hdr.split(';')[-1].strip()
            msg_ts = long(rfc822.mktime_tz(rfc822.parsedate_tz(date_hdr)))
            if (msg_ts > (time.time() + 24 * 3600)) or (msg_ts < 1):
                return None
            else:
                return msg_ts
        except (ValueError, TypeError, OverflowError):
            return None

    def _extract_date_ts(self, session, msg_mid, msg_id, msg, last_date):
        """Extract a date, sanity checking against the Received: headers.

        The Date: header and all Received: headers are parsed.  The Date:
        timestamp is returned if it lies within 31 days of the median of all
        parseable timestamps; otherwise the median is returned and a warning
        is issued.  If nothing can be parsed, `last_date + 1` is returned.
        """
        hdrs = [self.hdr(msg, 'date')] + (msg.get_all('received') or [])
        dates = [self._parse_date(date_hdr) for date_hdr in hdrs]
        msg_ts = dates[0]
        nz_dates = sorted([d for d in dates if d])

        if not nz_dates:
            # If the above fails, we assume the messages in the mailbox are in
            # chronological order and just add 1 second to the date of the last
            # message if date parsing fails for some reason.
            session.ui.warning(
                _('=%s/%s has a bogus date') % (msg_mid, msg_id))
            return last_date + 1

        median = nz_dates[len(nz_dates) // 2]
        if msg_ts and abs(msg_ts - median) < 31 * 24 * 3600:
            return msg_ts

        session.ui.warning(
            _('=%s/%s using Received: instead of Date:') %
            (msg_mid, msg_id))
        return median

    def encode_msg_id(self, msg_id):
        """Return the index key for a raw message id.

        The id is stripped of surrounding whitespace, then passed through
        sha1b64() and b64c().
        """
        stripped = msg_id.strip()
        digest = sha1b64(stripped)
        return b64c(digest)

    def get_msg_id(self, msg, msg_ptr):
        """Return the encoded message id used to identify `msg`.

        Uses the Message-ID header when present, otherwise a pseudo id built
        from the Date, Subject, Received, From and To headers.  If that is
        empty as well, a warning is printed and `msg_ptr` is used instead.
        """
        raw_msg_id = self.hdr(msg, 'message-id')
        if not raw_msg_id:
            # Create a very long pseudo-msgid for messages without a
            # Message-ID. This was a very badly behaved mailer, so if
            # we create duplicates this way, we are probably only
            # losing spam. Even then the Received line should save us.
            fallback_fields = [
                self.hdr(msg, header_name)
                for header_name in ('date', 'subject', 'received', 'from',
                                    'to')
            ]
            raw_msg_id = '\t'.join(fallback_fields).strip()
        # Fall back to the msg_ptr if all else fails.
        if not raw_msg_id:
            print(_('WARNING: No proper Message-ID for %s') % msg_ptr)
        return self.encode_msg_id(raw_msg_id or msg_ptr)

    def scan_mailbox(self, session, mailbox_idx, mailbox_fn, mailbox_opener):
        """Index the not-yet-parsed messages of one mailbox.

        The mailbox is opened with `mailbox_opener(session, mailbox_idx)`.
        Mailboxes flagged as editable, and mailboxes which fail to open, are
        skipped.  Messages whose pointer is already known are passed over;
        unreadable messages are reported and skipped; messages whose id is
        already known only get their location updated; all others are
        parsed, indexed and added to the metadata index.

        Returns the number of messages added or relocated (0 if skipped).
        """
        try:
            mbox = mailbox_opener(session, mailbox_idx)
            if mbox.editable:
                session.ui.mark(
                    _('%s: Skipped: %s') % (mailbox_idx, mailbox_fn))
                return 0
            session.ui.mark(
                _('%s: Checking: %s') % (mailbox_idx, mailbox_fn))
        except (IOError, OSError, NoSuchMailboxError) as e:
            session.ui.mark(
                _('%s: Error opening: %s (%s)') % (mailbox_idx, mailbox_fn, e))
            return 0

        unparsed = mbox.unparsed()
        if not unparsed:
            return 0

        # Make sure the lookup tables are populated before consulting them.
        if not self.PTRS:
            self.update_ptrs_and_msgids(session)

        snippet_max = session.config.sys.snippet_max
        added = 0
        msg_ts = int(time.time())
        for ui in range(len(unparsed)):
            if mailpile.util.QUITTING:
                break

            i = unparsed[ui]
            parse_status = _('%s: Reading your mail: %d%% (%d/%d messages)'
                             ) % (mailbox_idx, 100 * ui / len(unparsed), ui,
                                  len(unparsed))

            msg_ptr = mbox.get_msg_ptr(mailbox_idx, i)
            already_indexed = msg_ptr in self.PTRS
            # Known messages only refresh the progress line now and then.
            if not already_indexed or (ui % 317) == 0:
                session.ui.mark(parse_status)
                play_nice_with_threads()
            if already_indexed:
                continue

            # Message new or modified, let's parse it.
            if 'rescan' in session.config.sys.debug:
                session.ui.debug('Reading message %s/%s' % (mailbox_idx, i))
            try:
                msg_fd = mbox.get_file(i)
                msg = ParseMessage(
                    msg_fd, pgpmime=session.config.prefs.index_encrypted)
            except (IOError, OSError, ValueError, IndexError, KeyError):
                if session.config.sys.debug:
                    traceback.print_exc()
                session.ui.warning(('Reading message %s/%s FAILED, skipping') %
                                   (mailbox_idx, i))
                continue

            # The file position after parsing is recorded as the size.
            msg_size = msg_fd.tell()
            msg_id = self.get_msg_id(msg, msg_ptr)
            if msg_id in self.MSGIDS:
                # Same message seen before: just record where it lives now.
                self.update_location(session, self.MSGIDS[msg_id], msg_ptr)
                added += 1
                continue

            # Add new message!
            msg_mid = b36(len(self.INDEX))

            msg_ts = self._extract_date_ts(session, msg_mid, msg_id, msg,
                                           msg_ts)

            play_nice_with_threads()
            keywords, snippet = self.index_message(
                session,
                msg_mid,
                msg_id,
                msg,
                msg_size,
                msg_ts,
                mailbox=mailbox_idx,
                compact=False,
                filter_hooks=plugins.filter_hooks([self.filter_keywords]))

            msg_subject = self.hdr(msg, 'subject')
            snippet_room = max(0, snippet_max - len(msg_subject))
            msg_snippet = snippet[:snippet_room]

            tags = [
                k.split(':')[0] for k in keywords
                if k.endswith((':in', ':tag'))
            ]

            msg_to = ExtractEmails(self.hdr(msg, 'to'))
            msg_cc = (ExtractEmails(self.hdr(msg, 'cc')) +
                      ExtractEmails(self.hdr(msg, 'bcc')))

            msg_idx_pos, msg_info = self.add_new_msg(
                msg_ptr, msg_id, msg_ts, self.hdr(msg, 'from'), msg_to,
                msg_cc, msg_size, msg_subject, msg_snippet, tags)
            self.set_conversation_ids(msg_info[self.MSG_MID], msg)
            mbox.mark_parsed(i)

            added += 1
            GlobalPostingList.Optimize(session,
                                       self,
                                       lazy=True,
                                       quick=True)

        if added:
            mbox.save(session)
        session.ui.mark(
            _('%s: Indexed mailbox: %s') % (mailbox_idx, mailbox_fn))
        return added