def matching_docids(repo, response, params):
    """Reply with the ids of all documents matching the ``query`` parameter.

    The reply is sent as text/plain and holds zero or more document ids
    joined by single spaces.  Cutoff, narrowing and widening are not
    supported; the expected use is a query on unique text that matches
    exactly one document.
    """
    # Trimmed-down variant of basicPlugins.py _repo_search().
    terms = params['query']
    no_cutoff = 0.0
    hits = PrestoCollection(repo, None, terms, None, None, no_cutoff)
    ids = [match.id for match in hits.docs()]
    response.reply(' '.join(ids), 'text/plain')
def test_algorithms (repo, *algs):
    """Score each tag-suggestion algorithm against the categories already assigned.

    A saved collection named "categorize algorithms test set" is used as the
    test set; it is created from a fixed uplibdate query if the repository
    does not have it yet.  The existing categories reported by
    repo.get_docids_with_categories() serve as ground truth.

    For every algorithm in ``algs`` (passed to find_likely_tags() as
    ``score_adjust``) and every categorized document in the test set, the
    first ten suggested tags are compared with the document's categories.

    Returns a list of ``(alg, docs)`` pairs, where ``docs`` maps a document
    id to ``(found, missed, page_count, text_length, title)``:
    ``found`` holds the matching suggestion entries, ``missed`` the
    categories that were not among the first ten suggestions.  Documents
    without categories are only counted, not scored.  A document with no
    page-count metadata is reported with a page count of 0.
    """

    from uplib.collection import PrestoCollection

    # find or create a test set
    testset = repo.get_collection("categorize algorithms test set")
    if not testset:
        testset = PrestoCollection(repo, None, query="uplibdate:[1/1/2005 TO 5/1/2008]")
        repo.add_collection("categorize algorithms test set", testset)

    # get "ground truth"
    id_to_categories_mapping = repo.get_docids_with_categories()

    # now run the tests
    algresults = []
    for alg in algs:
        docs = {}
        untagged = 0
        tagged = 0
        for doc in testset.docs():
            # list() gives a private *mutable* copy even if the mapping stores
            # tuples; categories are removed from it as they are matched
            remaining = list(id_to_categories_mapping.get(doc.id, ()))
            if not remaining:
                # ignore untagged docs
                untagged += 1
                continue
            tagged += 1
            found = []
            suggestions = find_likely_tags(doc, score_adjust=alg) or []
            # only the first 10 suggestions count as hits
            for suggestion in suggestions[:10]:
                if suggestion[0] in remaining:
                    found.append(suggestion)
                    remaining.remove(suggestion[0])
            text = doc.text()
            textlen = (text and len(text.strip())) or 0
            # fall back to 0 so a document lacking both metadata keys does
            # not abort the whole run with int(None)
            page_count = int(doc.get_metadata("page-count") or doc.get_metadata("pagecount") or 0)
            docs[doc.id] = (found, remaining, page_count,
                            textlen, doc.get_metadata("title") or "")
        note(3, "%s:  %d untagged docs, %d tagged docs", alg, untagged, tagged)
        algresults.append((alg, docs,))
    return algresults
def matching_ids_and_filenames(repo, response, params):
    """Reply with one line per document matching the ``query`` parameter.

    The query is decoded with the configured "interaction-charset"
    (default UTF-8, undecodable bytes replaced).  Each reply line has the
    form:  doc_id ' ' title, where the title is the first value returned
    by doc_title_and_type().  The reply is sent as text/plain.
    """
    global INTERACTION_CHARSET
    # Trimmed-down variant of basicPlugins.py _repo_search().
    raw_query = params['query']
    if not INTERACTION_CHARSET:
        # look the charset up once and cache it in the module global
        INTERACTION_CHARSET = configurator.default_configurator().get('interaction-charset', 'UTF-8')
    decoded = unicode(raw_query, INTERACTION_CHARSET, 'replace')
    hits = PrestoCollection(repo, None, decoded, None, None, 0.0)
    lines = []
    for match in hits.docs():
        # the type half of the pair is not used here
        title, _mtype = doc_title_and_type(match)
        lines.append('%s %s' % (match.id, title))

    response.reply('\n'.join(lines), 'text/plain')
    def __init__(self, name, mailcontextinst, category=None, flags=None, email_folder=True, ip=None, collection=None):
        """Create a mailbox backed by an explicit collection or by a repository query.

        When ``collection`` is given it is used directly as the folder.
        Otherwise a PrestoCollection is built over ``mailcontextinst.repo``:
        restricted to message/rfc822 documents when ``email_folder`` is true,
        to ``category`` when one is given, and excluding the "email"
        category when ``category`` equals False.  The mail context is kept
        only as a weak reference.
        """
        self.category = category
        self.collection = collection
        self.ip = ip
        self.mailcontext = weakref.ref(mailcontextinst)
        if collection:
            self.folder = collection
        else:
            if email_folder:
                terms = '+apparent-mime-type:"message/rfc822"'
            else:
                terms = ""
            if category:
                terms = terms + ' +categories:"' + category + '"'
            elif category == False:
                terms = terms + ' -categories:email'
            self.folder = PrestoCollection(mailcontextinst.repo, None, terms)
        # a non-email folder is presented with allseen=True
        mailbox.__init__(self, name, ids=self.folder.docs(), flags=flags, allseen=(not email_folder))
class uplib_email_mailbox (mailbox):

    def __init__(self, name, mailcontextinst, category=None, flags=None, email_folder=True, ip=None, collection=None):
        """Create a mailbox backed by an explicit collection or by a repository query.

        When ``collection`` is given it is used directly as the folder.
        Otherwise a PrestoCollection is built over ``mailcontextinst.repo``:
        restricted to message/rfc822 documents when ``email_folder`` is true,
        to ``category`` when one is given, and excluding the "email"
        category when ``category`` equals False.  The mail context is kept
        only as a weak reference.
        """
        self.category = category
        self.collection = collection
        self.ip = ip
        self.mailcontext = weakref.ref(mailcontextinst)
        if collection:
            self.folder = collection
        else:
            if email_folder:
                terms = '+apparent-mime-type:"message/rfc822"'
            else:
                terms = ""
            if category:
                terms = terms + ' +categories:"' + category + '"'
            elif category == False:
                terms = terms + ' -categories:email'
            self.folder = PrestoCollection(mailcontextinst.repo, None, terms)
        # a non-email folder is presented with allseen=True
        mailbox.__init__(self, name, ids=self.folder.docs(), flags=flags, allseen=(not email_folder))

    def __str__(self):
        """Describe the mailbox by name, object id and message count."""
        count = 0
        if self.msglist:
            count = len(self.msglist)
        return '<UpLib mailbox "%s", id=%x, %s messages>' % (self.name, id(self), count)

    def __repr__(self):
        """Describe the mailbox by name, object id and message count."""
        count = 0
        if self.msglist:
            count = len(self.msglist)
        return '<UpLib mailbox "%s", id=%x, %s messages>' % (self.name, id(self), count)

    def may_have_children(self):
        """Return False when the mailbox wraps an explicit collection, True otherwise."""
        if self.collection:
            return False
        return True

    def get_internaldate(self, msg):
        """Format the add time of the message's document as a "+0000" timestamp string."""
        added = msg.mbox_id.add_time()
        utc_fields = time.gmtime(added)
        return time.strftime("%a, %d %b %Y %H:%M:%S +0000", utc_fields)

    def get_msg_as_email (self, doc):
        """Return an email message object representing ``doc``, or None on failure.

        A document whose "apparent-mime-type" metadata is "message/rfc822"
        is parsed from the first entry os.listdir() reports in its
        "originals" folder.  Any other document is rendered as a synthetic
        multipart/related message: an HTML abstract (see
        build_html_abstract_display) plus the document icon as an inline
        PNG referenced by Content-ID.

        Any exception raised along the way is logged with note() and None
        is returned instead.
        """
        try:
            mime_type = doc.get_metadata("apparent-mime-type")
            if mime_type == "message/rfc822":
                f = os.path.join(doc.folder(), "originals")
                filepath = os.path.join(f, os.listdir(f)[0])
                fp = open(filepath, 'r')
                # close the file even when read() fails
                try:
                    s = fp.read()
                finally:
                    fp.close()
                msg = message_from_string(s)
            else:

                def make_header(name, value):
                    # prefer plain ASCII; fall back to UTF-8 for anything else
                    try:
                        v = value.encode("US-ASCII")
                        charset = "US-ASCII"
                    except UnicodeError:
                        v = value.encode("UTF-8")
                        charset = "UTF-8"
                    return name, email.Header.Header(v, charset, 77, name).encode()

                def build_icon(doc):
                    # the document icon as a base64 PNG part with a Content-ID
                    icon = doc.document_icon()
                    img_part = email.Message.Message()
                    img_part.set_type("image/png")
                    cid = "%s.%s.%s.icon" % (self.ip, doc.repo.secure_port(), doc.id)
                    img_part.add_header("Content-ID", cid)
                    img_part.add_header("Content-Transfer-Encoding", "base64")
                    img_part.set_payload(base64.encodestring(icon))
                    return img_part

                def build_description(doc, display):
                    # the HTML abstract as a quoted-printable text/html part
                    desc_part = email.Message.Message()
                    desc_part.set_type("text/html")
                    desc_part.add_header("Content-Transfer-Encoding", "quoted-printable")
                    desc_part.set_payload(quopri.encodestring('<html><body bgcolor="%s">' % STANDARD_BACKGROUND_COLOR +
                                                              display.encode('UTF-8') + "</body></html>\n"), "UTF-8")
                    return desc_part

                icon_payload = build_icon(doc)
                display, name = self.build_html_abstract_display(doc, icon_payload.get("Content-ID"))
                msg = email.Message.Message()
                msg.set_type("multipart/related;boundary=%s%s%s%s" % (self.ip, doc.repo.secure_port(), doc.id, long(time.time())))
                msg.add_header(*make_header("Message-ID", "%s:%s:%s" % (self.ip, doc.repo.secure_port(), doc.id)))
                d = doc.get_date()
                if d:
                    # missing month/day components default to 1; if the date
                    # cannot be converted, use the time derived from the doc id
                    try:
                        d = email.Utils.formatdate(time.mktime((d[0], (d[1] or 1), (d[2] or 1), 0, 0, 0, 0, 1, -1,)))
                    except Exception:
                        d = email.Utils.formatdate(id_to_time(doc.id))
                else:
                    d = email.Utils.formatdate(id_to_time(doc.id))
                msg.add_header(*make_header("Date", d))
                msg.add_header(*make_header("Subject", name))
                authors = doc.get_metadata("authors")
                if authors:
                    authors = authors.replace(" and ", ", ")
                    # Escape backslashes FIRST: doing it last would double the
                    # backslashes just inserted in front of '"' and CR and
                    # break the quoted string.
                    authors = authors.replace('\\', '\\\\').replace('"', '\\"').replace('\r', '\\\r')
                    msg.add_header(*make_header("From", '"' + authors + '"'))
                body_payload = build_description(doc, display)
                msg.attach(body_payload)
                msg.attach(icon_payload)
                # note("msg is:\n%s", str(msg))
            return msg
        except Exception:
            # best effort: log the traceback and report failure with None
            note("Exception getting document %s as email:\n%s", doc.id, ''.join(traceback.format_exception(*sys.exc_info())))
            return None

    def _read_state(self, doc):
        """Return ``(uid, flags)`` for this mailbox from the document's "imap-state" metadata.

        The metadata value is evaluated as a Python expression and looked up
        by this mailbox's name.  ``uid`` is None when nothing is recorded for
        this mailbox; ``flags`` is always a fresh empty list.
        """
        raw = doc.get_metadata("imap-state")
        if not raw:
            return None, []
        # NOTE(review): eval() executes whatever is stored in the metadata;
        # confirm this value can only ever be written by _save_state().
        states = eval(raw)
        note(5, "imap state for %s is %s", doc.id, states)
        if not (states and states.has_key(self.name)):
            return None, []
        entry = states[self.name]
        uid = entry.get("uid")
        # NOTE(review): the stored flags are read but not returned -- callers
        # always get [].  Confirm whether that is intentional before changing.
        flags = entry.get("flags")
        return uid, []

    def _save_state(self, doc, uid, flags):
        """Record ``uid`` and ``flags`` for this mailbox in the document's "imap-state" metadata.

        The metadata holds the repr() of a dict keyed by mailbox name; the
        entry for this mailbox is replaced and the whole dict written back.
        """
        stored = doc.get_metadata("imap-state")
        if not stored:
            stored = "{}"
        # NOTE(review): eval() executes whatever is stored in the metadata;
        # confirm this value can only ever be written by this method.
        states = eval(stored)
        states[self.name] = { "uid": uid, "flags": flags }
        serialized = repr(states)
        note("imap state is now %s", serialized)
        doc.update_metadata({"imap-state": serialized}, True)

    def add_message(self, msg):
        """Add ``msg`` to the mailbox, assigning its uid and flags.

        The uid recorded by _read_state() is reused when there is one,
        otherwise the next free uid is taken.  "\\Seen" is appended for
        all-seen mailboxes, the document is included in the folder if
        missing, and this mailbox's category is added to the document when
        it lacks it (marking the message as needing a save).
        """
        doc = msg.mbox_id
        saved_uid, flags = self._read_state(doc)
        if not saved_uid:
            msg.uid = self.next_uid_val
            self.next_uid_val += 1
        else:
            msg.uid = saved_uid
        msg.flags = flags
        if self.allseen:
            msg.flags.append("\\Seen")
        if doc not in self.folder.docs():
            self.folder.include_doc(doc)
        wanted = self.category
        if wanted and (wanted not in doc.get_category_strings()):
            doc.add_category(wanted)
            msg.set_needs_saving()
        mailbox.add_message(self, msg)

    def build_html_abstract_display (self, doc, icon_cid):
        """Build the HTML abstract shown for a non-email document.

        The abstract is a one-row table: the icon (referenced as
        ``cid:icon_cid``) linking to the document viewer, followed by the
        title, authors, publication date, a summary taken from the
        "comment", "abstract" or "summary" metadata (in that order of
        preference), the page count, the categories, and links to the
        original, PDF and HTML renditions.  The HTML link is omitted when
        the document's apparent MIME type already starts with "text/html".

        Returns ``(html, name)`` where ``name`` is the title metadata, the
        bracketed "name" metadata, or the document id as a last resort.
        """

        fp = StringIO()
        # tolerate a document without metadata
        metadata = doc.get_metadata() or {}
        pubdate = metadata.get("date")
        # strip the leading zero from the day and the hour
        date = re.sub(" 0|^0", " ",
                      time.strftime("%d %b %Y, %I:%M %p",
                                    time.localtime(id_to_time(doc.id))))
        name = doc.id
        page_count = metadata.get('page-count')
        summary = '<i>(No summary available.)</i>'
        if 'title' in metadata:
            name = metadata.get('title')
        elif 'name' in metadata:
            name = '[' + metadata.get('name') + ']'
        fp.write(u'<table border=0><tr><td>')
        fp.write(u'<center>')
        fp.write(u'<a href="https://%s:%d/action/basic/dv_show?doc_id=%s" border=0>' % (self.ip, doc.repo.secure_port(), doc.id))
        fp.write(u'<img src="cid:%s">' % icon_cid)
        fp.write(u'</a><p><small><font color="%s">(%s)</font></small></center></td><td>&nbsp;</td>'
                 % (STANDARD_DARK_COLOR, date))
        fp.write(u'<td valign=top><h3>%s</h3>' % htmlescape(name))
        if ('authors' in metadata) or pubdate:
            fp.write(u'<p><small>')
            if 'authors' in metadata:
                # escaped like the title, so '<' or '&' in an author name
                # cannot break the markup
                fp.write(u'<b>&nbsp;&nbsp;&nbsp;&nbsp;%s</b>'
                         % htmlescape(re.sub(' and ', ', ', metadata['authors'])))
            if pubdate:
                formatted_date = format_date(pubdate, True)
                fp.write(u'&nbsp;&nbsp;&nbsp;&nbsp;<i><font color="%s">%s</font></i>' % (STANDARD_DARK_COLOR,
                                                                                        formatted_date))
            fp.write(u'</small>\n')
        if 'comment' in metadata:
            summary = htmlescape(metadata.get('comment', ''))
        elif 'abstract' in metadata:
            summary = "<i>" + htmlescape(metadata.get('abstract', '')) + '</i>'
        elif 'summary' in metadata:
            summary = '<font color="%s">' % STANDARD_DARK_COLOR + htmlescape(metadata.get('summary')) + '</font>'
        fp.write(u'<P>%s' % summary)
        if page_count:
            fp.write(u'<small><i><font color="%s"> &middot; (%s page%s)'
                     % (STANDARD_DARK_COLOR, page_count, ((int(page_count) != 1) and "s") or ""))
            fp.write(u'</font></i></small>\n')
        cstrings = doc.get_category_strings()
        fp.write(u'<p>Categories:  ')
        if cstrings:
            fp.write(u' &middot; '.join([htmlescape(s) for s in cstrings]))
        else:
            fp.write('(none)')
        typ = doc.get_metadata("apparent-mime-type")
        if typ:
            mtype = ' &middot; <small>%s</small>' % typ
        else:
            mtype = ''
        fp.write(u'<p><a href="https://%s:%s/action/externalAPI/fetch_original?doc_id=%s&browser=true"><font color="%s">(Original%s)</font></a>'
                 % (self.ip, doc.repo.secure_port(), doc.id, STANDARD_DARK_COLOR, mtype))
        fp.write(u' &middot; <a href="https://%s:%s/action/basic/doc_pdf?doc_id=%s"><font color="%s">(PDF)</font></a>'
                 % (self.ip, doc.repo.secure_port(), doc.id, STANDARD_DARK_COLOR))
        # Test the raw MIME type: the decorated ``mtype`` label begins with
        # " &middot;" and so could never match "text/html".
        if not (typ or '').lower().startswith("text/html"):
            fp.write(u' &middot; <a href="https://%s:%s/action/basic/doc_html?doc_id=%s"><font color="%s">(HTML)</font></a>'
                     % (self.ip, doc.repo.secure_port(), doc.id, STANDARD_DARK_COLOR))
        fp.write(u'</td></tr></table>')
        d = fp.getvalue()
        fp.close()
        return d, name

    def save_message(self, msg):
        """Persist the message's uid and flags, provided its document folder still exists."""
        doc = msg.mbox_id
        if not os.path.isdir(doc.folder()):
            # nothing to write the state to
            return
        note("   saving flags for message mailbox=%s/uid=%s '%s'", msg.mailbox.name, msg.uid, doc.get_metadata("title"))
        self._save_state(doc, msg.uid, msg.flags)
        msg.saved()

    def remove_message(self, msg, save_state=True):
        """Remove ``msg`` from the mailbox and detach its document from what backs the mailbox.

        For a category mailbox the category is removed from the document;
        for a collection mailbox the document is excluded from the
        collection; otherwise it is excluded from the query folder.
        Failures are logged at level 2; only a failure in the query-folder
        case is re-raised.  When ``save_state`` is true and the message
        needs saving, its state is saved afterwards (errors logged only).
        """

        def log_failure():
            # log the traceback of the exception currently being handled
            note(2, "%s", ''.join(traceback.format_exception(*sys.exc_info())))

        mailbox.remove_message(self, msg)
        note("    removed from %s...", self)
        if self.category:
            try:
                msg.mbox_id.remove_category(self.category)
                note("    removed category %s from %s...", self.category, msg.mbox_id)
            except:
                log_failure()
        elif self.collection:
            try:
                # a PrestoCollection is always told to exclude the document;
                # any other collection only when it contains it
                if isinstance(self.collection, PrestoCollection) or (msg.mbox_id in self.collection):
                    self.collection.exclude_doc(msg.mbox_id)
                    note("    excluded %s from %s...", msg.mbox_id, self.collection)
            except:
                log_failure()
        else:
            # if neither category nor collection, self.folder is a PrestoCollection
            try:
                assert isinstance(self.folder, PrestoCollection)
                self.folder.exclude_doc(msg.mbox_id)
                note("    excluded %s from %s...", msg.mbox_id, self.folder)
            except:
                log_failure()
                raise
        try:
            if save_state and msg.needs_saving():
                self.save_message(msg)
        except:
            log_failure()

    def expunge_message(self, msg):
        """Remove ``msg`` from the mailbox and delete or re-save its document.

        After removal, the document's categories other than "email" are
        inspected.  If the document id is still valid and no such
        categories remain, the document is deleted when the mail context
        allows it (``expunge_deletes_docs``, or
        ``expunge_deletes_inbox_docs`` for the mailbox named "inbox").  If
        categories remain, the message state is saved instead.  The
        mailbox is rescanned at the end.
        """
        note(3, "Expunging %s...", msg.mbox_id)
        self.remove_message(msg, save_state=False)
        doc = msg.mbox_id
        leftover = doc.get_category_strings()
        if "email" in leftover:
            leftover = list(leftover)
            leftover.remove("email")
        # should we remove any categories not beginning with "email/"?
        repo = self.mailcontext().repo
        if repo.valid_doc_id(doc.id):
            if leftover:
                self.save_message(msg)
            else:
                # no categories left, candidate for deletion
                context = self.mailcontext()
                if context.expunge_deletes_docs or (
                        context.expunge_deletes_inbox_docs and self.name.lower() == "inbox"):
                    context.repo.delete_document(doc.id)
        self.rescan()

    def rescan (self, clear_recency=False, callback=None):
        """Bring the message list in line with the folder's current documents.

        The folder is rescanned; messages whose document is gone are
        removed, and a message is added for each document not yet
        represented.  ``clear_recency`` empties ``recent_msgs`` first.

        Returns ``(new_messages, deleted_messages)``; both lists are only
        filled in when ``callback`` is truthy (the callback itself is not
        invoked here).
        """
        if clear_recency:
            self.recent_msgs = []
        self.folder.rescan()
        # snapshot taken before any removal below
        known_docs = [entry.mbox_id for entry in self.msgs.values()]
        live_docs = self.folder.docs()
        added = []
        dropped = []
        # iterate over a copy of the keys: remove_message() changes self.msgs
        for uid in list(self.msgs.keys()):
            stale = self.msgs[uid]
            if stale.mbox_id in live_docs:
                continue
            if callback:
                dropped.append(stale)
            self.remove_message(stale)
        for doc in self.folder.docs():
            if doc in known_docs:
                continue
            fresh = message(self, doc, None, [])
            self.add_message(fresh)
            self.next_uid_val += 1
            if callback:
                added.append(fresh)
        return added, dropped
            
    def checkpoint(self):
        """Save every message in the mailbox that reports it needs saving."""
        for entry in self.msglist:
            if not entry.needs_saving():
                continue
            self.save_message(entry)

#       ALL
#          All messages in the mailbox; the default initial key for
#          ANDing.

#       ANSWERED
#          Messages with the \Answered flag set.

#       BCC <string>
#          Messages that contain the specified string in the envelope
#          structure's BCC field.

#       BEFORE <date>
#          Messages whose internal date (disregarding time and timezone)
#          is earlier than the specified date.

#       BODY <string>
#          Messages that contain the specified string in the body of the
#          message.

#       CC <string>
#          Messages that contain the specified string in the envelope
#          structure's CC field.

#       DELETED
#          Messages with the \Deleted flag set.

#       DRAFT
#          Messages with the \Draft flag set.

#       FLAGGED
#          Messages with the \Flagged flag set.

#       FROM <string>
#          Messages that contain the specified string in the envelope
#          structure's FROM field.

#       HEADER <field-name> <string>
#          Messages that have a header with the specified field-name (as
#          defined in [RFC-2822]) and that contains the specified string
#          in the text of the header (what comes after the colon).  If the
#          string to search is zero-length, this matches all messages that
#          have a header line with the specified field-name regardless of
#          the contents.

#       KEYWORD <flag>
#          Messages with the specified keyword flag set.

#       LARGER <n>
#          Messages with an [RFC-2822] size larger than the specified
#          number of octets.

#       NEW
#          Messages that have the \Recent flag set but not the \Seen flag.
#          This is functionally equivalent to "(RECENT UNSEEN)".

#       NOT <search-key>
#          Messages that do not match the specified search key.

#       OLD
#          Messages that do not have the \Recent flag set.  This is
#          functionally equivalent to "NOT RECENT" (as opposed to "NOT
#          NEW").

#       ON <date>
#          Messages whose internal date (disregarding time and timezone)
#          is within the specified date.

#       OR <search-key1> <search-key2>
#          Messages that match either search key.

#       RECENT
#          Messages that have the \Recent flag set.

#       SEEN
#          Messages that have the \Seen flag set.

#       SENTBEFORE <date>
#          Messages whose [RFC-2822] Date: header (disregarding time and
#          timezone) is earlier than the specified date.

#       SENTON <date>
#          Messages whose [RFC-2822] Date: header (disregarding time and
#          timezone) is within the specified date.

#       SENTSINCE <date>
#          Messages whose [RFC-2822] Date: header (disregarding time and
#          timezone) is within or later than the specified date.

#       SINCE <date>
#          Messages whose internal date (disregarding time and timezone)
#          is within or later than the specified date.

#       SMALLER <n>
#          Messages with an [RFC-2822] size smaller than the specified
#          number of octets.

#       SUBJECT <string>
#          Messages that contain the specified string in the envelope
#          structure's SUBJECT field.

#       TEXT <string>
#          Messages that contain the specified string in the header or
#          body of the message.

#       TO <string>
#          Messages that contain the specified string in the envelope
#          structure's TO field.

#       UID <sequence set>
#          Messages with unique identifiers corresponding to the specified
#          unique identifier set.  Sequence set ranges are permitted.

#       UNANSWERED
#          Messages that do not have the \Answered flag set.

#       UNDELETED
#          Messages that do not have the \Deleted flag set.

#       UNDRAFT
#          Messages that do not have the \Draft flag set.

#       UNFLAGGED
#          Messages that do not have the \Flagged flag set.

#       UNKEYWORD <flag>
#          Messages that do not have the specified keyword flag set.

#       UNSEEN
#          Messages that do not have the \Seen flag set.


    def search (self, charset, args):
        """Yield ``(msg, sequence_number)`` for each message matching the IMAP search keys.

        ``args`` is the flat list of search keys and operands and
        ``charset`` the charset they are encoded in (US-ASCII when empty).
        Text-like keys (TEXT, BODY, SUBJECT, FROM, date keys, NOT/AND/OR)
        are turned into extra clauses on the folder's query; flag keys are
        collected and applied to the candidate messages afterwards.
        Sequence numbers are 1-based positions in ``self.msglist``.

        TO/CC/BCC, SMALLER/LARGER, KEYWORD/UNKEYWORD, ON and HEADER are
        parsed (their operands are consumed) but not applied.

        NOTE(review): sequence-number and UID keys are collected in
        ``seqset`` but nothing in this method filters on them yet.
        """
        # general strategy:  do a search for possible hits on header fields and/or
        # body text, then filter out things like "\Recent"

        def consume_args(charset, args, seqset, prohibited_flags, required_flags, query):
            """Consume one search key and its operands from the front of ``args``.

            Returns ``(remaining_args, query)``.  ``seqset`` may be None
            (as in nested NOT/AND/OR calls), in which case sequence sets
            are parsed but not recorded.
            """
            charset = charset or "US-ASCII"
            arg = unicode(args[0], charset, "strict").lower()
            # number of operands, beyond the key itself, used up by this key
            used = 0

            def operand():
                # first operand of the key, decoded the same way as the key
                return unicode(args[1], charset, "strict")

            if arg and arg[0] in string.digits:
                # a bare sequence set; record all of it, not just its first character
                if seqset is not None:
                    seqset.append(('seq_nos', arg))
            elif arg == u'uid' and len(args) > 1 and args[1] and args[1][0] in string.digits:
                if seqset is not None:
                    seqset.append(('uids', operand()))
                used = 1
            elif arg in (u"unanswered", u"undeleted", u"undraft", u"unflagged", u"unseen"):
                prohibited_flags.append(u"\\" + arg[2:].capitalize())
            elif arg in (u"answered", u"deleted", u"draft", u"flagged", u"seen"):
                required_flags.append(u"\\" + arg.capitalize())
            elif arg == u'old':
                prohibited_flags.append(u"\\Recent")
            elif arg == u'new':
                required_flags.append(u"\\Recent")
                prohibited_flags.append(u"\\Seen")
            elif arg == u'text':
                query += u' %s' % quote(operand())
                used = 1
            elif arg == u'body':
                query += u' contents:%s' % quote(operand())
                used = 1
            elif arg == u'subject':
                a = quote(operand())
                query += u' (email-subject:%s OR title:%s)' % (a, a)
                used = 1
            elif arg in (u'to', u'cc', u'bcc'):
                # we don't handle "to" searches yet...
                used = 1
            elif arg in (u'smaller', u'larger'):
                # we don't handle "smaller" searches yet...
                used = 1
            elif arg in (u'keyword', u'unkeyword'):
                # we don't handle "keyword" searches yet; the flag name must
                # still be consumed so it is not mistaken for a search key
                used = 1
            elif arg == u'on':
                # we don't handle "on" searches yet; consume the date
                used = 1
            # Date keys: the operand is consumed here rather than being left
            # for the next round.  email.Utils.parsedate() returns None for a
            # string it cannot parse, which makes the indexing below fail.
            elif arg == u'since':
                d = email.Utils.parsedate(args[1])
                query += u' uplibdate:[%s/%s/%s TO NOW]' % (d[2], d[1], d[0])
                used = 1
            elif arg == u'before':
                d = email.Utils.parsedate(args[1])
                query += u' uplibdate:[1/1/1 TO %d/%d/%d]' % (d[2], d[1], d[0])
                used = 1
            elif arg == u'sentbefore':
                d = email.Utils.parsedate(args[1])
                query += u' date:[1/1/1 TO %d/%d/%d]' % (d[2], d[1], d[0])
                used = 1
            elif arg == u'sentsince':
                d = email.Utils.parsedate(args[1])
                query += u' date:[%s/%s/%s TO NOW]' % (d[2], d[1], d[0])
                used = 1
            elif arg == u'senton':
                d = email.Utils.parsedate(args[1])
                query += u' date:[%s/%s/%s TO %s/%s/%s]' % (d[2], d[1], d[0], d[2], d[1], d[0])
                used = 1
            elif arg == u'header':
                # we don't do header yet
                used = 2
            elif arg == u'from':
                a = quote(operand())
                query += u' (email-from:%s OR author:%s)' % (a, a)
                used = 1
            elif arg == u'not':
                # the nested call consumes the negated key and hands back what is left
                rest, subquery = consume_args(charset, args[1:], None, [], [], "")
                if subquery:
                    query += u" -( " + subquery + " )"
                return rest, query
            elif arg == u'and':
                rest, subquery1 = consume_args(charset, args[1:], None, [], [], "")
                rest, subquery2 = consume_args(charset, rest, None, [], [], "")
                if (subquery1 and subquery2):
                    query += u" ( " + subquery1 + u" AND " + subquery2 + " )"
                return rest, query
            elif arg == u'or':
                rest, subquery1 = consume_args(charset, args[1:], None, [], [], "")
                rest, subquery2 = consume_args(charset, rest, None, [], [], "")
                if (subquery1 and subquery2):
                    query += u" ( " + subquery1 + u" OR " + subquery2 + u" )"
                return rest, query
            # u'all' and any unrecognized key add nothing and consume only themselves
            return args[used+1:], query

        def satisfies_flags (msg, required, prohibited):
            # True when msg carries every required flag and no prohibited one
            for f in required:
                if f not in msg.flags:
                    return False
            for f in prohibited:
                if f in msg.flags:
                    return False
            return True

        query = self.folder.query[:]
        if not isinstance(query, unicode):
            query = unicode(query, "UTF-8", "strict")
        seqset = []
        prohibited_flags = []
        required_flags = []
        while args:
            args, query = consume_args(charset, args, seqset, prohibited_flags, required_flags, query)
        note("query is %s, required_flags are %s, prohibited_flags are %s", repr(query), required_flags, prohibited_flags)
        hits = QueryCollection(self.folder.repository, None, query).docs()
        note("hits are %s", hits)
        count = 0
        while count < len(self.msglist):
            msg = self.msglist[count]
            if (msg.mbox_id in hits) and satisfies_flags(msg, required_flags, prohibited_flags):
                yield msg, count+1
            count += 1

    def read_only (self, client):
        """Return True when ``client`` is not authenticated or has no user."""
        if client.state == STATE_NOT_AUTHENTICATED:
            return True
        return client.user is None