def matching_docids(repo, response, params):
    """Reply with the ids of the documents that match ``params['query']``.

    The reply is sent through ``response.reply`` as ``text/plain``; the
    matching document ids are joined with single spaces, so zero matches
    produce an empty body.

    There is no support for cutoff/narrow/widen; the intended use is mostly
    for querying unique text which will match exactly one document.
    """
    # This is an abbreviated version of basicPlugins.py _repo_search().
    search_terms = params['query']
    matches = PrestoCollection(repo, None, search_terms, None, None, 0.0)
    doc_ids = []
    for hit in matches.docs():
        doc_ids.append(hit.id)
    response.reply(' '.join(doc_ids), 'text/plain')
def test_algorithms (repo, *algs):
    """Run each scoring algorithm over a test collection and tally tag hits.

    The test set is the repository collection named
    "categorize algorithms test set"; if it does not exist yet it is created
    from a fixed ``uplibdate`` range query and registered with the repository.

    For every document in the set that already has categories (the "ground
    truth" from ``repo.get_docids_with_categories()``), ``find_likely_tags``
    is called with ``score_adjust=alg`` and its first ten suggestions are
    compared with the document's real categories.

    :param repo: the repository to test against
    :param algs: zero or more values passed as ``score_adjust`` to
        ``find_likely_tags``
    :return: a list of ``(alg, docs)`` tuples, where ``docs`` maps a document
        id to ``(found, missed, page_count, textlen, title)``: ``found`` is
        the list of matching suggestions, ``missed`` the categories that were
        not suggested, ``page_count`` is 0 when the document has no page
        count metadata, ``textlen`` is the length of the stripped text.
    """
    from uplib.collection import PrestoCollection

    # find or create a test set
    c = repo.get_collection("categorize algorithms test set")
    if not c:
        c = PrestoCollection(repo, None, query="uplibdate:[1/1/2005 TO 5/1/2008]")
        repo.add_collection("categorize algorithms test set", c)

    # get "ground truth"
    id_to_categories_mapping = repo.get_docids_with_categories()

    # now run the tests
    algresults = []
    for alg in algs:
        # for each document, see how many tags are in first 10,
        # and how many are missed
        docs = {}
        untagged = 0
        tagged = 0
        for doc in c.docs():
            # list() gives a private, mutable copy whether the mapping stores
            # lists or tuples; we remove entries from it below
            dtags = list(id_to_categories_mapping.get(doc.id, ()))
            # ignore untagged docs
            if not dtags:
                untagged += 1
                continue
            tagged += 1
            found = []
            tags = find_likely_tags(doc, score_adjust=alg) or []
            for tag in tags[:10]:
                if tag[0] in dtags:
                    found.append(tag)
                    dtags.remove(tag[0])
            # whatever is left was not among the first ten suggestions
            missed = dtags
            text = doc.text()
            textlen = (text and len(text.strip())) or 0
            # fall back to 0 so a document without page-count metadata does
            # not abort the whole run with int(None)
            page_count = int(doc.get_metadata("page-count")
                             or doc.get_metadata("pagecount")
                             or 0)
            docs[doc.id] = (found, missed, page_count, textlen,
                            doc.get_metadata("title") or "")
        note(3, "%s:  %d untagged docs, %d tagged docs", alg, untagged, tagged)
        algresults.append((alg, docs,))
    return algresults
def matching_ids_and_filenames(repo, response, params):
    """Reply with one line per document that matches ``params['query']``.

    The query is decoded to unicode with the configured interaction charset
    (looked up once via the default configurator and cached in the
    module-level ``INTERACTION_CHARSET``; undecodable bytes are replaced).

    Each line of the ``text/plain`` reply has the form:

         doc_id ' ' title

    where ``title`` is the first value returned by ``doc_title_and_type``.
    Zero matches produce an empty body.
    """
    global INTERACTION_CHARSET

    # This is an abbreviated version of basicPlugins.py _repo_search().
    raw_query = params['query']
    if not INTERACTION_CHARSET:
        INTERACTION_CHARSET = configurator.default_configurator().get('interaction-charset', 'UTF-8')
    decoded_query = unicode(raw_query, INTERACTION_CHARSET, 'replace')

    matches = PrestoCollection(repo, None, decoded_query, None, None, 0.0)
    lines = []
    for hit in matches.docs():
        # the mime type half of the pair is not part of the reply
        title, _mtype = doc_title_and_type(hit)
        lines.append('%s %s' % (hit.id, title))
    response.reply('\n'.join(lines), 'text/plain')
def __init__(self, name, mailcontextinst, category=None, flags=None, email_folder=True, ip=None, collection=None):
    """Set up the mailbox and the document collection that backs it.

    When ``collection`` is given it is used directly as ``self.folder``.
    Otherwise a ``PrestoCollection`` is built over ``mailcontextinst.repo``
    from a query assembled here: restricted to message/rfc822 documents when
    ``email_folder`` is true, narrowed to ``category`` when one is given, or
    excluding the "email" category when ``category`` is False.

    Only a weak reference to ``mailcontextinst`` is kept.  The base
    ``mailbox`` initializer receives the folder's documents as ``ids`` and
    ``allseen`` set to the negation of ``email_folder``.
    """
    self.category = category
    self.collection = collection
    self.ip = ip
    self.mailcontext = weakref.ref(mailcontextinst)

    if collection:
        self.folder = collection
    else:
        if email_folder:
            terms = '+apparent-mime-type:"message/rfc822"'
        else:
            terms = ""
        if category:
            terms = terms + ' +categories:"' + category + '"'
        elif category == False:
            terms = terms + ' -categories:email'
        self.folder = PrestoCollection(mailcontextinst.repo, None, terms)

    mailbox.__init__(self, name,
                     ids=self.folder.docs(),
                     flags=flags,
                     allseen=(not email_folder))
class uplib_email_mailbox (mailbox):
    """A ``mailbox`` whose messages are documents of an UpLib collection.

    ``self.folder`` is the backing collection: either the ``collection``
    passed to the constructor or a ``PrestoCollection`` built from a query.
    Each message's ``mbox_id`` is the document object it represents.
    """

    def __init__(self, name, mailcontextinst, category=None, flags=None, email_folder=True, ip=None, collection=None):
        """Set up the mailbox and the document collection that backs it.

        When ``collection`` is given it is used directly as ``self.folder``.
        Otherwise a ``PrestoCollection`` is built over
        ``mailcontextinst.repo``: restricted to message/rfc822 documents when
        ``email_folder`` is true, narrowed to ``category`` when one is given,
        or excluding the "email" category when ``category`` is False.

        Only a weak reference to ``mailcontextinst`` is kept.
        """
        query = (email_folder and '+apparent-mime-type:"message/rfc822"') or ""
        self.category = category
        self.collection = collection
        self.ip = ip
        self.mailcontext = weakref.ref(mailcontextinst)
        if collection:
            self.folder = collection
        else:
            if category:
                query += (' +categories:"' + category + '"')
            elif category == False:
                # an explicit False (as opposed to the default None)
                query += (' -categories:email')
            self.folder = PrestoCollection(mailcontextinst.repo, None, query)
        mailbox.__init__(self, name, ids=self.folder.docs(), flags=flags, allseen=(not email_folder))

    def __str__(self):
        """Same text as ``__repr__``."""
        return self.__repr__()

    def __repr__(self):
        """Describe the mailbox by name, object id and message count."""
        return '<UpLib mailbox "%s", id=%x, %s messages>' % (
            self.name, id(self), len(self.msglist or ()))

    def may_have_children(self):
        """Return True unless this mailbox wraps an explicit collection."""
        return not self.collection

    def get_internaldate(self, msg):
        """Format the add time of the message's document as a GMT date string."""
        return time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.gmtime(msg.mbox_id.add_time()))

    def get_msg_as_email (self, doc):
        """Return an email message object for ``doc``, or None on any error.

        Documents whose "apparent-mime-type" is message/rfc822 are parsed
        from the first file in their "originals" folder.  For any other
        document a multipart/related message is synthesized from an HTML
        abstract plus the document icon.  Failures are logged with ``note``
        and reported to the caller as None.
        """
        try:
            mime_type = doc.get_metadata("apparent-mime-type")
            if mime_type == "message/rfc822":
                f = os.path.join(doc.folder(), "originals")
                filepath = os.path.join(f, os.listdir(f)[0])
                fp = open(filepath, 'r')
                try:
                    s = fp.read()
                finally:
                    # close the file even if the read fails
                    fp.close()
                msg = message_from_string(s)
            else:

                def make_header(name, value):
                    # use plain ASCII when the value allows it, else UTF-8
                    try:
                        v = value.encode("US-ASCII")
                        charset = "US-ASCII"
                    except UnicodeError:
                        v = value.encode("UTF-8")
                        charset = "UTF-8"
                    return name, email.Header.Header(v, charset, 77, name).encode()

                def build_icon(doc):
                    # base64-encoded PNG part carrying the document icon
                    icon = doc.document_icon()
                    img_part = email.Message.Message()
                    img_part.set_type("image/png")
                    cid = "%s.%s.%s.icon" % (self.ip, doc.repo.secure_port(), doc.id)
                    img_part.add_header("Content-ID", cid)
                    img_part.add_header("Content-Transfer-Encoding", "base64")
                    img_part.set_payload(base64.encodestring(icon))
                    return img_part

                def build_description(doc, display):
                    # quoted-printable HTML part wrapping the abstract
                    desc_part = email.Message.Message()
                    desc_part.set_type("text/html")
                    desc_part.add_header("Content-Transfer-Encoding", "quoted-printable")
                    desc_part.set_payload(quopri.encodestring('<html><body bgcolor="%s">' % STANDARD_BACKGROUND_COLOR +
                                                              display.encode('UTF-8') + "</body></html>\n"), "UTF-8")
                    return desc_part

                icon_payload = build_icon(doc)
                display, name = self.build_html_abstract_display(doc, icon_payload.get("Content-ID"))
                msg = email.Message.Message()
                msg.set_type("multipart/related;boundary=%s%s%s%s" % (self.ip, doc.repo.secure_port(), doc.id, long(time.time())))
                msg.add_header(*make_header("Message-ID", "%s:%s:%s" % (self.ip, doc.repo.secure_port(), doc.id)))
                d = doc.get_date()
                if d:
                    try:
                        # missing month/day components default to 1
                        d = email.Utils.formatdate(time.mktime((d[0], (d[1] or 1), (d[2] or 1), 0, 0, 0, 0, 1, -1,)))
                    except Exception:
                        d = email.Utils.formatdate(id_to_time(doc.id))
                else:
                    d = email.Utils.formatdate(id_to_time(doc.id))
                msg.add_header(*make_header("Date", d))
                msg.add_header(*make_header("Subject", name))
                authors = doc.get_metadata("authors")
                if authors:
                    # Backslashes must be escaped FIRST; otherwise the
                    # backslashes introduced for '"' and '\r' get doubled and
                    # the quoted string is broken.
                    authors = (authors.replace(" and ", ", ")
                               .replace('\\', '\\\\')
                               .replace('"', '\\"')
                               .replace('\r', '\\\r'))
                    msg.add_header(*make_header("From", '"' + authors + '"'))
                body_payload = build_description(doc, display)
                msg.attach(body_payload)
                msg.attach(icon_payload)
                # note("msg is:\n%s", str(msg))
            return msg
        except Exception:
            note("Exception getting document %s as email:\n%s", doc.id,
                 ''.join(traceback.format_exception(*sys.exc_info())))
            return None

    def _read_state(self, doc):
        """Return ``(uid, flags)`` stored for this mailbox in ``doc``'s metadata.

        The "imap-state" metadata value is the repr of a dict keyed by
        mailbox name.  Returns ``(None, [])`` when nothing is stored for
        this mailbox.
        """
        d = doc.get_metadata("imap-state")
        if d:
            # SECURITY NOTE(review): eval() of a metadata string executes
            # arbitrary code if the metadata can be written by an untrusted
            # party -- consider a literal-only parser.
            d = eval(d)
            note(5, "imap state for %s is %s", doc.id, d)
        if d and (self.name in d):
            state = d[self.name]
            uid = state.get("uid")
            # NOTE(review): the stored flags are deliberately(?) not
            # returned -- callers always start from an empty flag list.
            # Confirm before changing.
            return uid, []
        return None, []

    def _save_state(self, doc, uid, flags):
        """Store ``uid`` and ``flags`` for this mailbox in ``doc``'s "imap-state" metadata."""
        d = doc.get_metadata("imap-state")
        # SECURITY NOTE(review): see _read_state() about eval() on metadata.
        d = eval(d or "{}")
        d[self.name] = { "uid": uid, "flags": flags }
        note("imap state is now %s", repr(d))
        doc.update_metadata({"imap-state": repr(d)}, True)

    def add_message(self, msg):
        """Assign a uid and flags to ``msg`` and add it to the mailbox.

        A uid stored in the document's metadata is reused; otherwise the
        next free uid is taken.  The document is included in the backing
        folder and given this mailbox's category if it lacks either.
        """
        uid, flags = self._read_state(msg.mbox_id)
        if uid:
            msg.uid = uid
        else:
            msg.uid = self.next_uid_val
            self.next_uid_val += 1
        msg.flags = flags
        if self.allseen:
            msg.flags.append("\\Seen")
        if msg.mbox_id not in self.folder.docs():
            self.folder.include_doc(msg.mbox_id)
        doc = msg.mbox_id
        if self.category and (self.category not in doc.get_category_strings()):
            doc.add_category(self.category)
        msg.set_needs_saving()
        mailbox.add_message(self, msg)

    def build_html_abstract_display (self, doc, icon_cid):
        """Build an HTML summary of ``doc`` for use as a message body.

        :param doc: the document to describe
        :param icon_cid: Content-ID of the icon part, referenced as ``cid:``
        :return: ``(html, name)`` where ``name`` is the document's title,
            its bracketed "name" metadata, or its id, in that order of
            preference
        """
        fp = StringIO()
        metadata = doc.get_metadata()
        pubdate = metadata.get("date")
        # strip the zero padding from day and hour
        date = re.sub(" 0|^0", " ",
                      time.strftime("%d %b %Y, %I:%M %p", time.localtime(id_to_time(doc.id))))
        name = doc.id
        page_count = metadata.get('page-count')
        summary = '<i>(No summary available.)</i>'
        if 'title' in metadata:
            name = metadata.get('title')
        elif 'name' in metadata:
            name = '[' + metadata.get('name') + ']'
        fp.write(u'<table border=0><tr><td>')
        fp.write(u'<center>')
        fp.write(u'<a href="https://%s:%d/action/basic/dv_show?doc_id=%s" border=0>' % (self.ip, doc.repo.secure_port(), doc.id))
        fp.write(u'<img src="cid:%s">' % icon_cid)
        fp.write(u'</a><p><small><font color="%s">(%s)</font></small></center></td><td> </td>' % (STANDARD_DARK_COLOR, date))
        fp.write(u'<td valign=top><h3>%s</h3>' % htmlescape(name))
        if ('authors' in metadata) or pubdate:
            fp.write(u'<p><small>')
            if 'authors' in metadata:
                # escaped like every other metadata value written here
                fp.write(u'<b> %s</b>' % htmlescape(re.sub(' and ', ', ', metadata['authors'])))
            if pubdate:
                formatted_date = format_date(pubdate, True)
                fp.write(u' <i><font color="%s">%s</font></i>' % (STANDARD_DARK_COLOR, formatted_date))
            fp.write(u'</small>\n')
        if 'comment' in metadata:
            summary = htmlescape(metadata.get('comment', ''))
        elif 'abstract' in metadata:
            summary = "<i>" + htmlescape(metadata.get('abstract', '')) + '</i>'
        elif 'summary' in metadata:
            summary = '<font color="%s">' % STANDARD_DARK_COLOR + htmlescape(metadata.get('summary')) + '</font>'
        fp.write(u'<P>%s' % summary)
        if page_count:
            fp.write(u'<small><i><font color="%s"> · (%s page%s)'
                     % (STANDARD_DARK_COLOR, page_count, ((int(page_count) != 1) and "s") or ""))
            fp.write(u'</font></i></small>\n')
        cstrings = doc.get_category_strings()
        fp.write(u'<p>Categories: ')
        if cstrings:
            fp.write(u' · '.join([htmlescape(s) for s in cstrings]))
        else:
            fp.write(u'(none)')
        typ = doc.get_metadata("apparent-mime-type")
        if typ:
            # unicode literal: a non-ASCII byte string would fail to decode
            # when interpolated into the unicode templates below
            mtype = u' · <small>%s</small>' % typ
        else:
            mtype = u''
        fp.write(u'<p><a href="https://%s:%s/action/externalAPI/fetch_original?doc_id=%s&browser=true"><font color="%s">(Original%s)</font></a>'
                 % (self.ip, doc.repo.secure_port(), doc.id, STANDARD_DARK_COLOR, mtype))
        fp.write(u' · <a href="https://%s:%s/action/basic/doc_pdf?doc_id=%s"><font color="%s">(PDF)</font></a>'
                 % (self.ip, doc.repo.secure_port(), doc.id, STANDARD_DARK_COLOR))
        # Test the MIME type itself; the decorated ``mtype`` display string
        # can never start with "text/html".
        if not (typ or '').lower().startswith("text/html"):
            fp.write(u' · <a href="https://%s:%s/action/basic/doc_html?doc_id=%s"><font color="%s">(HTML)</font></a>'
                     % (self.ip, doc.repo.secure_port(), doc.id, STANDARD_DARK_COLOR))
        fp.write(u'</td></tr></table>')
        d = fp.getvalue()
        fp.close()
        return d, name

    def save_message(self, msg):
        """Persist ``msg``'s uid and flags if its document folder still exists."""
        doc = msg.mbox_id
        if os.path.isdir(doc.folder()):
            note("   saving flags for message mailbox=%s/uid=%s '%s'",
                 msg.mailbox.name, msg.uid, msg.mbox_id.get_metadata("title"))
            self._save_state(msg.mbox_id, msg.uid, msg.flags)
        msg.saved()

    def remove_message(self, msg, save_state=True):
        """Remove ``msg`` from the mailbox and detach its document.

        Depending on how the mailbox was built, the document loses this
        mailbox's category, or is excluded from the explicit collection, or
        is excluded from the query-built folder.  Failures in the first two
        cases are logged and ignored; in the last case they are logged and
        re-raised.  With ``save_state`` true, pending message state is saved
        afterwards (errors there are logged and ignored).
        """
        mailbox.remove_message(self, msg)
        note("   removed from %s...", self)
        if self.category:
            try:
                msg.mbox_id.remove_category(self.category)
                note("   removed category %s from %s...", self.category, msg.mbox_id)
            except Exception:
                note(2, "%s", ''.join(traceback.format_exception(*sys.exc_info())))
        elif self.collection:
            try:
                if isinstance(self.collection, PrestoCollection):
                    self.collection.exclude_doc(msg.mbox_id)
                    note("   excluded %s from %s...", msg.mbox_id, self.collection)
                elif msg.mbox_id in self.collection:
                    self.collection.exclude_doc(msg.mbox_id)
                    note("   excluded %s from %s...", msg.mbox_id, self.collection)
            except Exception:
                note(2, "%s", ''.join(traceback.format_exception(*sys.exc_info())))
        else:
            # if neither category nor collection, self.folder is a PrestoCollection
            try:
                assert isinstance(self.folder, PrestoCollection)
                self.folder.exclude_doc(msg.mbox_id)
                note("   excluded %s from %s...", msg.mbox_id, self.folder)
            except Exception:
                note(2, "%s", ''.join(traceback.format_exception(*sys.exc_info())))
                raise
        try:
            if save_state and msg.needs_saving():
                self.save_message(msg)
        except Exception:
            note(2, "%s", ''.join(traceback.format_exception(*sys.exc_info())))

    def expunge_message(self, msg):
        """Remove ``msg`` and, if configured, delete its now-uncategorized document.

        A document with no categories left (ignoring "email") is deleted
        from the repository when the mail context's ``expunge_deletes_docs``
        is set, or when ``expunge_deletes_inbox_docs`` is set and this
        mailbox is named "inbox".  The mailbox is rescanned afterwards.
        """
        note(3, "Expunging %s...", msg.mbox_id)
        self.remove_message(msg, save_state=False)
        remaining_categories = msg.mbox_id.get_category_strings()
        if "email" in remaining_categories:
            remaining_categories = list(remaining_categories)
            remaining_categories.remove("email")
        #note("   remaining categories in %s are %s...", msg.mbox_id, remaining_categories)
        # should we remove any categories not beginning with "email/"?
        repo = self.mailcontext().repo
        if repo.valid_doc_id(msg.mbox_id.id):
            if not remaining_categories:
                # no categories left, candidate for deletion
                if self.mailcontext().expunge_deletes_docs:
                    #note("   deleting document %s...", msg.mbox_id.id)
                    self.mailcontext().repo.delete_document(msg.mbox_id.id)
                elif self.mailcontext().expunge_deletes_inbox_docs and self.name.lower() == "inbox":
                    #note("   deleting document %s...", msg.mbox_id.id)
                    self.mailcontext().repo.delete_document(msg.mbox_id.id)
            else:
                self.save_message(msg)
        self.rescan()

    def rescan (self, clear_recency=False, callback=None):
        """Re-synchronize the message list with the backing folder.

        Messages whose document left the folder are removed; documents new
        to the folder get a message.

        :param clear_recency: when true, ``self.recent_msgs`` is emptied first
        :param callback: only tested for truth here; when set, the added and
            removed messages are collected for the caller
        :return: ``(newmsgs, delmsgs)``; both lists stay empty when
            ``callback`` is not set
        """
        if clear_recency:
            self.recent_msgs = []
        self.folder.rescan()
        existing_msgs = [x.mbox_id for x in self.msgs.values()]
        current_docs = self.folder.docs()
        # snapshot of the keys: remove_message() is called while looping
        mailbox_uids = list(self.msgs.keys())
        newmsgs = []
        delmsgs = []
        for uid in mailbox_uids:
            if self.msgs[uid].mbox_id not in current_docs:
                if callback:
                    delmsgs.append(self.msgs[uid])
                self.remove_message(self.msgs[uid])
        for doc in self.folder.docs():
            if doc not in existing_msgs:
                msg = message(self, doc, None, [])
                self.add_message(msg)
                # NOTE(review): add_message() already advances next_uid_val
                # when it hands out a fresh uid, so this skips a uid; kept
                # as-is to preserve uid assignment.
                self.next_uid_val += 1
                if callback:
                    newmsgs.append(msg)
        return newmsgs, delmsgs

    def checkpoint(self):
        """Save every message that reports pending state changes."""
        for msg in self.msglist:
            if msg.needs_saving():
                self.save_message(msg)

    # Search keys defined for the IMAP SEARCH command:
    #
    # ALL
    #    All messages in the mailbox; the default initial key for
    #    ANDing.
    #
    # ANSWERED
    #    Messages with the \Answered flag set.
    #
    # BCC <string>
    #    Messages that contain the specified string in the envelope
    #    structure's BCC field.
    #
    # BEFORE <date>
    #    Messages whose internal date (disregarding time and timezone)
    #    is earlier than the specified date.
    #
    # BODY <string>
    #    Messages that contain the specified string in the body of the
    #    message.
    #
    # CC <string>
    #    Messages that contain the specified string in the envelope
    #    structure's CC field.
    #
    # DELETED
    #    Messages with the \Deleted flag set.
    #
    # DRAFT
    #    Messages with the \Draft flag set.
    #
    # FLAGGED
    #    Messages with the \Flagged flag set.
    #
    # FROM <string>
    #    Messages that contain the specified string in the envelope
    #    structure's FROM field.
    #
    # HEADER <field-name> <string>
    #    Messages that have a header with the specified field-name (as
    #    defined in [RFC-2822]) and that contains the specified string
    #    in the text of the header (what comes after the colon).  If the
    #    string to search is zero-length, this matches all messages that
    #    have a header line with the specified field-name regardless of
    #    the contents.
    #
    # KEYWORD <flag>
    #    Messages with the specified keyword flag set.
    #
    # LARGER <n>
    #    Messages with an [RFC-2822] size larger than the specified
    #    number of octets.
    #
    # NEW
    #    Messages that have the \Recent flag set but not the \Seen flag.
    #    This is functionally equivalent to "(RECENT UNSEEN)".
    #
    # NOT <search-key>
    #    Messages that do not match the specified search key.
    #
    # OLD
    #    Messages that do not have the \Recent flag set.  This is
    #    functionally equivalent to "NOT RECENT" (as opposed to "NOT
    #    NEW").
    #
    # ON <date>
    #    Messages whose internal date (disregarding time and timezone)
    #    is within the specified date.
    #
    # OR <search-key1> <search-key2>
    #    Messages that match either search key.
    #
    # RECENT
    #    Messages that have the \Recent flag set.
    #
    # SEEN
    #    Messages that have the \Seen flag set.
    #
    # SENTBEFORE <date>
    #    Messages whose [RFC-2822] Date: header (disregarding time and
    #    timezone) is earlier than the specified date.
    #
    # SENTON <date>
    #    Messages whose [RFC-2822] Date: header (disregarding time and
    #    timezone) is within the specified date.
    #
    # SENTSINCE <date>
    #    Messages whose [RFC-2822] Date: header (disregarding time and
    #    timezone) is within or later than the specified date.
    #
    # SINCE <date>
    #    Messages whose internal date (disregarding time and timezone)
    #    is within or later than the specified date.
    #
    # SMALLER <n>
    #    Messages with an [RFC-2822] size smaller than the specified
    #    number of octets.
    #
    # SUBJECT <string>
    #    Messages that contain the specified string in the envelope
    #    structure's SUBJECT field.
    #
    # TEXT <string>
    #    Messages that contain the specified string in the header or
    #    body of the message.
    #
    # TO <string>
    #    Messages that contain the specified string in the envelope
    #    structure's TO field.
    #
    # UID <sequence set>
    #    Messages with unique identifiers corresponding to the specified
    #    unique identifier set.  Sequence set ranges are permitted.
    #
    # UNANSWERED
    #    Messages that do not have the \Answered flag set.
    #
    # UNDELETED
    #    Messages that do not have the \Deleted flag set.
    #
    # UNDRAFT
    #    Messages that do not have the \Draft flag set.
    #
    # UNFLAGGED
    #    Messages that do not have the \Flagged flag set.
    #
    # UNKEYWORD <flag>
    #    Messages that do not have the specified keyword flag set.
    #
    # UNSEEN
    #    Messages that do not have the \Seen flag set.

    def search (self, charset, args):
        """Yield ``(msg, sequence_number)`` for messages matching the search keys.

        This is a generator.  The search keys in ``args`` are translated
        into an extension of the folder's query plus lists of required and
        prohibited flags; messages are yielded when their document is among
        the query hits and their flags satisfy both lists.  Sequence numbers
        are 1-based positions in ``self.msglist``.

        :param charset: charset used to decode ``args``; a false value means
            US-ASCII
        :param args: list of search key tokens and their operands
        :raises IndexError: when a key is missing its operand
        :raises ValueError: when a date operand cannot be parsed
        """
        # general strategy:  do a search for possible hits on header fields and/or
        # body text, then filter out things like "\Recent"

        def parse_date(value):
            # Returns (day, month, year).  email.Utils.parsedate() yields
            # None for the IMAP "d-Mon-yyyy" form, so fall back to strptime.
            d = email.Utils.parsedate(value)
            if d is None:
                d = time.strptime(value.strip().strip('"'), "%d-%b-%Y")
            return d[2], d[1], d[0]

        def consume_args(charset, args, seqset, prohibited_flags, required_flags, query):
            # Consumes one search key (plus operands) from the front of
            # ``args``; returns the remaining args and the extended query.
            # ``count`` is the number of operands consumed after the key.
            count = 0
            charset = charset or "US-ASCII"
            arg = unicode(args[count], charset, "strict").lower()
            if arg and (arg[0] in string.digits):
                # NOTE(review): sequence sets are collected but not yet
                # applied when filtering the hits.
                seqset.append(('seq_nos', arg))
            elif (arg == u'uid' and len(args) > count + 1
                  and args[count+1][:1] and args[count+1][0] in string.digits):
                seqset.append(('uids', args[count+1]))
                count += 1
            elif arg in (u"unanswered", u"undeleted", u"undraft", u"unflagged", u"unseen"):
                prohibited_flags.append(u"\\" + arg[2:].capitalize())
            elif arg in (u"answered", u"deleted", u"draft", u"flagged", u"seen"):
                required_flags.append(u"\\" + arg.capitalize())
            elif arg == u'old':
                prohibited_flags.append(u"\\Recent")
            elif arg == u'recent':
                required_flags.append(u"\\Recent")
            elif arg == u'new':
                required_flags.append(u"\\Recent")
                prohibited_flags.append(u"\\Seen")
            elif arg == u'text':
                query += u' %s' % quote(unicode(args[count+1], charset, "strict"))
                count += 1
            elif arg == u'body':
                query += u' contents:%s' % quote(unicode(args[count+1], charset, "strict"))
                count += 1
            elif arg == u'subject':
                a = quote(unicode(args[count+1], charset, "strict"))
                query += u' (email-subject:%s OR title:%s)' % (a, a)
                count += 1
            elif arg in (u'to', u'cc', u'bcc'):
                # we don't handle "to" searches yet...
                count += 1
            elif arg in (u'smaller', u'larger'):
                # we don't handle "smaller" searches yet...
                count += 1
            elif arg in (u'keyword', u'unkeyword'):
                # we don't handle "keyword" searches yet...
                count += 1
            elif arg == u'since':
                day, month, year = parse_date(args[count+1])
                query += u' uplibdate:[%s/%s/%s TO NOW]' % (day, month, year)
                count += 1
            elif arg == u'before':
                day, month, year = parse_date(args[count+1])
                query += u' uplibdate:[1/1/1 TO %d/%d/%d]' % (day, month, year)
                count += 1
            elif arg == u'on':
                day, month, year = parse_date(args[count+1])
                query += u' uplibdate:[%s/%s/%s TO %s/%s/%s]' % (day, month, year, day, month, year)
                count += 1
            elif arg == u'sentbefore':
                day, month, year = parse_date(args[count+1])
                query += u' date:[1/1/1 TO %d/%d/%d]' % (day, month, year)
                count += 1
            elif arg == u'sentsince':
                day, month, year = parse_date(args[count+1])
                query += u' date:[%s/%s/%s TO NOW]' % (day, month, year)
                count += 1
            elif arg == u'senton':
                day, month, year = parse_date(args[count+1])
                query += u' date:[%s/%s/%s TO %s/%s/%s]' % (day, month, year, day, month, year)
                count += 1
            elif arg == u'header':
                # we don't do header yet
                count += 2
            elif arg == u'from':
                a = quote(unicode(args[count+1], charset, "strict"))
                query += u' (email-from:%s OR author:%s)' % (a, a)
                count += 1
            elif arg == u'not':
                # NOTE(review): flag keys inside NOT/AND/OR go to throw-away
                # lists and so have no effect; only query terms are combined.
                args, subquery = consume_args(charset, args[1:], seqset, [], [], "")
                # the recursive call already returned the remaining args
                count = -1
                if subquery:
                    query += u" -( " + subquery + " )"
            elif arg == u'and':
                args, subquery1 = consume_args(charset, args[1:], seqset, [], [], "")
                args, subquery2 = consume_args(charset, args, seqset, [], [], "")
                count = -1
                if (subquery1 and subquery2):
                    query += u" ( " + subquery1 + u" AND " + subquery2 + " )"
            elif arg == u'or':
                args, subquery1 = consume_args(charset, args[1:], seqset, [], [], "")
                args, subquery2 = consume_args(charset, args, seqset, [], [], "")
                count = -1
                if (subquery1 and subquery2):
                    query += u" ( " + subquery1 + u" OR " + subquery2 + u" )"
            elif arg == u'all':
                pass
            return args[count+1:], query

        def satisfies_flags (msg, required, prohibited):
            for f in required:
                if f not in msg.flags:
                    return False
            for f in prohibited:
                if f in msg.flags:
                    return False
            return True

        query = self.folder.query[:]
        if not isinstance(query, unicode):
            query = unicode(query, "UTF-8", "strict")
        seqset = []
        prohibited_flags = []
        required_flags = []
        while args:
            args, query = consume_args(charset, args, seqset, prohibited_flags, required_flags, query)
        note("query is %s, required_flags are %s, prohibited_flags are %s",
             repr(query), required_flags, prohibited_flags)
        hits = QueryCollection(self.folder.repository, None, query).docs()
        note("hits are %s", hits)
        for index, msg in enumerate(self.msglist):
            if (msg.mbox_id in hits) and satisfies_flags(msg, required_flags, prohibited_flags):
                yield msg, index + 1

    def read_only (self, client):
        """Return True when ``client`` is not authenticated or has no user."""
        return ((client.state == STATE_NOT_AUTHENTICATED) or (client.user is None))