def doc_from_bytes(docid, rdkey, b):
    msg = message_from_string(b)
    doc = {}
    mp = doc['multipart'] = msg.is_multipart()
    headers = doc['headers'] = {}
    # Given we have no opportunity to introduce an object which can ignore
    # the case of headers, we lowercase the keys.
    for hn in msg.keys():
        vals = msg.get_all(hn)
        if vals:
            # first do any charset etc conversion...
            vals = [_safe_convert_header(v) for v in vals]
            if hn.lower() == 'references':
                # email.utils.unquote will do bad things to references headers
                # (stripping initial and trailing <>'s), so we don't want to use
                # it for the references header - but other fields seem ok.  We
                # split the references into a list here because why not.
                headers[hn.lower()] = [extract_message_ids(vals[0])]
            else:
                headers[hn.lower()] = [unquote(v) for v in vals]
            # a sanity check and to help debug an obscure bug which seemed to
            # cause the wrong 'source' doc being passed!
            if __debug__ and rdkey[0] == 'email' and hn.lower() == 'message-id':
                from raindrop.proto.imap import get_rdkey_for_email
                assert tuple(rdkey) == get_rdkey_for_email(vals[0]), (rdkey, docid, vals)

    # XXX - technically msg objects are recursive; handling that requires
    # more thought.  For now, assume they are flat.
    # We must return non-text parts in attachments, so just return
    # *everything* in attachments.
    attachments = doc['_attachments'] = {}
    if mp:
        # a multi-part message - flatten it here by walking the list, but
        # only looking at the 'leaf' nodes.
        # attachments have lost their order; this list helps keep the
        # order and is a convenient place to stash other headers coming
        # with this part.
        mi = doc['multipart_info'] = []
        i = 1
        for attach in msg.walk():
            if not attach.is_multipart():
                name = sanitize_attach_name(attach.get_filename())
                if not name:
                    name = "subpart-%d" % i
                    i += 1
                attachments[name] = attach_from_msg((docid, name), attach)
                # Put together info about the attachment.
                ah = {}
                for hn, hv in attach.items():
                    ah[hn.lower()] = _safe_convert_header(hv)
                # content-type is redundant, but may be helpful...
                ct = attachments[name]['content_type']
                info = {'name': name, 'headers': ah, 'content_type': ct}
                mi.append(info)
    else:
        attachments['body'] = attach_from_msg((docid, 'body'), msg)
    return doc
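
# The References handling above leans on an extract_message_ids() helper that
# is not part of this excerpt.  A minimal sketch of the assumed behaviour -
# pull each angle-bracketed token out of a References/In-Reply-To value - is
# below; the project's actual helper may differ in detail.
import re

def extract_message_ids(header_value):
    # '<a@x> <b@y>' -> ['a@x', 'b@y']; ids are returned without the <>'s,
    # matching the way unquote() strips them from other headers.
    return re.findall(r'<([^>]+)>', header_value)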
def handler(doc):
    # an 'rfc822' doc stores 'headers' as a dict, with each entry being a list.
    # We only care about headers which rfc5322 says must appear 0 or 1 times,
    # so flatten the header values here...
    headers = dict((k, v[0]) for (k, v) in doc['headers'].iteritems())
    self_header_message_id = headers.get('message-id')
    # check something hasn't got confused...
    assert get_rdkey_for_email(self_header_message_id) == tuple(doc['rd_key']), doc

    if 'references' in headers:
        header_message_ids = headers['references']
    elif 'in-reply-to' in headers:
        header_message_ids = [headers['in-reply-to']]
    else:
        header_message_ids = []
    # save off the list of referenced messages (XXX - but this isn't used?)
    references = header_message_ids[:]
    # see if the self-message already exists...
    header_message_ids.append(self_header_message_id)
    uniq_header_message_ids = set(header_message_ids)
    logger.debug("header_message_ids: %s", header_message_ids)
    logger.debug("references: %s", '\n\t'.join(references))

    # Open a view trying to locate an existing conversation for any of these
    # headers.
    keys = [['rd.core.content', 'key-schema_id', [['email', mid], 'rd.msg.conversation']]
            for mid in uniq_header_message_ids]
    result = open_view(keys=keys, reduce=False, include_docs=True)
    # build a map of the keys we actually got back.
    rows = [r for r in result['rows'] if 'error' not in r]
    if rows:
        assert 'doc' in rows[0], rows
        convo_id = rows[0]['doc']['conversation_id']
        logger.debug("FOUND CONVERSATION header_message_id %s with conversation_id %s",
                     self_header_message_id, convo_id)
        seen_ids = set(r['value']['rd_key'][1] for r in rows)
    else:
        logger.debug("CREATING conversation_id %s", header_message_ids[0])
        convo_id = header_message_ids[0]
        seen_ids = None

    items = {'conversation_id': convo_id}
    # create convo records for any messages which don't yet exist -
    # presumably that includes me too!
    for hid in uniq_header_message_ids:
        if seen_ids is None or hid not in seen_ids:
            rdkey = ['email', hid]
            logger.debug('emitting convo schema for referenced message %r', rdkey)
            emit_schema('rd.msg.conversation', items, rd_key=rdkey)
    # make sure the current doc gets emitted in case it was
    # not part of the uniq_header_message_ids
    if doc['rd_key'][1] not in uniq_header_message_ids:
        logger.debug('emitting convo schema for my document %(rd_key)r', doc)
        emit_schema('rd.msg.conversation', items, rd_key=doc['rd_key'])
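
# Both this handler and doc_from_bytes() assert that get_rdkey_for_email()
# round-trips a message-id header back to the document's rd_key.  A sketch of
# the assumed behaviour - strip whitespace and any surrounding angle brackets,
# then pair the id with the 'email' namespace; the real helper in
# raindrop.proto.imap may differ in detail.
def get_rdkey_for_email(msg_id):
    # '<abc@example.com>' -> ('email', 'abc@example.com')
    return ("email", msg_id.strip().lstrip("<").rstrip(">"))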
def gen_corpus_docs(self, corpus_name, item_spec="*"):
    cwd = os.getcwd()
    corpus_dir = self.get_corpus_dir(corpus_name)
    num = 0
    # We try and make life simple for people by auto-determining the
    # 'schema' for some well-known file types (eg, .rfc822.txt)
    pattern = "%s/%s.*" % (corpus_dir, item_spec)
    base_names = set()
    for filename in glob.iglob(pattern):
        try:
            path, name = os.path.split(filename)
            # don't use splitext - we want the *first* dot.
            first, _ = name.split(".", 1)
            base = os.path.join(path, first)
        except ValueError:
            base = filename
        base_names.add(base)

    for basename in base_names:
        if basename.endswith('README') or basename.endswith('raindrop'):
            continue
        # .json files get first go - they may 'override' what we would
        # otherwise deduce.
        elif os.path.exists(basename + ".json"):
            filename = basename + ".json"
            with open(filename) as f:
                try:
                    ob = json.load(f)
                except ValueError, why:
                    self.fail("%r has invalid json: %r" % (filename, why))
            for name, data in ob.get('_attachments', {}).iteritems():
                fname = os.path.join(corpus_dir, data['filename'])
                with open(fname, 'rb') as attach_f:
                    enc_data = base64.encodestring(attach_f.read()).replace('\n', '')
                data['data'] = enc_data
        elif os.path.exists(basename + ".rfc822.txt"):
            # plain rfc822.txt file.
            with open(basename + ".rfc822.txt", 'rb') as f:
                data = f.read()
            msg_id = message_from_string(data)['message-id']
            ob = {
                'rd_schema_id': 'rd.msg.rfc822',
                'rd_key': get_rdkey_for_email(msg_id),
                'rd_ext_id': 'proto.imap',
                'rd_source': None,
                '_attachments': {
                    'rfc822': {
                        'content_type': 'message',
                        'data': base64.encodestring(data).replace('\n', ''),
                    }
                }
            }
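
# For illustration, a hypothetical corpus '<basename>.json' object of the kind
# the branch above accepts: each _attachments entry names a sibling file via
# 'filename', and gen_corpus_docs() replaces that with the base64-encoded
# 'data' field.  All values below are invented.
example_corpus_json = {
    "rd_key": ["email", "12345@example.com"],
    "rd_schema_id": "rd.msg.rfc822",
    "rd_ext_id": "proto.imap",
    "rd_source": None,
    "_attachments": {
        "rfc822": {"content_type": "message", "filename": "12345.rfc822.txt"},
    },
}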
def handler(doc):
    # an 'rfc822' doc stores 'headers' as a dict, with each entry being a list.
    # We only care about headers which rfc5322 says must appear 0 or 1 times,
    # so flatten the header values here...
    headers = dict((k, v[0]) for (k, v) in doc['headers'].iteritems())
    self_header_message_id = headers.get('message-id')
    # check something hasn't got confused...
    assert get_rdkey_for_email(self_header_message_id) == tuple(doc['rd_key']), doc

    references = set()
    if 'references' in headers:
        # 'references' is a bit special though - the provider of the source
        # schema has already split them!
        for ref in headers['references']:
            references.add(ref)
    if 'in-reply-to' in headers:
        references.add(headers['in-reply-to'])
    # the self-message...
    references.add(self_header_message_id)
    logger.debug("references: %s", '\n\t'.join(references))

    keys = (get_rdkey_for_email(mid) for mid in references)
    find_and_emit_conversation(keys)
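
# find_and_emit_conversation() is not defined in this excerpt.  Its assumed
# behaviour mirrors the older inline handler above: look up any existing
# rd.msg.conversation record for the given rd_keys, reuse its conversation_id
# if found (otherwise mint one from the first key), then emit the schema for
# every key that does not have it yet.  A compact sketch under those
# assumptions, using the same open_view/emit_schema environment helpers:
def find_and_emit_conversation(rdkeys):
    rdkeys = [tuple(k) for k in rdkeys]
    view_keys = [['rd.core.content', 'key-schema_id', [list(k), 'rd.msg.conversation']]
                 for k in rdkeys]
    result = open_view(keys=view_keys, reduce=False, include_docs=True)
    rows = [r for r in result['rows'] if 'error' not in r]
    if rows:
        convo_id = rows[0]['doc']['conversation_id']
        seen = set(tuple(r['value']['rd_key']) for r in rows)
    else:
        convo_id = rdkeys[0][1]
        seen = set()
    for key in rdkeys:
        if key not in seen:
            emit_schema('rd.msg.conversation', {'conversation_id': convo_id},
                        rd_key=list(key))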
def rfc822_to_schema_item(self, fp):
    data = fp.read()  # we need to use the data twice...
    msg_id = message_from_string(data)['message-id']
    si = {
        'rd_schema_id': 'rd.msg.rfc822',
        'rd_key': get_rdkey_for_email(msg_id),
        'rd_ext_id': 'proto.imap',
        'rd_source': None,
        'attachments': {
            'rfc822': {
                'content_type': 'message',
                'data': data,
            }
        },
        'items': {},
    }
    return si
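
# A small exercise of rfc822_to_schema_item() using an in-memory file object;
# the message text is invented and 'self' (unused by the method) is passed as
# None, so this is only a sketch of how the method is expected to behave.
from StringIO import StringIO

raw = ("Message-ID: <12345@example.com>\r\n"
       "Subject: hello\r\n"
       "\r\n"
       "body text\r\n")
si = rfc822_to_schema_item(None, StringIO(raw))
assert si['rd_key'][0] == 'email'
assert si['attachments']['rfc822']['data'] == raw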
def failed_recipient(doc):
    # failed_recipient is the email address(es) that was the intended target
    # and which, for whatever reason, doesn't exist.  We want to grab this
    # because:
    # 1. We could look through the address book for similar identities
    # 2. In the UI we could offer a message "You tried to email X but the
    #    address appears incorrect"
    # XXX - however, currently we do nothing with it.
    failed_recipient = doc['headers']['x-failed-recipients'][0]
    logger.debug("found DSN failed recipient message intended for %s",
                 failed_recipient)

    body = open_schema_attachment(doc, 'body')
    # Scan the message body for the original message id,
    # e.g. Message-ID: <*****@*****.**>
    match = re.search(r'Message-ID:\s*<(.+)>.*', body)
    if match:
        logger.debug("found Message-ID header in DSN message '%s'", match.group(1))
        id = match.group(1)
        # make an rd_key for the referenced message.
        rdkey_orig = get_rdkey_for_email(id)
        # and say this message and the original are related.
        find_and_emit_conversation([doc['rd_key'], rdkey_orig])
    else:
        logger.info("No match found for DSN message")
    return
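
# A quick illustration of the Message-ID scan above against an invented DSN
# body fragment (real bounce bodies vary by MTA, so this is only an example):
import re

sample_body = (
    "This message was created automatically by mail delivery software.\n"
    "...\n"
    "Message-ID: <deadbeef.0@example.com>\n")
m = re.search(r'Message-ID:\s*<(.+)>.*', sample_body)
assert m and m.group(1) == "deadbeef.0@example.com"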
def handler(doc):
    # This is dealing with the 'imap folder state cache doc' - it stores
    # all meta-data about all items in a folder, so one document holds the
    # state for many messages.  We first need to determine which are
    # different...
    rdkeys = []
    imap_flags = []
    folder_name = doc['rd_key'][1][1]
    for item in doc['infos']:
        msg_id = item['ENVELOPE'][-1]
        rdkey = get_rdkey_for_email(msg_id)
        rdkeys.append(rdkey)
        imap_flags.append((rdkey, item['FLAGS']))
    result = open_view('raindrop!content!all', 'msg-seen-flag', keys=rdkeys)
    # turn the result into a dict keyed by rdkey
    couch_values = {}
    for row in result['rows']:
        couch_values[hashable_key(row['key'])] = row['value']

    # work out which of these rdkeys actually exist in our db.
    existing_rdkeys = set()
    keys = [['key-schema_id', [rdkey, 'rd.msg.rfc822']] for rdkey in rdkeys]
    result = open_view(keys=keys, reduce=False)
    for row in result['rows']:
        existing_rdkeys.add(hashable_key(row['value']['rd_key']))

    # find what is different...
    nnew = 0
    nupdated = 0
    # Note it is fairly common to see multiples with the same msg ID in, eg,
    # a 'drafts' folder, so skip duplicates to avoid conflicts.
    seen_keys = set()
    for rdkey, flags in imap_flags:
        if rdkey in seen_keys:
            logger.info('skipping duplicate message in folder %r: %r',
                        folder_name, rdkey)
            continue
        if rdkey not in existing_rdkeys:
            # this means we haven't actually sucked the message into raindrop
            # yet (eg, --max-age may have caused only a subset of the messages
            # to be grabbed, although all messages in the folder are returned
            # in the input document)
            logger.debug('skipping message not yet in folder %r: %r',
                         folder_name, rdkey)
            continue
        seen_keys.add(rdkey)
        seen_now = "\\Seen" in flags
        try:
            couch_value = couch_values[rdkey]
        except KeyError:
            # new message
            items = {'seen': seen_now,
                     'outgoing_state': 'incoming',
                     }
            emit_schema('rd.msg.seen', items, rdkey)
            nnew += 1
        else:
            # If the state in couch is anything other than 'incoming', it
            # represents a request to change the state on the server (or the
            # process of trying to update the server).
            if couch_value.get('outgoing_state') != 'incoming':
                logger.info("found outgoing 'seen' state request in doc with key %r",
                            rdkey)
                continue
            seen_couch = couch_value['seen']
            if seen_now != seen_couch:
                items = {'seen': seen_now,
                         'outgoing_state': 'incoming',
                         '_rev': couch_value['_rev'],
                         }
                emit_schema('rd.msg.seen', items, rdkey)
                nupdated += 1
    logger.info("folder %r needs %d new and %d updated 'seen' records",
                folder_name, nnew, nupdated)
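
# hashable_key() is not shown in this excerpt.  Couch view keys come back as
# (possibly nested) JSON lists, which can't be used as dict keys or set
# members, so the assumed job of the helper is to turn them into tuples
# recursively.  A minimal sketch; the project's real helper may differ.
def hashable_key(key):
    if isinstance(key, list):
        return tuple(hashable_key(k) for k in key)
    return key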
def handler(doc):
    # This is dealing with the 'imap folder state cache doc' - it stores
    # all meta-data about all items in a folder, so one document holds the
    # state for many messages.  We first need to determine which are
    # different...
    keys = []
    rdkeys = []
    imap_flags = []
    folder_name = doc['rd_key'][1][1]
    for item in doc['infos']:
        msg_id = item['ENVELOPE'][-1]
        rdkey = get_rdkey_for_email(msg_id)
        rdkeys.append(rdkey)
        keys.append(['rd.core.content', 'key-schema_id', [rdkey, 'rd.msg.seen']])
        imap_flags.append((rdkey, item['FLAGS']))
    result = open_view(keys=keys, reduce=False, include_docs=True)
    # turn the result into a dict also keyed by rdkey
    couch_docs = {}
    for row in result['rows']:
        couch_docs[tuple(row['value']['rd_key'])] = row['doc']

    # work out which of these rdkeys actually exist in our db.
    existing_rdkeys = set()
    keys = []
    for rdkey in rdkeys:
        keys.append(['rd.core.content', 'key-schema_id', [rdkey, 'rd.msg.rfc822']])
    result = open_view(keys=keys, reduce=False)
    for row in result['rows']:
        existing_rdkeys.add(tuple(row['value']['rd_key']))

    # find what is different...
    nnew = 0
    to_up = []
    # Note it is fairly common to see multiples with the same msg ID in, eg,
    # a 'drafts' folder, so skip duplicates to avoid conflicts.
    seen_keys = set()
    for rdkey, flags in imap_flags:
        if rdkey in seen_keys:
            logger.info('skipping duplicate message in folder %r: %r',
                        folder_name, rdkey)
            continue
        if rdkey not in existing_rdkeys:
            logger.debug('skipping message not yet in folder %r: %r',
                         folder_name, rdkey)
            continue
        seen_keys.add(rdkey)
        seen_now = "\\Seen" in flags
        try:
            # avoid shadowing the input 'doc' while we look at the stored one.
            seen_doc = couch_docs[rdkey]
        except KeyError:
            # new message
            items = {'seen': seen_now,
                     'outgoing_state': 'incoming',
                     }
            emit_schema('rd.msg.seen', items, rdkey)
            nnew += 1
        else:
            # If the state in couch is anything other than 'incoming', it
            # represents a request to change the state on the server (or the
            # process of trying to update the server).
            if seen_doc.get('outgoing_state') != 'incoming':
                logger.info("found outgoing 'seen' state request in doc %(_id)r",
                            seen_doc)
                continue
            seen_couch = seen_doc['seen']
            if seen_now != seen_couch:
                seen_doc['seen'] = seen_now
                to_up.append(seen_doc)

    if to_up:
        update_documents(to_up)
    logger.info("folder %r needs %d new and %d updated 'seen' records",
                folder_name, nnew, len(to_up))
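
# update_documents() is not shown here.  Each doc in to_up came from an
# include_docs view, so it already carries its _id and _rev, and the assumed
# behaviour is a single CouchDB _bulk_docs POST.  A rough sketch using urllib2;
# the database URL is invented and the real helper presumably goes through
# raindrop's own db layer rather than raw HTTP.
import json
import urllib2

def update_documents(docs, db_url="http://127.0.0.1:5984/raindrop"):
    req = urllib2.Request(db_url + "/_bulk_docs",
                          data=json.dumps({"docs": docs}),
                          headers={"Content-Type": "application/json"})
    return json.load(urllib2.urlopen(req))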