Code example #1
0
def doc_from_bytes(docid, rdkey, b):
    """Parse raw rfc822 bytes into a raindrop document dict.

    Returns a dict with 'multipart', 'headers' (lower-cased keys, list
    values) and '_attachments' (every part, text or not).
    """
    msg = message_from_string(b)
    doc = {}
    is_multi = doc['multipart'] = msg.is_multipart()
    headers = doc['headers'] = {}
    # There is no opportunity to introduce an object which can ignore the
    # case of headers, so every header key is stored lower-cased.
    for header_name in msg.keys():
        values = msg.get_all(header_name)
        if not values:
            continue
        # first do any charset etc conversion...
        values = [_safe_convert_header(v) for v in values]
        key = header_name.lower()
        if key == 'references':
            # email.utils.unquote will do bad things to references headers
            # (stripping initial and trailing <>'s), so we don't want to use
            # it for the references header -- other fields seem ok.  The
            # references are split into a list here because why not.
            headers[key] = [extract_message_ids(values[0])]
        else:
            headers[key] = [unquote(v) for v in values]
        # a sanity check and to help debug an obscure bug which seemed to
        # cause the wrong 'source' doc being passed!
        if __debug__ and rdkey[0] == 'email' and key == 'message-id':
            from raindrop.proto.imap import get_rdkey_for_email
            assert tuple(rdkey) == get_rdkey_for_email(values[0]), (rdkey, docid, values)

    # XXX - technically msg objects are recursive; handling that requires
    # more thought.  For now, assume they are flat.
    # We must return non-text parts in attachments, so just return
    # *everything* in attachments.
    attachments = doc['_attachments'] = {}

    if not is_multi:
        attachments['body'] = attach_from_msg((docid, 'body'), msg)
        return doc

    # a multi-part message - flatten it here by walking the list, but only
    # looking at the 'leaf' nodes.  Attachments have lost their order; the
    # multipart_info list keeps it and is a convenient place to stash other
    # headers coming with each part.
    part_info = doc['multipart_info'] = []
    unnamed_count = 1
    for part in msg.walk():
        if part.is_multipart():
            continue
        name = sanitize_attach_name(part.get_filename())
        if not name:
            name = "subpart-%d" % unnamed_count
            unnamed_count += 1
        attachments[name] = attach_from_msg((docid, name), part)
        # Put together info about the attachment.
        part_headers = {}
        for hn, hv in part.items():
            part_headers[hn.lower()] = _safe_convert_header(hv)
        # content-type is redundant, but may be helpful...
        content_type = attachments[name]['content_type']
        part_info.append({'name': name, 'headers': part_headers,
                          'content_type': content_type})
    return doc
Code example #2
0
def handler(doc):
    """Find or create the conversation for this message and emit
    rd.msg.conversation schema items for every referenced message."""
    # a 'rfc822' stores 'headers' as a dict, with each entry being a list.
    # We only care about headers which rfc5322 says must appear 0 or 1
    # times, so flatten the header values here...
    headers = dict((k, v[0]) for (k, v) in doc['headers'].iteritems())
    self_header_message_id = headers.get('message-id')
    # check something hasn't got confused...
    assert get_rdkey_for_email(self_header_message_id) == tuple(doc['rd_key']), doc

    if 'references' in headers:
        # Copy the list: headers['references'] aliases a list living inside
        # the input doc, and we append to header_message_ids below --
        # appending to the alias would silently mutate the caller's doc.
        header_message_ids = list(headers['references'])
    elif 'in-reply-to' in headers:
        header_message_ids = [headers['in-reply-to']]
    else:
        header_message_ids = []
    # save off the list of referenced messages (XXX - but this isn't used?)
    references = header_message_ids[:]
    # see if the self-message already exists...
    header_message_ids.append(self_header_message_id)
    uniq_header_message_ids = set(header_message_ids)
    logger.debug("header_message_ids: %s ", header_message_ids)
    logger.debug("references: %s", '\n\t'.join(references))
    # Open a view trying to locate an existing conversation for any of these
    # headers.
    keys = [['rd.core.content', 'key-schema_id', [['email', mid], 'rd.msg.conversation']]
            for mid in uniq_header_message_ids]
    result = open_view(keys=keys, reduce=False,
                       include_docs=True)
    # build a map of the keys we actually got back.
    rows = [r for r in result['rows'] if 'error' not in r]
    if rows:
        assert 'doc' in rows[0], rows
        convo_id = rows[0]['doc']['conversation_id']
        logger.debug("FOUND CONVERSATION header_message_id %s with conversation_id %s",
                     self_header_message_id, convo_id)
        seen_ids = set(r['value']['rd_key'][1] for r in rows)
    else:
        logger.debug("CREATING conversation_id %s", header_message_ids[0])
        convo_id = header_message_ids[0]
        seen_ids = None

    items = {'conversation_id': convo_id}
    # create convo records for any messages which don't yet exist -
    # presumably that includes me too!
    for hid in uniq_header_message_ids:
        if seen_ids is None or hid not in seen_ids:
            rdkey = ['email', hid]
            logger.debug('emitting convo schema referenced message %r', rdkey)
            emit_schema('rd.msg.conversation', items, rd_key=rdkey)

    # make sure current doc gets emitted in case it was
    # not part of the uniq_header_message_ids
    if doc['rd_key'][1] not in uniq_header_message_ids:
        logger.debug('emitting convo schema for my document %(rd_key)r', doc)
        emit_schema('rd.msg.conversation', items, rd_key=doc['rd_key'])
Code example #3
0
File: __init__.py  Project: dineshkummarc/raindrop
 def gen_corpus_docs(self, corpus_name, item_spec="*"):
     """Build document objects for items in the named test corpus.

     item_spec is a glob pattern selecting which items in the corpus
     directory to process.  Each corpus item is identified by its base
     name (everything before the first dot); a .json file for a base
     overrides the auto-deduced schema for well-known extensions such
     as .rfc822.txt.
     """
     cwd = os.getcwd()
     corpus_dir = self.get_corpus_dir(corpus_name)
     num = 0
     # We try and make life simple for people by auto-determining the
     # 'schema' for some well-known file types (eg, .rfc822.txt)
     pattern = "%s/%s.*" % (corpus_dir, item_spec)
     base_names = set()
     for filename in glob.iglob(pattern):
         try:
             path, name = os.path.split(filename)
             # don't use splitext - we want the *first* dot.
             # NOTE(review): this splits the full *filename*, not 'name' -
             # a dot anywhere in the directory part yields a wrong base;
             # presumably 'name.split(".", 1)' was intended. TODO confirm.
             first, _ = filename.split(".", 1)
             base = os.path.join(path, first)
         except ValueError:
             # no dot at all - the whole filename is the base.
             base = filename
         base_names.add(base)
     for basename in base_names:
         if basename.endswith('README') or basename.endswith('raindrop'):
             continue
         # .json files get first go - they may 'override' what we would
         # otherwise deduce.
         elif os.path.exists(basename + ".json"):
             filename = basename + ".json"
             with open(filename) as f:
                 try:
                     ob = json.load(f)
                 except ValueError, why:
                     self.fail("%r has invalid json: %r" % (filename, why))
                 # inline any attachments referenced by the json, base64
                 # encoded with newlines stripped (couchdb-style inline
                 # attachment format).
                 for name, data in ob.get('_attachments', {}).iteritems():
                     fname = os.path.join(corpus_dir, data['filename'])
                     with open(fname, 'rb') as attach_f:
                         enc_data = base64.encodestring(attach_f.read()).replace('\n', '')
                         data['data'] = enc_data
         elif os.path.exists(basename + ".rfc822.txt"):
             # plain rfc822.txt file.
             with open(basename + ".rfc822.txt", 'rb') as f:
                 data = f.read()
             # the rd_key is derived from the message-id header of the
             # raw message itself.
             msg_id = message_from_string(data)['message-id']
             ob = {
                   'rd_schema_id': 'rd.msg.rfc822',
                   'rd_key': get_rdkey_for_email(msg_id),
                   'rd_ext_id': 'proto.imap',
                   'rd_source': None,
                   '_attachments' : {
                     'rfc822': {
                         'content_type': 'message',
                         'data': base64.encodestring(data).replace('\n', ''),
                     }
                   }
                 }
Code example #4
0
def handler(doc):
    """Collect every message-id this message references (including its own)
    and hand the corresponding rd_keys to find_and_emit_conversation."""
    # a 'rfc822' stores 'headers' as a dict with list values; the headers
    # we care about may appear at most once per rfc5322, so keep only the
    # first value of each.
    headers = dict((name, values[0])
                   for (name, values) in doc['headers'].iteritems())
    self_mid = headers.get('message-id')
    # check something hasn't got confused...
    assert get_rdkey_for_email(self_mid) == tuple(doc['rd_key']), doc

    references = set()
    # 'references' is a bit special though - the provider of the source
    # schema has already split the header into individual ids.
    references.update(headers.get('references', ()))
    if 'in-reply-to' in headers:
        references.add(headers['in-reply-to'])
    # the self-message...
    references.add(self_mid)
    logger.debug("references: %s", '\n\t'.join(references))

    find_and_emit_conversation(get_rdkey_for_email(mid) for mid in references)
Code example #5
0
File: __init__.py  Project: BigBlueHat/raindrop
 def rfc822_to_schema_item(self, fp):
     """Read one raw rfc822 message from *fp* and wrap it in a schema item.

     The rd_key is derived from the message's message-id header; the raw
     bytes are carried as the 'rfc822' attachment.
     """
     # the data is needed twice: once to extract the message-id, once as
     # the attachment payload.
     data = fp.read()
     msg_id = message_from_string(data)['message-id']
     attachments = {
         'rfc822': {
             'content_type': 'message',
             'data': data,
         },
     }
     return {
         'rd_schema_id': 'rd.msg.rfc822',
         'rd_key': get_rdkey_for_email(msg_id),
         'rd_ext_id': 'proto.imap',
         'rd_source': None,
         'attachments': attachments,
         'items': {},
     }
Code example #6
0
def failed_recipient(doc):
    """Record a DSN's intended recipient and link the bounce to the
    original message's conversation."""
    # failed_recipient is the email address(es) that was the intended target
    # which for whatever reason doesn't exist
    # we want to grab this because:
    #  1. We could look through the address book for similar identities
    #  2. In the UI we could offer a message "You tried to email X but the
    #     address appears incorrect"
    # XXX however currently we do nothing with it
    failed_recipient = doc['headers']['x-failed-recipients'][0]
    logger.debug("found DSN failed recipient message intended for %s", failed_recipient)

    body = open_schema_attachment(doc, 'body')
    # Scan Message Body for the original message id
    # e.g.  Message-ID: <*****@*****.**>
    # (raw string so regex escapes are explicit)
    match = re.search(r'Message-ID:\s*<(.+)>.*', body)
    if match is None:
        logger.info("No match found for DSN message")
        return
    # 'msg_id' rather than 'id' - don't shadow the builtin.
    msg_id = match.group(1)
    logger.debug("found Message-ID header in DSN message '%s'", msg_id)
    # make an rd_key for the referenced message.
    rdkey_orig = get_rdkey_for_email(msg_id)
    # and say this message and the original are related.
    find_and_emit_conversation([doc['rd_key'], rdkey_orig])
Code example #7
0
def handler(doc):
    """Diff imap folder flags against couch and emit rd.msg.seen changes.

    The input doc is an imap folder state cache doc: one document holding
    the envelope + flags for every message in one folder.  For each message
    that already exists in the db, emit an 'rd.msg.seen' schema when its
    server-side \\Seen flag differs from what couch last recorded.
    """
    # This is dealing with the 'imap folder state cache doc' - it stores
    # all meta-data about all items in a folder; so one document holds the
    # state for many messages.  We first need to determine which are
    # different...
    rdkeys = []
    imap_flags = []
    # rd_key shape assumed to be ['imap-mailbox', [account, folder_name]] -
    # TODO confirm against the emitter of this doc.
    folder_name = doc['rd_key'][1][1]

    for item in doc['infos']:
        # the last element of an imap ENVELOPE response is the message-id.
        msg_id = item['ENVELOPE'][-1]
        rdkey = get_rdkey_for_email(msg_id)
        rdkeys.append(rdkey)
        imap_flags.append((rdkey, item['FLAGS']))
    # fetch the current 'seen' state couch has for all these messages.
    result = open_view('raindrop!content!all', 'msg-seen-flag', keys=rdkeys)

    # turn the result into a dict keyed by rdkey
    couch_values = {}
    for row in result['rows']:
        couch_values[hashable_key(row['key'])] = row['value']

    # work out which of these rdkeys actually exist in our db.
    existing_rdkeys = set()
    keys = [['key-schema_id', [rdkey, 'rd.msg.rfc822']]
            for rdkey in rdkeys]
    result = open_view(keys=keys, reduce=False)
    for row in result['rows']:
        existing_rdkeys.add(hashable_key(row['value']['rd_key']))

    # find what is different...
    nnew = 0
    nupdated = 0
    # Note it is fairly common to see multiples with the same msg ID in, eg
    # a 'drafts' folder, so skip duplicates to avoid conflicts.
    seen_keys = set()
    for rdkey, flags in imap_flags:
        if rdkey in seen_keys:
            logger.info('skipping duplicate message in folder %r: %r',
                        folder_name, rdkey)
            continue
        if rdkey not in existing_rdkeys:
            # this means we haven't actually sucked the message into raindrop
            # yet (eg, --max-age may have caused only a subset of the messages
            # to be grabbed, although all messages in the folder are returned
            # in the input document)
            logger.debug('skipping message not yet in folder %r: %r',
                         folder_name, rdkey)
            continue
        seen_keys.add(rdkey)
        # what the imap server currently says.
        seen_now = "\\Seen" in flags
        try:
            couch_value = couch_values[rdkey]
        except KeyError:
            # new message
            items = {'seen' : seen_now,
                     'outgoing_state' : 'incoming',
                     }
            emit_schema('rd.msg.seen', items, rdkey)
            nnew += 1
        else:
            # If the state in couch is anything other than 'incoming', it
            # represents a request to change the state on the server (or the
            # process of trying to update the server).
            if couch_value.get('outgoing_state') != 'incoming':
                logger.info("found outgoing 'seen' state request in doc with key %r", rdkey)
                continue
            seen_couch = couch_value['seen']
            if seen_now != seen_couch:
                # server and couch disagree - record the server's state,
                # carrying _rev so the write replaces the existing record.
                items = {'seen' : seen_now,
                         'outgoing_state' : 'incoming',
                         '_rev' : couch_value['_rev'],
                         }
                emit_schema('rd.msg.seen', items, rdkey)
                nupdated += 1
    logger.info("folder %r needs %d new and %d updated 'seen' records",
                folder_name, nnew, nupdated)
Code example #8
0
def handler(doc):
    """Diff imap folder flags against couch 'rd.msg.seen' docs.

    Variant of the flag-sync handler that fetches the full couch docs and
    updates them in place (via update_documents) instead of emitting _rev
    carrying schema items.
    """
    # This is dealing with the 'imap folder state cache doc' - it stores
    # all meta-data about all items in a folder; so one document holds the
    # state for many messages.  We first need to determine which are
    # different...
    keys = []
    rdkeys = []
    imap_flags = []
    folder_name = doc['rd_key'][1][1]

    for item in doc['infos']:
        # the last element of an imap ENVELOPE response is the message-id.
        msg_id = item['ENVELOPE'][-1]
        rdkey = get_rdkey_for_email(msg_id)
        rdkeys.append(rdkey)
        keys.append(['rd.core.content', 'key-schema_id', [rdkey, 'rd.msg.seen']])
        imap_flags.append((rdkey, item['FLAGS']))
    result = open_view(keys=keys, reduce=False, include_docs=True)

    # turn the result into a dict also keyed by rdkey
    couch_docs = {}
    for row in result['rows']:
        couch_docs[tuple(row['value']['rd_key'])] = row['doc']

    # work out which of these rdkeys actually exist in our db.
    existing_rdkeys = set()
    keys = [['rd.core.content', 'key-schema_id', [rdkey, 'rd.msg.rfc822']]
            for rdkey in rdkeys]
    result = open_view(keys=keys, reduce=False)
    for row in result['rows']:
        existing_rdkeys.add(tuple(row['value']['rd_key']))

    # find what is different...
    nnew = 0
    to_up = []
    # Note it is fairly common to see multiples with the same msg ID in, eg
    # a 'drafts' folder, so skip duplicates to avoid conflicts.
    seen_keys = set()
    for rdkey, flags in imap_flags:
        if rdkey in seen_keys:
            logger.info('skipping duplicate message in folder %r: %r',
                        folder_name, rdkey)
            continue
        if rdkey not in existing_rdkeys:
            logger.debug('skipping message not yet in folder %r: %r',
                         folder_name, rdkey)
            continue
        seen_keys.add(rdkey)
        seen_now = "\\Seen" in flags
        try:
            # use a distinct name - the original rebound 'doc' here,
            # shadowing the function parameter inside the loop.
            seen_doc = couch_docs[rdkey]
        except KeyError:
            # new message
            items = {'seen' : seen_now,
                     'outgoing_state' : 'incoming',
                     }
            emit_schema('rd.msg.seen', items, rdkey)
            nnew += 1
        else:
            # If the state in couch is anything other than 'incoming', it
            # represents a request to change the state on the server (or the
            # process of trying to update the server).
            if seen_doc.get('outgoing_state') != 'incoming':
                logger.info("found outgoing 'seen' state request in doc %(_id)r", seen_doc)
                continue
            seen_couch = seen_doc['seen']
            if seen_now != seen_couch:
                seen_doc['seen'] = seen_now
                to_up.append(seen_doc)
    if to_up:
        update_documents(to_up)
    logger.info("folder %r needs %d new and %d updated 'seen' records",
                folder_name, nnew, len(to_up))