/
load.py
executable file
·86 lines (76 loc) · 2.57 KB
/
load.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
#!/usr/bin/env python
import os
import re
import rdflib
import rfc822
import mailbox
import datetime
# Namespace whose terms (Email, from, to, cc, sentDate, ...) are used as the
# classes and predicates of the triples generated below.
nmo = rdflib.Namespace("http://www.semanticdesktop.org/ontologies/nmo/#")
# Short aliases for the rdflib names used throughout this file.
rdf = rdflib.RDF
uri = rdflib.URIRef
def main():
    """Script entry point: delegate to generate_rdf()."""
    generate_rdf()
def generate_rdf():
    """Load every matching email into an RDF graph and dump it to disk.

    Opens (creating it if necessary) the "Sleepycat" store named "store",
    adds one nmo.Email resource per message yielded by range14_messages(),
    then serializes the graph to "emails.rdf" (rdflib's default format) and
    "emails.ttl" (Turtle) in the current directory.

    The store is closed even if loading or serializing fails, and both
    output files are closed as soon as they have been written.
    """
    g = rdflib.Graph("Sleepycat", identifier="emails")
    g.open("store", create=True)
    try:
        g.bind("nmo", str(nmo))

        for msg in range14_messages():
            u = uri(msg["url"])
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print(u)

            g.add((u, rdf.type, nmo.Email))

            # msg["from"] is a (realname, address) pair; the address is
            # None/empty when the From header is missing or unparseable.
            # Skip the triple instead of crashing on "mailto:" + None.
            sender = msg["from"][1]
            if sender:
                g.add((u, nmo["from"], uri("mailto:" + sender)))

            g.add((u, nmo.sentDate, rdflib.Literal(msg["date"])))
            g.add((u, nmo.messageSubject, rdflib.Literal(msg["subject"])))
            g.add((u, nmo.messageId, rdflib.Literal(msg["message_id"])))

            # A bare "mailto:" URI carries no information, so recipients
            # without an address are skipped.
            for _name, addr in msg["to"]:
                if addr:
                    g.add((u, nmo.to, uri("mailto:" + addr)))
            for _name, addr in msg["cc"]:
                if addr:
                    g.add((u, nmo.cc, uri("mailto:" + addr)))

            if msg["in_reply_to"]:
                # Turn "<id@host>" into a mid-style URI for the parent message.
                parent_id = msg["in_reply_to"].strip("<>")
                g.add((u, nmo.inReplyTo, uri("http://www.w3.org/mid/" + parent_id)))

        # rdflib serializers write encoded bytes to the stream they are
        # given, hence binary mode; `with` guarantees the files are closed.
        with open("emails.rdf", "wb") as rdf_out:
            g.serialize(rdf_out)
        with open("emails.ttl", "wb") as turtle_out:
            g.serialize(turtle_out, format="turtle")
    finally:
        g.close()
def get_message(msg):
    """Flatten an email message object into a plain dict.

    msg must support header lookup by name (msg['from'], msg.get(...)) and
    as_string(), as the messages produced by the mailbox module do.

    Returns a dict with the keys:
      "from"        -- (realname, address) tuple; (None, None) when the From
                       header is missing or yields no address
      "to", "cc"    -- lists of (realname, address) tuples; empty when the
                       header is absent
      "subject"     -- the Subject header value (None when absent)
      "url"         -- the Archived-At header, falling back to X-Archived-At,
                       with surrounding angle brackets removed
      "date"        -- naive datetime built from the Date header fields
                       (any timezone offset in the header is not applied)
      "message_id"  -- the Message-ID header value
      "in_reply_to" -- the In-Reply-To header value, or None
      "raw"         -- the whole message as a string

    Raises ValueError when the message has no archive URL or when its Date
    header is missing or cannot be parsed.
    """
    # rfc822 has been deprecated since Python 2.3 and no longer exists in
    # Python 3; email.utils offers the same parsers on both.
    from email.utils import getaddresses, parseaddr, parsedate

    def address_list(header):
        # Like rfc822.AddressList: a missing or empty header gives [].
        return getaddresses([header]) if header else []

    sender = parseaddr(msg['from'] or "")
    if sender == ("", ""):
        # Preserve rfc822.parseaddr's marker for "no address found".
        sender = (None, None)

    date_header = msg['date']
    date_fields = parsedate(date_header) if date_header else None
    if date_fields is None:
        raise ValueError(
            "missing or unparseable Date header: %r" % (date_header,))
    date = datetime.datetime(*date_fields[:6])

    url = msg['Archived-At'] or msg['X-Archived-At']
    if not url:
        raise ValueError("message has no Archived-At or X-Archived-At header")
    url = url.strip("<>")

    return {
        "from": sender,
        "subject": msg['subject'],
        "to": address_list(msg['to']),
        "cc": address_list(msg['cc']),
        "url": url,
        "date": date,
        "message_id": msg['Message-ID'],
        "in_reply_to": msg.get('In-Reply-To', None),
        "raw": msg.as_string(),
    }
def range14_messages(mbox_dir="mboxes"):
    """Yield parsed messages that mention httpRange-14.

    Walks mbox_dir recursively, opens every file found as an mbox, converts
    each message with get_message() and yields the resulting dict when its
    raw text matches "httpRange-?14" (case-insensitive).

    A message that cannot be converted is reported on stdout together with
    its 1-based position in the mbox file, then skipped; it never aborts the
    walk. A missing or empty mbox_dir simply yields nothing.
    """
    # Compile once instead of handing the pattern to re.search per message.
    pattern = re.compile("httpRange-?14", re.IGNORECASE)

    for root, _dirs, files in os.walk(mbox_dir):
        for filename in files:
            mbox_file = os.path.join(root, filename)
            for count, msg in enumerate(mailbox.mbox(mbox_file), 1):
                # Only the conversion and match are guarded; the yield stays
                # outside so the handler covers exactly what can fail here.
                try:
                    m = get_message(msg)
                    matched = pattern.search(m["raw"]) is not None
                except Exception as e:
                    # Best-effort loader: report the bad message and go on.
                    print("unable to add message #%s from %s: %s"
                          % (count, mbox_file, e))
                    continue
                if matched:
                    yield m
# Run the loader only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()