def _headers_to_infos(message):
    """ Return the headers of ``message`` as a dict whose keys are the
    header names with every dash removed (``Message-ID`` -> ``MessageID``).
    """
    infos = {}
    for header in message.keys():
        infos[header.replace("-", "")] = message[header]
    return infos


def _new_thread_id(message_id):
    """ Return the identifier used for a thread started by ``message_id``:
    the base32 encoding of its SHA-1 digest.
    """
    return b32encode(sha1(message_id).digest())


def _resolve_thread_id(list_name, infos):
    """ Return the thread identifier for the email described by ``infos``.

    The parent is the first entry of ``References`` or, failing that, the
    ``InReplyTo`` value.  When the parent is already stored and has a
    thread id, that id is reused; otherwise a new one is derived from the
    message id.

    ``infos`` is modified in place: a blank ``References`` entry is removed
    and, when only ``InReplyTo`` is available, it is moved to ``References``.
    """
    # A header that is present but blank carries no parent; treating it as
    # absent avoids an IndexError on ``split()[0]`` that used to make the
    # whole message be rejected.
    if "References" in infos and not infos["References"].split():
        del infos["References"]

    if "References" in infos:
        ref = infos["References"].split()[0]
    elif "InReplyTo" in infos:
        ref = infos["InReplyTo"]
        infos["References"] = infos.pop("InReplyTo")
    else:
        return _new_thread_id(infos["MessageID"])

    ref = ref.replace("<", "").replace(">", "")
    parent = store.get_email(list_name, ref)
    if parent and parent.thread_id:
        return parent.thread_id
    return _new_thread_id(infos["MessageID"])


def to_db(mbfile, list_name):
    """ Upload all the emails in a mbox file into the database using
    kittystore API.

    Messages without any header or without a Message-ID are reported and
    skipped.  Messages whose Message-ID is already known to the store are
    skipped silently.  Any error raised while converting or saving a
    message is printed, the session is rolled back and the import goes on
    with the next message.

    The module-level ``TOTALCNT`` counter is incremented for every message
    read from the file.

    :arg mbfile, a mailbox file from which the emails are extracted and
    upload to the database.
    :arg list_name, the fully qualified list name.
    """
    global TOTALCNT
    email = get_class_object(list_to_table_name(list_name), "email",
                             MetaData(engine), create=True)
    # Matches senders written as "user at example.com (Real Name)".
    sender_re = re.compile(r"(.*)\((.*)\)")

    for message in mailbox.mbox(mbfile):
        TOTALCNT += 1
        ## TODO: We need to catch-up Subjects/From which are of a specific
        ## encoding.
        infos = _headers_to_infos(message)

        ## There seem to be a problem to parse some messages
        if not infos:
            print(' Failed: %s keys: "%s"' % (mbfile, list(infos)))
            continue

        if "MessageID" not in infos:
            print(" Failed: No Message-ID for email:")
            print(" Content: %s %s %s" % (
                message["Subject"], message["Date"], message["From"]))
            continue
        infos["MessageID"] = infos["MessageID"].replace(
            "<", "").replace(">", "")

        if "From" in infos:
            match = sender_re.match(infos["From"])
            if match:
                email_add, name = match.groups()
                infos["From"] = name
                infos["Email"] = email_add.replace(" at ", "@").strip()
            # NOTE: when the sender does not match, no "Email" entry is set
            # here; unless the message has its own "Email" header, the
            # lookup below raises KeyError and the message is reported by
            # the error handler.

        try:
            if store.get_email(list_name, infos["MessageID"]):
                # Already in the database.
                continue

            infos["Date"] = convert_date(infos["Date"])
            infos["Content"] = message.get_payload()
            infos["ThreadID"] = _resolve_thread_id(list_name, infos)

            # Category and LegacyID are computed but not handed to the
            # model below.
            subject = infos["Subject"].lower()
            if "agenda" in subject or "reminder" in subject:
                infos["Category"] = "Agenda"
            else:
                infos["Category"] = "Question"
            infos["Full"] = message.as_string()
            ## TODO: I'm not sure the TOTALCNT approach is the right one
            ## we should discuss this with the pipermail guys
            infos["LegacyID"] = TOTALCNT
            infos.setdefault("References", None)

            mail = email(
                sender=infos["From"],
                email=infos["Email"],
                subject=infos["Subject"],
                content=infos["Content"],
                date=infos["Date"],
                message_id=infos["MessageID"],
                stable_url_id=infos["MessageID"],
                thread_id=infos["ThreadID"],
                references=infos["References"],
                full=infos["Full"],
            )
            mail.save(session)
            session.commit()
        except Exception as err:
            print(' Error: "%s"' % err)
            print("File: %s Content: %s %s %s" % (
                mbfile, message["Subject"], message["Date"],
                message["From"]))
            # Discard whatever the failed message left pending so that the
            # following messages start from a usable session.
            session.rollback()
def get_table_size(list_name):
    """ Print the number of emails stored in the database for a list.

    Nothing is returned; the count is written to standard output.

    :arg list_name, the name of the list, turned into a table name with
    list_to_table_name.
    """
    table_name = list_to_table_name(list_name)
    email = get_class_object(table_name, "email", MetaData(engine))
    stored = session.query(email).count()
    print(" %s emails are stored into the database" % stored)