def search_content_subject_or_cs(self, list_name, keyword, limit=None, offset=None):
    """ Returns a list of emails containing the specified keyword in
    their content or their subject (SQL ``LIKE`` match on both columns).

    The query is ordered by ascending date, optionally windowed with
    ``offset``/``limit``, and the fetched rows are then reversed, so the
    most recent email of the fetched window comes first.

    :arg list_name, name of the mailing list in which the emails should
    be searched.
    :arg keyword, keyword to search in the content or subject of the
    emails.
    :arg limit, optional maximum number of rows to fetch. When None the
    whole result set is returned and ``offset`` is ignored.
    :arg offset, optional number of rows to skip; only applied together
    with ``limit``.
    :return, a list of unique email objects, most recent first.
    """
    email = get_class_object(list_to_table_name(list_name), 'email', self.metadata)
    pattern = '%{0}%'.format(keyword)
    mails = self.session.query(email).filter(or_(
        email.content.like(pattern),
        email.subject.like(pattern)
    )).order_by(email.date)
    if limit is not None:
        # The window is taken on the ascending-date ordering, i.e. before
        # the reversal below.
        mails = mails.offset(offset).limit(limit)
    mails = mails.all()
    mails.reverse()
    # De-duplicate while keeping the date ordering: going through a bare
    # set() (as was done before) returns the emails in arbitrary order.
    seen = set()
    unique_mails = []
    for mail in mails:
        if mail not in seen:
            seen.add(mail)
            unique_mails.append(mail)
    return unique_mails
def get_list_size(self, list_name):
    """ Count the emails stored for a given mailing list.

    :arg list_name, name of the mailing list whose emails are counted.
    :return, the number of rows found for that list's email class.
    """
    table_name = list_to_table_name(list_name)
    email = get_class_object(table_name, 'email', self.metadata)
    all_mails = self.session.query(email)
    return all_mails.count()
def search_subject_index(self, list_name, keyword, limit=None, offset=300):
    """ Returns the emails whose subject matches the keyword, using a
    ``to_tsvector``/``to_tsquery`` full-text criterion instead of LIKE.

    ':*' is appended to the keyword before it is bound to the query.
    Results are ordered by ascending date.

    NOTE(review): ``offset`` defaults to 300 here while the other search
    methods of this class default it to None; with only ``limit`` given
    the first 300 rows are skipped -- confirm this is intended.

    :arg list_name, name of the mailing list in which the emails should
    be searched.
    :arg keyword, keyword to search in the subject of the emails.
    :arg limit, optional maximum number of rows to return. When None the
    whole result set is returned and ``offset`` is ignored.
    :arg offset, number of rows to skip; only applied together with
    ``limit``.
    :return, a list of email objects, oldest first.
    """
    table_name = list_to_table_name(list_name)
    email = get_class_object(table_name, 'email', self.metadata)
    criterion = "to_tsvector('english', subject) @@ to_tsquery(:keyword)"
    ts_keyword = '%s:*' % keyword
    query = self.session.query(email).filter(criterion)
    query = query.params(keyword=ts_keyword).order_by(email.date)
    if limit is None:
        return query.all()
    # imply that the result set is that big
    return query.offset(offset).limit(limit).all()
def get_thread_participants(self, list_name, thread_id):
    """ Return the distinct senders who posted in a thread. The thread
    is uniquely identified by its thread_id.

    :arg list_name, name of the mailing list in which the thread should
    be searched.
    :arg thread_id, unique identifier of the thread as specified in the
    database.
    :return, the query result rows, one per distinct ``sender`` value.
    """
    table_name = list_to_table_name(list_name)
    email = get_class_object(table_name, 'email', self.metadata)
    senders = self.session.query(distinct(email.sender))
    senders = senders.filter(email.thread_id == thread_id)
    return senders.all()
def get_thread_length(self, list_name, thread_id):
    """ Return the number of emails present in a thread. The thread is
    uniquely identified by its thread_id.

    :arg list_name, name of the mailing list in which the thread should
    be searched.
    :arg thread_id, unique identifier of the thread as specified in the
    database.
    :return, the count of emails carrying that thread_id.
    """
    table_name = list_to_table_name(list_name)
    email = get_class_object(table_name, 'email', self.metadata)
    in_thread = self.session.query(email).filter(
        email.thread_id == thread_id)
    return in_thread.count()
def search_subject_cs(self, list_name, keyword):
    """ Returns a list of emails containing the specified keyword in
    their subject (SQL ``LIKE`` match), most recent first.

    :arg list_name, name of the mailing list in which the emails should
    be searched.
    :arg keyword, keyword to search in the subject of the emails.
    :return, a list of email objects ordered from newest to oldest.
    """
    table_name = list_to_table_name(list_name)
    email = get_class_object(table_name, 'email', self.metadata)
    pattern = '%{0}%'.format(keyword)
    query = self.session.query(email).filter(email.subject.like(pattern))
    oldest_first = query.order_by(email.date).all()
    # The rows are fetched oldest first, then flipped for the caller.
    return oldest_first[::-1]
def search_sender_or_cs(self, list_name, keyword):
    """ Returns a list of emails containing the specified keyword in
    the name or email address of the sender of the email (SQL ``LIKE``
    match on both columns), most recent first.

    :arg list_name, name of the mailing list in which the emails should
    be searched.
    :arg keyword, keyword to search in the sender name or address.
    :return, a list of unique email objects ordered from newest to
    oldest.
    """
    email = get_class_object(list_to_table_name(list_name), 'email', self.metadata)
    pattern = '%{0}%'.format(keyword)
    mails = self.session.query(email).filter(or_(
        email.sender.like(pattern),
        email.email.like(pattern)
    )).order_by(email.date).all()
    mails.reverse()
    # De-duplicate while keeping the date ordering: going through a bare
    # set() (as was done before) returns the emails in arbitrary order.
    seen = set()
    unique_mails = []
    for mail in mails:
        if mail not in seen:
            seen.add(mail)
            unique_mails.append(mail)
    return unique_mails
def get_thread(self, list_name, thread_id):
    """ Return all the emails present in a thread, ordered by ascending
    date. The thread is uniquely identified by its thread_id.

    :arg list_name, name of the mailing list in which the thread should
    be searched.
    :arg thread_id, thread_id as used in the web-pages. Used here to
    uniquely identify the thread in the database.
    :return, a list of email objects; the list is empty when no email
    carries this thread_id.
    """
    email = get_class_object(list_to_table_name(list_name), 'email', self.metadata)
    # Query.all() returns an empty list when nothing matches and never
    # raises NoResultFound, so the former try/except (and its None
    # fallback) could not be reached and has been dropped.
    return self.session.query(email).filter_by(
        thread_id=thread_id).order_by(email.date).all()
def get_email(self, list_name, message_id):
    """ Return the Email object stored in the database for the given
    Message-ID, or None when there is no such email.

    :arg list_name, name of the mailing list in which the email should
    be searched.
    :arg message_id, Message-ID as found in the headers of the email.
    Used here to uniquely identify the email present in the database.
    :return, the matching email object, or None if NoResultFound is
    raised by the lookup. Any other exception raised by ``one()``
    propagates to the caller.
    """
    table_name = list_to_table_name(list_name)
    email = get_class_object(table_name, 'email', self.metadata)
    lookup = self.session.query(email).filter_by(message_id=message_id)
    try:
        return lookup.one()
    except NoResultFound:
        return None
def get_archives(self, list_name, start, end):
    """ Return the emails that started a thread between two given dates
    (both bounds included), most recent first.

    :arg list_name, name of the mailing list in which the emails should
    be searched.
    :arg start, a datetime object representing the starting date of the
    interval to query.
    :arg end, a datetime object representing the ending date of the
    interval to query.
    :return, a list of email objects ordered from newest to oldest.
    """
    table_name = list_to_table_name(list_name)
    email = get_class_object(table_name, 'email', self.metadata)
    # Beginning of thread == No 'References' header.
    # The comparison must stay '== None': it builds a SQL expression,
    # it is not a Python identity test.
    thread_starters = and_(
        email.date >= start,
        email.date <= end,
        email.references == None)
    query = self.session.query(email).filter(thread_starters)
    oldest_first = query.order_by(email.date).all()
    return oldest_first[::-1]
def get_archives_length(self, list_name):
    """ Return a dictionary of years, months for which there are
    potentially archives available for a given list (based on the
    oldest post on the list).

    Keys are years; each value is the list of month numbers (1-12)
    running from the month of the oldest post up to the current month.

    :arg list_name, name of the mailing list for which the archive
    periods are computed.
    :return, a dict mapping year -> list of month numbers. The dict is
    empty when the list has no email at all.
    """
    archives = {}
    email = get_class_object(list_to_table_name(list_name), 'email', self.metadata)
    # first() yields None on an empty table, where indexing the result
    # of all() would raise IndexError.
    oldest = self.session.query(email).order_by(email.date).first()
    if oldest is None:
        return archives
    now = datetime.datetime.now()
    year = oldest.date.year
    month = oldest.date.month
    while year < now.year:
        archives[year] = list(range(month, 13))
        year = year + 1
        # Every year after the first one starts in January.
        month = 1
    # When the oldest post belongs to the current year, the months that
    # precede it hold no archive and must not be listed.
    first_month = month if year == now.year else 1
    archives[now.year] = list(range(first_month, now.month + 1))
    return archives
def search_content_subject_index(self, list_name, keyword, limit=None, offset=None):
    """ Returns a list of emails matching the specified keyword in their
    content or their subject, using a ``to_tsvector``/``to_tsquery``
    full-text criterion over the concatenation of both columns.

    ':*' is appended to the keyword before it is bound to the query.
    No ordering is applied to the result.

    :arg list_name, name of the mailing list in which the emails should
    be searched.
    :arg keyword, keyword to search in the content or subject of the
    emails.
    :arg limit, optional maximum number of rows to return. When None the
    whole result set is returned and ``offset`` is ignored.
    :arg offset, optional number of rows to skip; only applied together
    with ``limit``.
    :return, a list of email objects.
    """
    table_name = list_to_table_name(list_name)
    email = get_class_object(table_name, 'email', self.metadata)
    criterion = ("to_tsvector('english', (content || ' ') || subject) "
                 "@@ to_tsquery(:keyword)")
    ts_keyword = '%s:*' % keyword
    query = self.session.query(email).filter(criterion)
    query = query.params(keyword=ts_keyword)
    # Ordering by date is currently disabled:
    # query = query.order_by(email.date)
    if limit is None:
        return query.all()
    # imply that the result set is that big
    return query.offset(offset).limit(limit).all()
def search_content_subject(self, list_name, keyword, limit=None, offset=None):
    """ Returns a list of emails containing the specified keyword in
    their content or their subject (``ILIKE`` match).

    Two queries are run, one on the content and one on the subject, and
    their results are concatenated then reversed. An email matching on
    both fields therefore appears twice in the returned list.

    :arg list_name, name of the mailing list in which the emails should
    be searched.
    :arg keyword, keyword to search in the content or subject of the
    emails.
    :arg limit, must be None; pagination is not implemented and any
    other value raises NotImplementedError.
    :arg offset, accepted but not used by this method.
    :return, a list of email objects: subject matches (newest first)
    followed by content matches (newest first).
    """
    if limit is not None:
        # not implemented, skip the result
        raise NotImplementedError
    table_name = list_to_table_name(list_name)
    email = get_class_object(table_name, 'email', self.metadata)
    pattern = '%{0}%'.format(keyword)
    by_content = self.session.query(email).filter(
        email.content.ilike(pattern)).order_by(email.date).all()
    by_subject = self.session.query(email).filter(
        email.subject.ilike(pattern)).order_by(email.date).all()
    combined = by_content + by_subject
    combined.reverse()
    # De-duplication is disabled here: return list(set(mails))
    return combined
def to_db(mbfile, list_name):
    """ Upload all the emails in a mbox file into the database using
    kittystore API.

    For every message of the mbox file:

    - the module-level TOTALCNT counter is incremented;
    - the headers are copied into a dict whose keys are the header
      names with the dashes removed ('Message-ID' -> 'MessageID');
    - messages without any header, or without a Message-ID, are
      reported on stdout and skipped;
    - messages whose Message-ID is already known to ``store`` are
      skipped;
    - the thread id is inherited from the referenced email when that
      email is found with a thread_id, otherwise it is the base32
      encoded SHA-1 digest of the Message-ID;
    - the email is saved and the session committed.

    Any exception raised while handling a message is printed and the
    import goes on with the next message.

    :arg mbfile, a mailbox file from which the emails are extracted and
    upload to the database.
    :arg list_name, the fully qualified list name.
    """
    global TOTALCNT
    # NOTE(review): cnt and cnt_read are updated below but never read in
    # this function.
    cnt = 0
    cnt_read = 0
    email = get_class_object(list_to_table_name(list_name), "email", MetaData(engine), create=True)
    for message in mailbox.mbox(mbfile):
        cnt_read = cnt_read + 1
        # print cnt_read
        TOTALCNT = TOTALCNT + 1
        infos = {}
        ## TODO: We need to catch-up Subjects/From which are of a specific
        ## encoding.
        # Header names are stored without their dashes.
        for it in message.keys():
            it2 = it.replace("-", "")
            infos[it2] = message[it]
        keys = infos.keys()
        ## There seem to be a problem to parse some messages
        if not keys:
            print ' Failed: %s keys: "%s"' % (mbfile, keys)
            # print message
            continue
        if "MessageID" in infos:
            # Strip the angle brackets around the Message-ID.
            infos["MessageID"] = infos["MessageID"].replace("<", "").replace(">", "")
        if "From" in infos:
            # Split a 'address (name)' From header into name and address.
            # When the header does not have this form, infos["Email"] is
            # never set and building the email below raises KeyError,
            # which the except clause reports.
            regex = "(.*)\((.*)\)"
            match = re.match(regex, infos["From"])
            if match:
                email_add, name = match.groups()
                infos["From"] = name
                email_add = email_add.replace(" at ", "@")
                infos["Email"] = email_add.strip()
        try:
            if not "MessageID" in infos:
                print " Failed: No Message-ID for email:"
                print " Content:", message["Subject"], message["Date"], message["From"]
                continue
            # Only store emails that are not already in the database.
            if not store.get_email(list_name, infos["MessageID"]):
                infos["Date"] = convert_date(infos["Date"])
                infos["Content"] = message.get_payload()
                # NOTE(review): thread_id is assigned but never read.
                thread_id = 0
                if not "References" in infos and not "InReplyTo" in infos:
                    # No parent header: this email starts a new thread.
                    infos["ThreadID"] = b32encode(sha1(infos["MessageID"]).digest())
                else:
                    ref = None
                    if "References" in infos:
                        # Use the first id listed in References.
                        ref = infos["References"].split()[0].strip()
                    else:
                        # Fall back on In-Reply-To and move it under
                        # the References key.
                        ref = infos["InReplyTo"]
                        infos["References"] = infos["InReplyTo"]
                        del (infos["InReplyTo"])
                    ref = ref.replace("<", "").replace(">", "")
                    res = store.get_email(list_name, ref)
                    if res and res.thread_id:
                        infos["ThreadID"] = res.thread_id
                    else:
                        # Referenced email unknown (or without thread):
                        # start a new thread from this email.
                        infos["ThreadID"] = b32encode(sha1(infos["MessageID"]).digest())
                # NOTE(review): Category and LegacyID are computed here
                # but are not passed to email(...) below.
                infos["Category"] = "Question"
                if "agenda" in infos["Subject"].lower():
                    infos["Category"] = "Agenda"
                if "reminder" in infos["Subject"].lower():
                    infos["Category"] = "Agenda"
                infos["Full"] = message.as_string()
                ## TODO: I'm not sure the TOTALCNT approach is the right one
                ## we should discuss this with the pipermail guys
                infos["LegacyID"] = TOTALCNT
                if not "References" in infos:
                    infos["References"] = None
                # print infos.keys()
                mail = email(
                    sender=infos["From"],
                    email=infos["Email"],
                    subject=infos["Subject"],
                    content=infos["Content"],
                    date=infos["Date"],
                    message_id=infos["MessageID"],
                    stable_url_id=infos["MessageID"],
                    thread_id=infos["ThreadID"],
                    references=infos["References"],
                    full=infos["Full"],
                )
                mail.save(session)
                cnt = cnt + 1
                session.commit()
        except Exception, err:
            # Best effort: report the failing message and keep going.
            print ' Error: "%s"' % err
            print "File:", mbfile, "Content:", message["Subject"], message["Date"], message["From"]
            pass
def get_table_size(list_name):
    """ Print the number of emails stored in the database for a list.

    Nothing is returned; the count is only written to stdout.

    :arg list_name, name of the mailing list whose emails are counted.
    """
    table_name = list_to_table_name(list_name)
    email = get_class_object(table_name, "email", MetaData(engine))
    stored = session.query(email).count()
    print(" %s emails are stored into the database" % stored)