def search_content_subject_or_cs(self, list_name, keyword, limit=None,
                                     offset=None):
        """ Returns a list of email containing the specified keyword in
        their content or their subject, matched with SQL LIKE.

        The emails fetched from the database are returned most recent
        first and each email appears only once.

        :arg list_name, name of the mailing list in which this email
        should be searched.
        :arg keyword, keyword to search in the content or subject of
        the emails.
        :arg limit, optional maximum number of emails to fetch. When it
        is None every matching email is returned and offset is ignored.
        :arg offset, optional number of matching emails to skip, counted
        on the ascending date order used by the query. Only used when
        limit is given.
        """
        email = get_class_object(list_to_table_name(list_name), 'email',
            self.metadata)
        pattern = '%{0}%'.format(keyword)
        query = self.session.query(email).filter(or_(
                email.content.like(pattern),
                email.subject.like(pattern)
                )).order_by(email.date)

        if limit is not None:
            # imply that the result set is that big
            query = query.offset(offset).limit(limit)

        # The query sorts by ascending date: walk it backwards so that the
        # newest email comes first. Duplicates are filtered by hand because
        # going through set() would throw that ordering away.
        seen = set()
        unique_mails = []
        for mail in reversed(query.all()):
            if mail not in seen:
                seen.add(mail)
                unique_mails.append(mail)
        return unique_mails
    def get_list_size(self, list_name):
        """ Count the emails stored for a given mailing list.

        :arg list_name, name of the mailing list whose emails should be
        counted.
        """
        table_name = list_to_table_name(list_name)
        email = get_class_object(table_name, 'email', self.metadata)
        query = self.session.query(email)
        return query.count()
    def search_subject_index(self, list_name, keyword, limit=None, offset=300):
        """ Returns a list of email whose subject matches the specified
        keyword, using a full-text criterion (to_tsvector / to_tsquery)
        instead of LIKE. The emails are ordered by ascending date.

        :arg list_name, name of the mailing list in which this email
        should be searched.
        :arg keyword, keyword to search in the subject of the emails;
        ':*' is appended to it before it is handed to to_tsquery.
        :arg limit, optional maximum number of emails to return. When it
        is None every match is returned and offset is ignored.
        :arg offset, number of matches to skip; only used when limit is
        given.
        NOTE(review): offset defaults to 300 here whereas
        search_content_subject_index defaults to None -- confirm that
        skipping the first 300 matches by default is intended.
        """
        table_name = list_to_table_name(list_name)
        email = get_class_object(table_name, 'email', self.metadata)
        criterion = "to_tsvector('english', subject) @@ to_tsquery(:keyword)"
        ts_keyword = '%s:*' % keyword

        query = self.session.query(email).filter(criterion)
        query = query.params(keyword=ts_keyword).order_by(email.date)
        if limit is None:
            return query.all()
        # imply that the result set is that big
        return query.offset(offset).limit(limit).all()
    def get_thread_participants(self, list_name, thread_id):
        """ Return the distinct senders of the emails of a thread. This
        thread is uniquely identified by its thread_id.

        :arg list_name, name of the mailing list in which the thread
        should be searched.
        :arg thread_id, unique identifier of the thread as specified in
        the database.
        """
        table_name = list_to_table_name(list_name)
        email = get_class_object(table_name, 'email', self.metadata)
        senders = self.session.query(distinct(email.sender))
        senders = senders.filter(email.thread_id == thread_id)
        return senders.all()
    def get_thread_length(self, list_name, thread_id):
        """ Count the emails belonging to a thread. This thread is
        uniquely identified by its thread_id.

        :arg list_name, name of the mailing list in which the thread
        should be searched.
        :arg thread_id, unique identifier of the thread as specified in
        the database.
        """
        table_name = list_to_table_name(list_name)
        email = get_class_object(table_name, 'email', self.metadata)
        in_thread = self.session.query(email).filter_by(thread_id=thread_id)
        return in_thread.count()
    def search_subject_cs(self, list_name, keyword):
        """ Returns the list of emails whose subject contains the
        specified keyword (matched with SQL LIKE), most recent first.

        :arg list_name, name of the mailing list in which this email
        should be searched.
        :arg keyword, keyword to search in the subject of the emails.
        """
        table_name = list_to_table_name(list_name)
        email = get_class_object(table_name, 'email', self.metadata)
        query = self.session.query(email)
        query = query.filter(email.subject.like('%{0}%'.format(keyword)))
        # The database sorts by ascending date; flip the list afterwards
        # so that the newest email comes first.
        found = query.order_by(email.date).all()
        found.reverse()
        return found
    def search_sender_or_cs(self, list_name, keyword):
        """ Returns a list of email containing the specified keyword in
        the name or email address of the sender of the email, matched
        with SQL LIKE.

        The emails are returned most recent first and each email appears
        only once.

        :arg list_name, name of the mailing list in which this email
        should be searched.
        :arg keyword, keyword to search in the database.
        """
        email = get_class_object(list_to_table_name(list_name), 'email',
            self.metadata)
        pattern = '%{0}%'.format(keyword)
        mails = self.session.query(email).filter(or_(
                email.sender.like(pattern),
                email.email.like(pattern)
                )).order_by(email.date).all()

        # The query sorts by ascending date: walk it backwards so that the
        # newest email comes first. Duplicates are filtered by hand because
        # going through set() would throw that ordering away.
        seen = set()
        unique_mails = []
        for mail in reversed(mails):
            if mail not in seen:
                seen.add(mail)
                unique_mails.append(mail)
        return unique_mails
    def get_thread(self, list_name, thread_id):
        """ Return all the emails present in a thread, ordered by
        ascending date. This thread is uniquely identified by its
        thread_id.

        The result is an empty list when no email carries this thread_id.

        :arg list_name, name of the mailing list in which this email
        should be searched.
        :arg thread_id, thread_id as used in the web-pages.
        Used here to uniquely identify the thread in the database.
        """
        email = get_class_object(list_to_table_name(list_name), 'email',
            self.metadata)
        # Query.all() hands back an empty list when nothing matches, it never
        # raises NoResultFound, so there is no exception to guard against.
        return self.session.query(email).filter_by(
            thread_id=thread_id).order_by(email.date).all()
    def get_email(self, list_name, message_id):
        """ Look up the Email object stored in the database for the
        Message-ID provided, returning None when there is no such email.

        Only NoResultFound is handled here; any other error raised by the
        query is left to the caller.

        :arg list_name, name of the mailing list in which this email
        should be searched.
        :arg message_id, Message-ID as found in the headers of the email.
        Used here to uniquely identify the email present in the database.
        """
        table_name = list_to_table_name(list_name)
        email = get_class_object(table_name, 'email', self.metadata)
        query = self.session.query(email).filter_by(message_id=message_id)
        try:
            return query.one()
        except NoResultFound:
            # No email stored under this Message-ID.
            return None
 def get_archives(self, list_name, start, end):
     """ Return the emails starting a thread that were sent between two
     given dates (both bounds included), most recent first.

     :arg list_name, name of the mailing list in which this email
     should be searched.
     :arg start, a datetime object representing the starting date of
     the interval to query.
     :arg end, a datetime object representing the ending date of
     the interval to query.
     """
     table_name = list_to_table_name(list_name)
     email = get_class_object(table_name, 'email', self.metadata)
     # Beginning of thread == No 'References' header
     thread_starts = and_(
         email.date >= start,
         email.date <= end,
         email.references == None)
     query = self.session.query(email).filter(thread_starts)
     # Sorted by ascending date in the database, flipped afterwards so
     # that the newest thread comes first.
     threads = query.order_by(email.date).all()
     threads.reverse()
     return threads
    def get_archives_length(self, list_name):
        """ Return a dictionary of years, months for which there are
        potentially archives available for a given list (based on the
        oldest post on the list).

        Each key is a year and each value the list of month numbers
        (1-12) of that year, starting at the month of the oldest post
        and stopping at the current month. The dictionary is empty when
        the list has no email at all.

        :arg list_name, name of the mailing list in which this email
        should be searched.
        """
        email = get_class_object(list_to_table_name(list_name), 'email',
            self.metadata)
        oldest = self.session.query(email).order_by(
                    email.date).limit(1).all()
        if not oldest:
            # Nothing stored for this list yet: no archive to advertise.
            return {}
        entry = oldest[0]

        now = datetime.datetime.now()
        months = list(range(1, 13))
        archives = {}
        year = entry.date.year
        month = entry.date.month
        while year < now.year:
            archives[year] = months[(month - 1):]
            year = year + 1
            # Only the year of the oldest post starts part-way through.
            month = 1

        # When the oldest post was sent during the current year, the months
        # before it have no archive either.
        first = 0
        if year == now.year and month <= now.month:
            first = month - 1
        archives[now.year] = months[first:now.month]
        return archives
    def search_content_subject_index(self, list_name, keyword, limit=None,
                                     offset=None):
        """ Returns a list of email whose content or subject matches the
        specified keyword, using a full-text criterion (to_tsvector /
        to_tsquery) instead of LIKE.

        No ordering is requested from the database for this search.

        :arg list_name, name of the mailing list in which this email
        should be searched.
        :arg keyword, keyword to search in the content or subject of
        the emails; ':*' is appended to it before it is handed to
        to_tsquery.
        :arg limit, optional maximum number of emails to return. When it
        is None every match is returned and offset is ignored.
        :arg offset, optional number of matches to skip; only used when
        limit is given.
        """
        criterion = ("to_tsvector('english', (content || ' ') || subject) "
                     "@@ to_tsquery(:keyword)")
        ts_keyword = '%s:*' % keyword
        table_name = list_to_table_name(list_name)
        email = get_class_object(table_name, 'email', self.metadata)

        query = self.session.query(email).filter(criterion)
        query = query.params(keyword=ts_keyword)
        if limit is None:
            return query.all()
        # imply that the result set is that big
        return query.offset(offset).limit(limit).all()
    def search_content_subject(self, list_name, keyword, limit=None,
                               offset=None):
        """ Returns a list of email containing the specified keyword in
        their content or their subject, matched with ILIKE.

        The emails matching on their subject come first, followed by the
        ones matching on their content, each group most recent first. An
        email matching on both is only returned once.

        :arg list_name, name of the mailing list in which this email
        should be searched.
        :arg keyword, keyword to search in the content or subject of
        the emails.
        :arg limit, must be None: limiting the result set is not
        implemented and raises NotImplementedError.
        :arg offset, not used.
        """
        if limit is not None:
            # not implemented, skip the result
            raise NotImplementedError
        email = get_class_object(list_to_table_name(list_name), 'email',
            self.metadata)
        pattern = '%{0}%'.format(keyword)
        mails = self.session.query(email).filter(
                email.content.ilike(pattern)
                ).order_by(email.date).all()
        mails.extend(self.session.query(email).filter(
                email.subject.ilike(pattern)
                ).order_by(email.date).all())

        # Both queries sort by ascending date: walk the combined list
        # backwards and keep the first occurrence of each email, so that an
        # email found by both queries is not returned twice. A set() alone
        # would lose the ordering.
        seen = set()
        unique_mails = []
        for mail in reversed(mails):
            if mail not in seen:
                seen.add(mail)
                unique_mails.append(mail)
        return unique_mails
Exemple #14
0
def to_db(mbfile, list_name):
    """ Upload all the emails in a mbox file into the database using
    kittystore API.

    :arg mbfile, a mailbox file from which the emails are extracted and
    upload to the database.
    :arg list_name, the fully qualified list name.
    """
    global TOTALCNT
    cnt = 0
    cnt_read = 0
    email = get_class_object(list_to_table_name(list_name), "email", MetaData(engine), create=True)
    for message in mailbox.mbox(mbfile):
        cnt_read = cnt_read + 1
        # print cnt_read
        TOTALCNT = TOTALCNT + 1
        infos = {}
        ## TODO: We need to catch-up Subjects/From which are of a specific
        ## encoding.
        for it in message.keys():
            it2 = it.replace("-", "")
            infos[it2] = message[it]
        keys = infos.keys()
        ## There seem to be a problem to parse some messages
        if not keys:
            print '  Failed: %s keys: "%s"' % (mbfile, keys)
            # print message
            continue
        if "MessageID" in infos:
            infos["MessageID"] = infos["MessageID"].replace("<", "").replace(">", "")
        if "From" in infos:
            regex = "(.*)\((.*)\)"
            match = re.match(regex, infos["From"])
            if match:
                email_add, name = match.groups()
                infos["From"] = name
                email_add = email_add.replace(" at ", "@")
                infos["Email"] = email_add.strip()
        try:
            if not "MessageID" in infos:
                print "  Failed: No Message-ID for email:"
                print "   Content:", message["Subject"], message["Date"], message["From"]
                continue
            if not store.get_email(list_name, infos["MessageID"]):
                infos["Date"] = convert_date(infos["Date"])
                infos["Content"] = message.get_payload()
                thread_id = 0
                if not "References" in infos and not "InReplyTo" in infos:
                    infos["ThreadID"] = b32encode(sha1(infos["MessageID"]).digest())
                else:
                    ref = None
                    if "References" in infos:
                        ref = infos["References"].split()[0].strip()
                    else:
                        ref = infos["InReplyTo"]
                        infos["References"] = infos["InReplyTo"]
                        del (infos["InReplyTo"])
                    ref = ref.replace("<", "").replace(">", "")
                    res = store.get_email(list_name, ref)
                    if res and res.thread_id:
                        infos["ThreadID"] = res.thread_id
                    else:
                        infos["ThreadID"] = b32encode(sha1(infos["MessageID"]).digest())
                infos["Category"] = "Question"
                if "agenda" in infos["Subject"].lower():
                    infos["Category"] = "Agenda"
                if "reminder" in infos["Subject"].lower():
                    infos["Category"] = "Agenda"
                infos["Full"] = message.as_string()

                ## TODO: I'm not sure the TOTALCNT approach is the right one
                ## we should discuss this with the pipermail guys
                infos["LegacyID"] = TOTALCNT
                if not "References" in infos:
                    infos["References"] = None
                # print infos.keys()
                mail = email(
                    sender=infos["From"],
                    email=infos["Email"],
                    subject=infos["Subject"],
                    content=infos["Content"],
                    date=infos["Date"],
                    message_id=infos["MessageID"],
                    stable_url_id=infos["MessageID"],
                    thread_id=infos["ThreadID"],
                    references=infos["References"],
                    full=infos["Full"],
                )
                mail.save(session)
                cnt = cnt + 1
                session.commit()
        except Exception, err:
            print ' Error: "%s"' % err
            print "File:", mbfile, "Content:", message["Subject"], message["Date"], message["From"]
            pass
Exemple #15
0
def get_table_size(list_name):
    """ Return the size of the document in mongodb. """
    email = get_class_object(list_to_table_name(list_name), "email", MetaData(engine))
    print "  %s emails are stored into the database" % session.query(email).count()