Example #1
0
def ExtractThreads(message_infos):
    """Group messages into thread containers using jwzthreading.

    Each item of ``message_infos`` must expose a ``headers`` attribute, which
    is handed to ``jwzthreading.make_message``.  Items for which that call
    raises ``ValueError`` or returns a falsy value are left out.

    Returns a list of containers.  Every returned container has its
    ``subject`` attribute set to the key under which it (or its dummy parent)
    appeared in the result of ``jwzthreading.thread``.
    """
    parsed_messages = []
    for info in message_infos:
        try:
            parsed = jwzthreading.make_message(info.headers)
        except ValueError:
            continue
        if not parsed:
            continue
        # Keep a back-reference so callers can get from a threaded message
        # to the object it was built from.
        parsed.message_info = info
        parsed_messages.append(parsed)

    subject_table = jwzthreading.thread(parsed_messages)

    result = []
    for subject, root in subject_table.items():
        # jwzthreading threads aggressively by subject and can merge distinct
        # threads that merely share a subject.  A dummy root with many
        # first-level children is treated as such a merge and is split into
        # its children.
        oversized_dummy = root.is_dummy() and len(root.children) >= 10
        promoted = root.children if oversized_dummy else [root]
        for node in promoted:
            node.subject = subject
            result.append(node)

    return result
Example #2
0
def test_to_dict():
    """Check ``to_dict`` for a root message with two direct replies."""
    raw_emails = ["""\
            Subject: random
            Message-ID: <message1>
            References:

            Body.""",
            """\
            Subject: Re: random
            Message-ID: <message2>
            References: <message1>

            Body.""",

            """\
            Subject: Re: random
            Message-ID: <message3>
            References: <message1>

            Body.""",
            ]

    # Parse every raw text, then wrap each parsed mail with its position
    # in the list as ``message_idx``.
    parsed_emails = [message_from_string(textwrap.dedent(raw))
                     for raw in raw_emails]
    messages = []
    for position, parsed in enumerate(parsed_emails):
        messages.append(Message(parsed, message_idx=position))

    roots = thread(messages, group_by_subject=False)

    # Both replies reference <message1>, so both are expected as children
    # of the first message.
    expected_replies = [{'id': reply_id, 'parent': 0, 'children': []}
                        for reply_id in (1, 2)]
    expected_tree = {'id': 0, 'parent': None, 'children': expected_replies}

    assert roots[0].to_dict() == expected_tree
Example #3
0
def ExtractThreads(message_infos):
    """Thread ``message_infos`` and return the resulting top-level containers.

    ``jwzthreading.make_message`` is called with the ``headers`` attribute of
    each item; items that raise ``ValueError`` or produce a falsy message are
    dropped.  Each kept message gets a ``message_info`` attribute pointing
    back at its source item, and each returned container gets a ``subject``
    attribute taken from its key in the threading result.
    """

    def _build(message_info):
        # Return the jwzthreading message for one item, or None to skip it.
        try:
            built = jwzthreading.make_message(message_info.headers)
        except ValueError:
            return None
        if not built:
            return None
        built.message_info = message_info
        return built

    candidates = (_build(message_info) for message_info in message_infos)
    thread_messages = [built for built in candidates if built is not None]

    thread_dict = jwzthreading.thread(thread_messages)

    containers = []
    for subject, container in thread_dict.items():
        if not container.is_dummy() or len(container.children) < 10:
            container.subject = subject
            containers.append(container)
            continue

        # A dummy container with ten or more first-level children is taken
        # to be several unrelated threads that jwzthreading merged only
        # because they share a subject; return the children individually.
        for child in container.children:
            child.subject = subject
            containers.append(child)

    return containers
Example #4
0
def test_thread_lying_message():
    """Thread First <- Second <- Third next to messages whose references
    list them in a misleading order."""

    def _named(name, references=None):
        # Build a message whose subject and message id are both ``name``.
        msg = Message(None)
        msg.subject = msg.message_id = name
        if references is not None:
            msg.references = references
        return msg

    dummy_parent = _named('Dummy parent')
    lying_before = _named('Lying before',
                          ['Dummy parent', 'Second', 'First', 'Third'])
    first = _named('First')
    second = _named('Second', ['First'])
    third = _named('Third', ['First', 'Second'])
    # No references are assigned to this message.
    lying_after = _named('Lying after')

    roots = thread([dummy_parent, lying_before,
                    first, second, third, lying_after],
                   group_by_subject=False)

    first_node = roots[1]
    assert first_node['message'] == first
    assert len(first_node.children) == 1

    second_node = first_node.children[0]
    assert second_node['message'] == second
    assert len(second_node.children) == 1
    assert second_node.children[0]['message'] == third
Example #5
0
 def test_thread_lying_message(self):
     """Thread three messages together, with other messages lying in their references."""
     # (name, references) pairs in the order they are passed to thread();
     # None means no references are assigned.
     specs = [
         ('Dummy parent', None),
         ('Lying before', ['Dummy parent', 'Second', 'First', 'Third']),
         ('First', None),
         ('Second', ['First']),
         ('Third', ['First', 'Second']),
         ('Lying after', None),
     ]
     messages = []
     for name, references in specs:
         msg = jwzthreading.Message(None)
         msg.subject = msg.message_id = name
         if references is not None:
             msg.references = references
         messages.append(msg)
     first, second, third = messages[2:5]

     table = jwzthreading.thread(messages)

     first_node = table['First']
     self.assertEqual(first_node.message, first)
     self.assertEqual(len(first_node.children), 1)
     second_node = first_node.children[0]
     self.assertEqual(second_node.message, second)
     self.assertEqual(len(second_node.children), 1)
     self.assertEqual(second_node.children[0].message, third)
Example #6
0
 def test_thread_unrelated(self):
     """Two messages without references are found under their own subjects."""
     first = jwzthreading.Message(None)
     first.subject = 'First'
     first.message_id = 'First'
     second = jwzthreading.Message(None)
     second.subject = 'Second'
     second.message_id = 'Second'

     table = jwzthreading.thread([first, second])

     self.assertEqual(table['First'].message, first)
     second_container = table['Second']
     self.assertEqual(second_container.children, [])
     self.assertEqual(second_container.message, second)
Example #7
0
 def test_thread_unrelated(self):
     """Thread two unconnected messages and check each keeps its own entry."""
     by_name = {}
     for name in ('First', 'Second'):
         msg = jwzthreading.Message(None)
         msg.subject = msg.message_id = name
         by_name[name] = msg

     threads = jwzthreading.thread([by_name['First'], by_name['Second']])

     self.assertEqual(threads['First'].message, by_name['First'])
     self.assertEqual(threads['Second'].children, [])
     self.assertEqual(threads['Second'].message, by_name['Second'])
Example #8
0
def test_thread_unrelated():
    """Two messages that do not reference each other form separate roots."""
    first, second = Message(None), Message(None)
    first.subject = first.message_id = 'First'
    second.subject = second.message_id = 'Second'

    roots = thread([first, second], group_by_subject=False)

    assert roots[0]['message'] == first
    second_root = roots[1]
    assert second_root.children == []
    assert second_root['message'] == second
Example #9
0
 def test_thread_two(self):
     """A message referencing another becomes that message's only child."""
     parent = jwzthreading.Message(None)
     parent.subject = 'First'
     parent.message_id = 'First'
     reply = jwzthreading.Message(None)
     reply.subject = 'Second'
     reply.message_id = 'Second'
     reply.references = ['First']

     table = jwzthreading.thread([parent, reply])

     root = table['First']
     self.assertEqual(root.message, parent)
     self.assertEqual(len(root.children), 1)
     self.assertEqual(root.children[0].message, reply)
Example #10
0
def test_thread_two_reverse():
    """Thread a reply that is listed before the message it references."""
    parent = Message(None)
    parent.subject = 'First'
    parent.message_id = 'First'
    reply = Message(None)
    reply.subject = 'Second'
    reply.message_id = 'Second'
    reply.references = ['First']

    # The child is deliberately passed ahead of its parent.
    roots = thread([reply, parent], group_by_subject=False)

    root = roots[0]
    assert root['message'] == parent
    assert len(root.children) == 1
    assert root.children[0]['message'] == reply
Example #11
0
 def test_thread_two_reverse(self):
     """Thread two messages when the child is listed before its parent."""
     parent, reply = jwzthreading.Message(None), jwzthreading.Message(None)
     parent.subject = parent.message_id = 'First'
     reply.subject = reply.message_id = 'Second'
     reply.references = ['First']

     # Input order is child first, parent second.
     table = jwzthreading.thread([reply, parent])

     root = table['First']
     self.assertEqual(root.message, parent)
     self.assertEqual(len(root.children), 1)
     self.assertEqual(root.children[0].message, reply)
Example #12
0
 def test_thread_two(self):
     """Thread a parent and its reply, passed in that order."""
     messages = []
     for name, references in (('First', None), ('Second', ['First'])):
         msg = jwzthreading.Message(None)
         msg.subject = msg.message_id = name
         if references is not None:
             msg.references = references
         messages.append(msg)
     parent, reply = messages

     threads = jwzthreading.thread(messages)

     self.assertEqual(threads['First'].message, parent)
     children = threads['First'].children
     self.assertEqual(len(children), 1)
     self.assertEqual(children[0].message, reply)
Example #13
0
 def test_thread_two_reverse(self):
     """A reply passed ahead of its parent is still attached beneath it."""
     reply = jwzthreading.Message(None)
     reply.subject = 'Second'
     reply.message_id = 'Second'
     reply.references = ['First']
     parent = jwzthreading.Message(None)
     parent.subject = 'First'
     parent.message_id = 'First'

     threads = jwzthreading.thread([reply, parent])

     self.assertEqual(threads['First'].message, parent)
     children = threads['First'].children
     self.assertEqual(len(children), 1)
     self.assertEqual(children[0].message, reply)
Example #14
0
def test_thread_two():
    """A reply that references the first message is threaded beneath it."""
    parent, reply = Message(None), Message(None)
    parent.subject = parent.message_id = 'First'
    reply.subject = reply.message_id = 'Second'
    reply.references = ['First']

    roots = thread([parent, reply])

    root = roots[0]
    assert root['message'] == parent
    assert len(root.children) == 1
    assert root.children[0]['message'] == reply
Example #15
0
 def test_thread_two_missing_parent(self):
     """Two messages referencing an absent parent share a message-less root."""
     siblings = []
     for message_id in ('First', 'Second'):
         msg = jwzthreading.Message(None)
         msg.subject = 'Child'
         msg.message_id = message_id
         # 'parent' is never passed to thread(), so it stays missing.
         msg.references = ['parent']
         siblings.append(msg)

     table = jwzthreading.thread(siblings)

     root = table['Child']
     self.assertEqual(root.message, None)
     self.assertEqual(len(root.children), 2)
     self.assertEqual(root.children[0].message, siblings[0])
Example #16
0
 def test_thread_two_missing_parent(self):
     """Thread two children whose common parent is not among the messages."""

     def make_child(message_id):
         # Both children share the subject and the reference to 'parent'.
         child = jwzthreading.Message(None)
         child.subject = 'Child'
         child.message_id = message_id
         child.references = ['parent']
         return child

     first_child = make_child('First')
     second_child = make_child('Second')

     threads = jwzthreading.thread([first_child, second_child])

     self.assertEqual(threads['Child'].message, None)
     grouped = threads['Child'].children
     self.assertEqual(len(grouped), 2)
     self.assertEqual(grouped[0].message, first_child)
Example #17
0
 def test_thread_two_missing_parent(self):
     """Thread two messages that both reference a parent which is never supplied."""
     first, second = jwzthreading.Message(None), jwzthreading.Message(None)
     for sibling in (first, second):
         sibling.subject = "Child"
         sibling.references = ["parent"]
     first.message_id = "First"
     second.message_id = "Second"

     table = jwzthreading.thread([first, second])

     placeholder = table["Child"]
     # The entry for the shared subject carries no message of its own.
     self.assertEqual(placeholder.message, None)
     self.assertEqual(len(placeholder.children), 2)
     self.assertEqual(placeholder.children[0].message, first)
Example #18
0
def test_sorting():
    """Sort threaded roots by ``message_id`` and by ``subject``.

    Three unrelated messages are threaded without subject grouping.  The
    third message has neither a subject nor a message id assigned, so its
    position in each ordering presumably comes from the ``missing`` argument
    passed to ``sort_threads``.
    """
    m1 = Message(None)
    m1.subject = 'b'
    m1.message_id = 1
    m2 = Message(None)
    m2.subject = 'a'
    m2.message_id = 2
    m3 = Message(None)
    roots = thread([m2, m1, m3], group_by_subject=False)

    # missing=-1 is lower than both ids, so the id-less message comes first.
    by_id = sort_threads(roots, key='message_id', missing=-1)
    assert by_id[0]['message'].message_id is None
    assert by_id[1]['message'].message_id == 1

    # missing='z' is higher than both subjects: 'a' (id 2), then 'b' (id 1).
    by_subject = sort_threads(roots, key='subject', missing='z')
    assert by_subject[0]['message'].message_id == 2
    assert by_subject[1]['message'].message_id == 1
Example #19
0
def main():
    """Thread the mbox file named on the command line and print the result.

    Reads the mailbox at ``sys.argv[1]``, wraps every entry in ``Message``
    (entries for which that raises ``ValueError`` are skipped), threads the
    messages with ``thread`` and prints each resulting container with
    ``print_container``.

    Exits with a usage message when no mailbox path is given.
    """
    import mailbox
    import sys

    if len(sys.argv) < 2:
        # Fail with a readable message instead of a bare IndexError.
        sys.exit('usage: %s MBOX_FILE' % sys.argv[0])

    msglist = []

    print('Reading input file...')
    mbox = mailbox.mbox(sys.argv[1])
    try:
        for message in mbox:
            try:
                parsed_msg = Message(message)
            except ValueError:
                continue
            msglist.append(parsed_msg)
    finally:
        # Release the underlying file even if parsing fails part-way.
        mbox.close()

    print('Threading...')
    threads = thread(msglist)

    print('Output...')
    for container in threads:
        print_container(container)
Example #20
0
def test_thread_two_missing_parent():
    """Thread two siblings of an absent parent, then collapse the empty root."""
    siblings = []
    for message_id in ('First', 'Second'):
        msg = Message(None)
        msg.subject = 'Child'
        msg.message_id = message_id
        msg.references = ['parent']
        siblings.append(msg)

    roots = thread(siblings)

    root = roots[0]
    # The root stands in for the missing parent: no message, two children,
    # three nodes in total.
    assert root['message'] is None
    assert len(root.children) == 2
    assert root.children[0]['message'] == siblings[0]
    assert root.tree_size == 3

    # Collapsing is expected to drop the empty root and leave a parentless
    # container that carries the first message.
    collapsed = root.collapse_empty()
    assert collapsed.tree_size == 2
    assert collapsed['message'] is not None
    assert collapsed['message'].message_id == 'First'
    assert collapsed.parent is None
Example #21
0
def test_empty_collapsing_fedora_June2010():
    """Check that ``collapse_empty`` removes the single empty root container.

    Threads the ``2010-January.txt.gz`` mailbox found in ``DATA_DIR`` without
    subject grouping, checks that exactly one root has no message, collapses
    every root and checks that no root is left without a message or with a
    parent.

    Skipped when ``lxml`` cannot be imported.

    NOTE(review): the test name and ``N_EMAILS_JUNE2010`` say June 2010 while
    the data files are named 2010-January -- confirm which is intended.
    """
    try:
        import lxml  # noqa: F401 -- imported only to probe availability
    except ImportError:
        raise SkipTest

    msglist = parse_mailbox(os.path.join(DATA_DIR, '2010-January.txt.gz'),
                            encoding='latin1', headersonly=True)

    assert len(msglist) == N_EMAILS_JUNE2010

    # NOTE(review): the reference threads are parsed and sorted but never
    # compared against below; kept so this test still exercises the
    # reference parser. Candidate for removal if that is not intended.
    threads_ref = parse_mailman_htmlthread(os.path.join(DATA_DIR,
                                           '2010-January_thread.html.gz'))
    threads_ref = sort_threads(threads_ref, key='subject', missing='Z')

    threads = thread([Message(el, message_idx=idx)
                      for idx, el in enumerate(msglist)],
                     group_by_subject=False)
    # There is one single "empty root container"
    assert sum(el.get('message') is None for el in threads) == 1

    threads = [el.collapse_empty() for el in threads]

    # The empty container was removed
    assert sum(el.get('message') is None for el in threads) == 0

    # Every collapsed container is still a root.
    assert all(el.parent is None for el in threads)
Example #22
0
 def test_thread_lying_message(self):
     """Thread First <- Second <- Third alongside messages with misleading references."""

     def named(name, references=None):
         # Subject and message id are both set to ``name``.
         msg = jwzthreading.Message(None)
         msg.subject = msg.message_id = name
         if references is not None:
             msg.references = references
         return msg

     dummy_parent = named('Dummy parent')
     lying_before = named('Lying before',
                          ['Dummy parent', 'Second', 'First', 'Third'])
     first = named('First')
     second = named('Second', ['First'])
     third = named('Third', ['First', 'Second'])
     # No references are assigned to this message.
     lying_after = named('Lying after')

     threads = jwzthreading.thread(
         [dummy_parent, lying_before, first, second, third, lying_after])

     self.assertEqual(threads['First'].message, first)
     self.assertEqual(len(threads['First'].children), 1)
     middle = threads['First'].children[0]
     self.assertEqual(middle.message, second)
     self.assertEqual(len(middle.children), 1)
     self.assertEqual(middle.children[0].message, third)
Example #23
0
    def thread_mails(emails):
        """Thread ``emails`` with jwzthreading and update their parent links.

        Each mail's ``imported_blob`` is parsed into a jwzthreading message,
        whose ``subject`` and ``message`` attributes are then overwritten with
        the mail's own subject value and the mail object itself.  The threaded
        containers are printed and then walked by ``update_threading``, which
        calls ``set_parent`` on mails whose stored parent differs from the
        parent found by the threading algorithm.

        NOTE(review): this relies on Python 2 behaviour -- the result of
        ``items()`` is sorted in place and the ``unicode`` builtin is used.
        """
        # print('Threading...')
        emails_for_threading = []
        for mail in emails:
            email_for_threading = jwzthreading.make_message(
                email.message_from_string(mail.imported_blob))
            # Store our emailsubject, jwzthreading does not decode subject itself
            email_for_threading.subject = mail.subject.first_original().value
            # Store our email object pointer instead of the raw message text
            email_for_threading.message = mail
            emails_for_threading.append(email_for_threading)

        threaded_emails = jwzthreading.thread(emails_for_threading)

        # Output
        L = threaded_emails.items()
        L.sort()
        for subj, container in L:
            jwzthreading.print_container(container, 0, True)

        def update_threading(threaded_emails, parent=None, debug=False):
            """Recursively reconcile stored parents with the threading result.

            ``threaded_emails`` is an iterable of containers.  ``parent`` is
            the container passed down by the caller: a container with a
            message passes itself to its children, a dummy container passes
            on the ``parent`` it received.  ``debug`` turns on verbose
            printing.
            """
            if debug:
                print("\n\nEntering update_threading() for %s mails:" %
                      len(threaded_emails))
            for container in threaded_emails:
                if debug:
                    # NOTE(review): this output dereferences
                    # container.message before the check below, so it looks
                    # like it would fail on a dummy container when debug is
                    # enabled -- confirm.
                    # jwzthreading.print_container(container)
                    print("\nProcessing:  " + repr(
                        container.message.subject.first_original().value) +
                          " " + repr(container.message.message_id) + " " +
                          repr(container.message.message.id))
                    print("container: " + (repr(container)))
                    print("parent: " + repr(container.parent))
                    print("children: " + repr(container.children))

                if (container.message):
                    # container.message is the jwzthreading message;
                    # container.message.message is the mail object stored on
                    # it above.
                    current_parent = container.message.message.parent
                    if (current_parent):
                        db_parent_message_id = current_parent.message_id
                    else:
                        db_parent_message_id = None

                    if parent:
                        if parent.message:
                            # jwzthreading strips the <>, re-add them
                            algorithm_parent_message_id = unicode(
                                "<" + parent.message.message_id + ">")
                        else:
                            if debug:
                                print(
                                    "Parent was a dummy container, we may need \
                                     to handle this case better, as we just \
                                     potentially lost sibbling relationships")
                            algorithm_parent_message_id = None
                    else:
                        algorithm_parent_message_id = None
                    if debug:
                        print("Current parent from database: " +
                              repr(db_parent_message_id))
                        print("Current parent from algorithm: " +
                              repr(algorithm_parent_message_id))
                        print("References: " +
                              repr(container.message.references))
                    if algorithm_parent_message_id != db_parent_message_id:
                        # Only reparent when the stored parent is absent or
                        # is itself an Email.
                        if current_parent is None or isinstance(
                                current_parent, Email):
                            if debug:
                                print(
                                    "UPDATING PARENT for :" +
                                    repr(container.message.message.message_id))
                            new_parent = parent.message.message if algorithm_parent_message_id else None
                            if debug:
                                print(repr(new_parent))
                            container.message.message.set_parent(new_parent)
                        else:
                            if debug:
                                print(
                                    "Skipped reparenting:  the current parent \
                                isn't an email, the threading algorithm only \
                                considers mails")
                    # This container becomes the parent for its children.
                    update_threading(container.children,
                                     container,
                                     debug=debug)
                else:
                    if debug:
                        print(
                            "Current message ID: None, was a dummy container")
                    # Dummy container: children inherit the parent we were given.
                    update_threading(container.children, parent, debug=debug)

        update_threading(threaded_emails.values(), debug=False)
Example #24
0
    thread_subject = root_message['data']['Subject']
    return ''.join(c for c in thread_subject if (c.isalnum() or c.isspace()))


def index_thread_in_ES(ctr, ischild):
    """Index every message of the thread rooted at ``ctr``.

    The container is converted with ``to_dict``; each message id collected
    from it is popped from the module-level ``message_map`` and handed to
    ``add_to_ES`` under the type returned by ``get_es_type``.  If a
    ``ValueError`` is raised along the way it is logged and each child
    container is indexed on its own instead.

    ``ischild`` is accepted but not read here.
    """
    try:
        thread_tree = ctr.to_dict()
        doc_type = get_es_type(thread_tree)
        for message_id in collect_msg_ids(thread_tree):
            # pop() both fetches the message and marks it as handled.
            indexed_message = message_map.pop(message_id)
            add_to_ES(doc_type, indexed_message)
    except ValueError as err:
        log_error(err)
        for sub_container in ctr.children:
            index_thread_in_ES(sub_container, True)


# Driver: create the index, parse and thread the repository's messages, then
# index each resulting thread container.
create_index()
# message_map is read (and emptied entry by entry) by index_thread_in_ES.
message_list, message_map = parse_messages(repo)
containers = thread(message_list, group_by_subject=False)

for ctr in containers:
    index_thread_in_ES(ctr, False)

# Whatever is still in message_map was not indexed as part of any thread;
# add those messages to ES under the type "unknown".
for message in message_map.keys():
    add_to_ES('unknown', message_map[message])

logfile.close()
Example #25
0
    def thread_mails(emails):
        """Thread ``emails`` with jwzthreading and update their post parents.

        Each mail's ``full_message`` is parsed into a jwzthreading message,
        whose ``subject`` and ``message`` attributes are then overwritten with
        the mail's own subject and the mail object itself.  The threaded
        containers are printed and then walked by ``update_threading``, which
        calls ``post.set_parent`` on mails whose stored parent differs from
        the parent found by the threading algorithm.

        NOTE(review): this relies on Python 2 behaviour -- the result of
        ``items()`` is sorted in place and the ``unicode`` builtin is used.
        """
        print('Threading...')
        emails_for_threading = []
        for mail in emails:
            email_for_threading = jwzthreading.make_message(email.message_from_string(mail.full_message))
            # Store our email subject; jwzthreading does not decode the subject itself
            email_for_threading.subject = mail.subject
            # Store our email object pointer instead of the raw message text
            email_for_threading.message = mail
            emails_for_threading.append(email_for_threading)

        threaded_emails = jwzthreading.thread(emails_for_threading)

        # Output
        L = threaded_emails.items()
        L.sort()
        for subj, container in L:
            jwzthreading.print_container(container, 0, True)

        def update_threading(threaded_emails, parent=None):
            """Recursively reconcile stored post parents with the threading result.

            ``threaded_emails`` is an iterable of containers.  ``parent`` is
            the container passed down by the caller: a container with a
            message passes itself to its children, a dummy container passes
            on the ``parent`` it received.
            """

            for container in threaded_emails:
                #jwzthreading.print_container(container)
                #print (repr(container))

                ##print "parent: "+repr(container.parent)
                ##print "children: "+repr(container.children)
                ##print("\nProcessing:  " + repr(container.message.subject) + " " + repr(container.message.message_id))


                if(container.message):
                    # container.message is the jwzthreading message;
                    # container.message.message is the mail object stored on
                    # it above.
                    current_parent = container.message.message.post.parent
                    if(current_parent):
                        db_parent_message_id = current_parent.content.message_id
                    else:
                        db_parent_message_id = None

                    if parent:
                        if parent.message:
                            # jwzthreading strips the <>, re-add them
                            algorithm_parent_message_id = unicode("<"+parent.message.message_id+">")
                        else:
                            # Parent was a dummy container, we may need to handle this case better
                            # we just potentially lost sibling relationships
                            algorithm_parent_message_id = None
                    else:
                        algorithm_parent_message_id = None
                    #print("Current parent from algorithm: " + repr(algorithm_parent_message_id))
                    #print("References: " + repr(container.message.references))
                    if algorithm_parent_message_id != db_parent_message_id:
                        # Don't reparent if the current parent isn't an email, the threading algorithm only considers mails
                        if current_parent == None or isinstance(current_parent.content, Email):
                            #print("UPDATING PARENT for :" + repr(container.message.message.message_id))
                            new_parent = parent.message.message.post if algorithm_parent_message_id else None
                            #print repr(new_parent)
                            container.message.message.post.set_parent(new_parent)
                    # Independently of the comparison above, reparent when the
                    # stored parent's content has a different source_id than
                    # this mail.
                    if current_parent and current_parent.content.source_id != container.message.message.source_id:
                        # This is to correct past mistakes in the database, remove it once everyone ran it benoitg 2013-11-20
                        print("UPDATING PARENT, BAD ORIGINAL SOURCE" + repr(current_parent.content.source_id) + " " + repr(container.message.message.source_id))
                        new_parent = parent.message.message.post if algorithm_parent_message_id else None
                        #print repr(new_parent)
                        container.message.message.post.set_parent(new_parent)

                    # This container becomes the parent for its children.
                    update_threading(container.children, container)
                else:
                    #print "Current message ID: None, was a dummy container"
                    # Dummy container: children inherit the parent we were given.
                    update_threading(container.children, parent)

        update_threading(threaded_emails.values())
Example #26
0
    def thread_mails(emails):
        """Thread ``emails`` with jwzthreading and update their parent links.

        Each mail's ``imported_blob`` is passed through
        ``AbstractMailbox.guess_encoding``, parsed and wrapped in a
        ``jwzthreading.Message``, whose ``subject`` and ``message`` attributes
        are then overwritten with the mail's own subject value and the mail
        object itself.  The threaded containers are printed and then walked
        by ``update_threading``, which calls ``set_parent`` on mails whose
        stored parent differs from the parent found by the threading
        algorithm.
        """
        #log.debug('Threading...')
        emails_for_threading = []
        for mail in emails:
            blob = AbstractMailbox.guess_encoding(mail.imported_blob)
            email_for_threading = jwzthreading.Message(
                email.message_from_string(blob))
            # Store our email subject; jwzthreading does not decode the subject itself
            email_for_threading.subject = mail.subject.first_original().value
            # Store our email object pointer instead of the raw message text
            email_for_threading.message = mail
            emails_for_threading.append(email_for_threading)

        threaded_emails = jwzthreading.thread(emails_for_threading)

        # Output
        for container in threaded_emails:
            jwzthreading.print_container(container, 0, True)

        def update_threading(threaded_emails, debug=False):
            """Recursively reconcile stored parents with the threading result.

            ``threaded_emails`` is a sequence of containers; each container's
            own ``parent`` attribute supplies the parent found by the
            algorithm.  ``debug`` is only forwarded to the recursive calls;
            logging goes through ``log`` regardless of its value.
            """
            log.debug("\n\nEntering update_threading() for %ld mails:" %
                      len(threaded_emails))
            for container in threaded_emails:
                # None/falsy for a dummy container (see the else branch below).
                message = container['message']
                # if debug:
                #jwzthreading.print_container(container)
                message_string = "%s %s %d " % (
                    message.subject, message.message_id,
                    message.message.id) if message else "null "
                log.debug(
                    "Processing: %s container: %s parent: %s children :%s" %
                    (message_string, container, container.parent,
                     container.children))

                if (message):
                    # message is the jwzthreading message; message.message is
                    # the mail object stored on it above.
                    current_parent = message.message.parent
                    if (current_parent):
                        db_parent_message_id = current_parent.message_id
                    else:
                        db_parent_message_id = None

                    if container.parent:
                        parent_message = container.parent['message']
                        if parent_message:
                            # jwzthreading strips the <>, re-add them
                            algorithm_parent_message_id = u"<" + parent_message.message_id + u">"
                        else:
                            log.warn(
                                "Parent was a dummy container, we may need "
                                "to handle this case better, as we just "
                                "potentially lost sibling relationships")
                            algorithm_parent_message_id = None
                    else:
                        algorithm_parent_message_id = None
                    log.debug("Current parent from database: " +
                              repr(db_parent_message_id))
                    log.debug("Current parent from algorithm: " +
                              repr(algorithm_parent_message_id))
                    log.debug("References: " + repr(message.references))
                    if algorithm_parent_message_id != db_parent_message_id:
                        # Only reparent when the stored parent is absent or
                        # is itself an Email.
                        if current_parent == None or isinstance(
                                current_parent, Email):
                            log.debug("UPDATING PARENT for :" +
                                      repr(message.message.message_id))
                            # parent_message is only evaluated when an
                            # algorithm parent id was computed, i.e. when it
                            # was bound in this iteration above.
                            new_parent = parent_message.message if algorithm_parent_message_id else None
                            log.debug(repr(new_parent))
                            message.message.set_parent(new_parent)
                        else:
                            log.debug(
                                "Skipped reparenting:  the current parent "
                                "isn't an email, the threading algorithm only "
                                "considers mails")
                    update_threading(container.children, debug=debug)
                else:
                    log.debug(
                        "Current message ID: None, was a dummy container")
                    update_threading(container.children, debug=debug)

        update_threading(threaded_emails, debug=False)
Example #27
0
 def test_thread_single(self):
     """A lone message is returned under its own subject."""
     only = jwzthreading.Message(None)
     only.subject = 'Single'
     only.message_id = 'Single'
     table = jwzthreading.thread([only])
     self.assertEqual(table['Single'].message, only)
Example #28
0
def test_thread_single():
    """A lone message comes back as the first (root) container."""
    only = Message(None)
    only.subject = 'Single'
    only.message_id = 'Single'
    roots = thread([only])
    assert roots[0]['message'] == only
Example #29
0
	tex = filter_text(text)
	print sorted(set(tex.split(' '))),len(set(tex.split(' ')))
	write_to_orig_html_file(sentences)
	words = nltk.wordpunct_tokenize(tex)
	locs = search_dict(words)
	write_to_mod_html_file(sentences,locs,tex)
	print sorted(report_words), len(report_words)
	

build_dic_words()


# Parse the first ten .eml files into jwz messages.
files = glob.glob('/root/bngbirds-data/bngbirds/*.eml')
msglist = []

for file in files[:10]:
    # The with-block closes the file even if parsing raises.
    with open(file, 'r') as fp:
        msg = email.message_from_file(fp)
        m = jwz.make_message(msg, file)
        msglist.append(m)


# Thread the messages and process each thread in subject order.
subject_table = jwz.thread(msglist)
# sorted() gives the same ordered list as items() followed by .sort(), and
# also works where items() does not return a list.
L = sorted(subject_table.items())
for subj, container in L:
    # Fresh zero-defaulting counter and starting depth for every thread.
    sent = defaultdict(lambda: 0)
    depth = 0
    process_single_message(container, depth, sent)
Example #30
0
def test_threading_fedora_June2010():
    """Compare JWZ threading with the Mailman thread index of fedora-devel.

    NOTE: the test name and ``N_EMAILS_JUNE2010`` say "June 2010", but the
    fixtures loaded below are the ``2010-January`` archive files.

    The mailbox is threaded with ``group_by_subject=False`` and compared,
    root by root, with the thread tree parsed from the Mailman HTML index:
    same root message, same tree size, same messages in the same order and
    the same depths once capped at ``MAILMAN_MAX_DEPTH``.

    The test is skipped when ``lxml`` cannot be imported (presumably needed
    by ``parse_mailman_htmlthread`` -- confirm). NumPy is optional: when it
    is missing the same comparisons are made with plain lists instead of
    being skipped.
    """
    try:
        import lxml  # noqa: F401 -- imported only to check availability
    except ImportError:
        raise SkipTest

    try:
        import numpy as np
        from numpy.testing import assert_array_equal
        NUMPY_PRESENT = True
    except ImportError:
        NUMPY_PRESENT = False

    msglist = parse_mailbox(os.path.join(DATA_DIR, '2010-January.txt.gz'),
                            encoding='latin1', headersonly=True)

    assert len(msglist) == N_EMAILS_JUNE2010

    threads_ref = parse_mailman_htmlthread(os.path.join(DATA_DIR,
                                           '2010-January_thread.html.gz'))
    threads_ref = sort_threads(threads_ref, key='subject', missing='Z')

    threads = thread([Message(el, message_idx=idx)
                      for idx, el in enumerate(msglist)],
                     group_by_subject=False)
    threads = sort_threads(threads, key='subject', missing='Z')

    # There is one single "empty root container", which
    # corresponds to "Re: ABRT considered painful, drago01" (idx==39, id=3) at
    # https://www.redhat.com/archives/fedora-devel-list/2010-January/thread.html
    # Mailman handles this differently, which is visually better but
    # JWZ is technically more correct IMO, just removing this case for now.
    assert sum(el.get('message') is None for el in threads) == 1

    # Remove the problematic thread (cf. above) from both sides.
    threads = [el for el in threads if el.get('message') is not None]
    threads_ref = [el for el in threads_ref if el['message'].message_idx != 3]

    # JWZ currently incorrectly threads <Possible follow up> of the
    # "Common Lisp apps in Fedora," thread, remove the wrongly threaded
    # containers.
    threads = [el for el in threads
               if el.get('message').message_idx not in [153, 285]]

    # Both lists must pair up one-to-one after the removals above.
    assert len(threads) == len(threads_ref)

    for container_ref, container in zip(threads_ref, threads):
        if container.get('message') is not None:
            message_idx = container['message'].message_idx
        else:
            message_idx = None

        assert container_ref['message'].message_idx == message_idx

        if message_idx == 55:
            # This is the "Common Lisp apps in Fedora" thread that has
            # incorrectly threaded <Possible follow up>
            continue

        assert container_ref.tree_size == container.tree_size

        # Check that we have the same messages in threads.
        ref_ids = [el['message'].message_idx
                   for el in container_ref.flatten()]
        got_ids = [el['message'].message_idx for el in container.flatten()]
        if NUMPY_PRESENT:
            assert_array_equal(ref_ids, got_ids)
        else:
            assert ref_ids == got_ids

        # Depths must agree once the JWZ depths are capped at
        # MAILMAN_MAX_DEPTH.
        ref_depths = [el.current_depth for el in container_ref.flatten()]
        got_depths = [el.current_depth for el in container.flatten()]
        if NUMPY_PRESENT:
            assert_array_equal(ref_depths,
                               np.fmin(got_depths, MAILMAN_MAX_DEPTH))
        else:
            assert ref_depths == [min(depth, MAILMAN_MAX_DEPTH)
                                  for depth in got_depths]
Example #31
0
    # Load the messages, thread them, print every container and finally
    # write the collected ``org_out`` lines to 'ccmt-1.org'.
    print('Reading input file...')

    # The mbox branch is switched off by the constant condition; only the
    # ``else`` branch below runs.
    if False:
        mbox_path = '/Volumes/ramdisk/ccmt6.mbox'
        # with open(mbox_path, 'rb') as ifile:
        # mbox = mailbox.UnixMailbox(ifile)
        mbox = mailbox.mbox(mbox_path)
        mlist = list(mbox)
        msglist = [make_message(m) for m in mlist]
    else:
        # Build one mboxMessage per *.eml file found under eml_path.
        # NOTE(review): the files opened here are never closed explicitly.
        eml_path = '/tmp/ccmt-1'
        mlist = [
            mailbox.mboxMessage(open(eml).read())
            for eml in glob(pjoin(eml_path, '*.eml'))
        ]
        msglist = [make_message(m) for m in mlist]

    print('Threading...')
    subject_table = thread(msglist)

    # Output
    # Sorting in place requires items() to return a list (Python 2).
    L = subject_table.items()
    L.sort()
    for subj, container in L:
        print_container(container)

    # org_out is not defined in this fragment; presumably it is filled by
    # print_container -- TODO confirm.
    with open('ccmt-1.org', 'w') as ofile:
        ofile.write('\n'.join(org_out))
        print 'ok:', ofile.name
Example #32
0
    def thread_mails(emails):
        """Thread ``emails`` with jwzthreading and update their stored parents.

        Each mail's ``imported_blob`` is parsed into a jwzthreading message.
        The message's subject is replaced by
        ``mail.subject.first_original().value`` and its ``message``
        attribute by the mail object itself. The resulting thread table is
        printed with ``jwzthreading.print_container`` and then walked; every
        mail whose stored parent's ``message_id`` differs from the parent
        found by the algorithm gets ``set_parent()`` called with the new
        parent (or None).

        :param emails: iterable of mail objects exposing ``imported_blob``,
            ``subject``, ``parent``, ``message_id``, ``id`` and
            ``set_parent``.
        :returns: None
        """
        emails_for_threading = []
        for mail in emails:
            email_for_threading = jwzthreading.make_message(
                email.message_from_string(mail.imported_blob))
            # Store our email subject, jwzthreading does not decode the
            # subject itself
            email_for_threading.subject = mail.subject.first_original().value
            # Store our email object pointer instead of the raw message text
            email_for_threading.message = mail
            emails_for_threading.append(email_for_threading)

        threaded_emails = jwzthreading.thread(emails_for_threading)

        # Output
        for _subject, container in sorted(threaded_emails.items()):
            jwzthreading.print_container(container, 0, True)

        def update_threading(threaded_emails, parent=None, debug=False):
            """Recursively align stored parents with the computed threads.

            :param threaded_emails: jwzthreading containers to process.
            :param parent: container handed down by the caller. A container
                holding a message passes itself to its children; a dummy
                container passes on the parent it received.
            :param debug: when true, print a trace of every decision.
            """
            if debug:
                print("\n\nEntering update_threading() for %s mails:"
                      % len(threaded_emails))
            for container in threaded_emails:
                if debug:
                    # Dummy containers have no message, so message details
                    # are only printed when one exists. The subject was
                    # already replaced by its plain value in thread_mails.
                    if container.message:
                        print("\nProcessing:  "
                              + repr(container.message.subject) + " "
                              + repr(container.message.message_id) + " "
                              + repr(container.message.message.id))
                    print("container: " + repr(container))
                    print("parent: " + repr(container.parent))
                    print("children: " + repr(container.children))

                if not container.message:
                    if debug:
                        print("Current message ID: None, was a dummy container")
                    # Skip the dummy level: children keep the parent we got.
                    update_threading(container.children, parent, debug=debug)
                    continue

                current_parent = container.message.message.parent
                if current_parent:
                    db_parent_message_id = current_parent.message_id
                else:
                    db_parent_message_id = None

                algorithm_parent_message_id = None
                if parent:
                    if parent.message:
                        # jwzthreading strips the <>, re-add them
                        algorithm_parent_message_id = unicode(
                            "<" + parent.message.message_id + ">")
                    elif debug:
                        print("Parent was a dummy container, we may need "
                              "to handle this case better, as we just "
                              "potentially lost sibbling relationships")

                if debug:
                    print("Current parent from database: "
                          + repr(db_parent_message_id))
                    print("Current parent from algorithm: "
                          + repr(algorithm_parent_message_id))
                    print("References: "
                          + repr(container.message.references))

                if algorithm_parent_message_id != db_parent_message_id:
                    # Only mails without a parent, or whose parent is itself
                    # an Email, are reparented.
                    if (current_parent is None
                            or isinstance(current_parent, Email)):
                        if debug:
                            print("UPDATING PARENT for :"
                                  + repr(container.message.message.message_id))
                        if algorithm_parent_message_id:
                            new_parent = parent.message.message
                        else:
                            new_parent = None
                        if debug:
                            print(repr(new_parent))
                        container.message.message.set_parent(new_parent)
                    elif debug:
                        print("Skipped reparenting:  the current parent "
                              "isn't an email, the threading algorithm only "
                              "considers mails")
                update_threading(container.children, container, debug=debug)

        update_threading(threaded_emails.values(), debug=False)
Example #33
0
 def test_thread_single(self):
     """Thread a single message

     A lone message whose subject and message id are both 'Single' is
     threaded, and the entry stored under the 'Single' key of the result
     must wrap that very message.
     """
     single = jwzthreading.Message(None)
     single.subject = 'Single'
     single.message_id = 'Single'
     subject_table = jwzthreading.thread([single])
     root = subject_table['Single']
     self.assertEqual(root.message, single)