def test_thread_unrelated():
    """Two messages that do not reference each other end up as separate roots."""
    first = Message(None)
    first.subject = first.message_id = 'First'

    second = Message(None)
    second.subject = second.message_id = 'Second'

    roots = thread([first, second], group_by_subject=False)

    # Each message is its own root, in input order, and the second has no replies.
    assert roots[0]['message'] == first
    assert roots[1].children == []
    assert roots[1]['message'] == second
def test_thread_two():
    """A message referencing another one is attached as its child."""
    parent = Message(None)
    parent.subject = parent.message_id = 'First'

    reply = Message(None)
    reply.subject = reply.message_id = 'Second'
    reply.references = ['First']

    roots = thread([parent, reply])
    root = roots[0]

    assert root['message'] == parent
    assert len(root.children) == 1
    assert root.children[0]['message'] == reply
def test_thread_two_reverse():
    """Same as threading two messages, but the reply is listed before its parent."""
    parent = Message(None)
    parent.subject = parent.message_id = 'First'

    reply = Message(None)
    reply.subject = reply.message_id = 'Second'
    reply.references = ['First']

    # The child comes first in the input; the parent must still be the root.
    roots = thread([reply, parent], group_by_subject=False)
    root = roots[0]

    assert root['message'] == parent
    assert len(root.children) == 1
    assert root.children[0]['message'] == reply
def test_to_dict():
    """``to_dict`` of a root container returns the nested id/parent/children tree."""
    raw_emails = [
        """\
        Subject: random
        Message-ID: <message1>
        References:

        Body.""",
        """\
        Subject: Re: random
        Message-ID: <message2>
        References: <message1>

        Body.""",
        """\
        Subject: Re: random
        Message-ID: <message3>
        References: <message1>

        Body.""",
    ]

    parsed = [message_from_string(textwrap.dedent(raw)) for raw in raw_emails]
    # message_idx is the position of the email in the input list
    messages = [Message(email_obj, message_idx=pos)
                for pos, email_obj in enumerate(parsed)]

    threads = thread(messages, group_by_subject=False)

    # Both replies reference <message1>, so they are siblings under id 0.
    expected_children = [
        {'id': 1, 'parent': 0, 'children': []},
        {'id': 2, 'parent': 0, 'children': []},
    ]
    tree_expected = {'id': 0, 'parent': None, 'children': expected_children}

    assert threads[0].to_dict() == tree_expected
def test_email_make_message():
    """Building a ``Message`` from an email that only has a Subject header raises ``ValueError``."""
    raw_email = textwrap.dedent("""\
        Subject: random

        Body.""")
    parsed = message_from_string(raw_email)

    with pytest.raises(ValueError):
        Message(parsed)
def transform(self, data_dir, file_pattern='.*', dir_pattern='.*',
              encoding='latin-1'):
    """Parse the headers of all emails in ``data_dir`` and cache the result.

    Each matching file is read as text, parsed with
    ``email.parser.Parser`` (headers only) and wrapped in a jwzthreading
    ``Message`` whose ``message_idx`` is the position of the file in the
    list returned by ``self._list_filenames``.  The dataset parameters and
    the list of messages are dumped with joblib into a new directory under
    ``self.cache_dir``.

    Parameters
    ----------
    data_dir
        directory containing the email files
    file_pattern
        pattern forwarded to ``self._list_filenames`` to select files
    dir_pattern
        pattern forwarded to ``self._list_filenames`` to select directories
    encoding
        text encoding used to read every file; it is also recorded in the
        stored dataset parameters

    Returns
    -------
    the identifier produced by ``generate_uuid()`` for this dataset

    Raises
    ------
    NotFound
        if ``data_dir`` does not exist
    WrongParameter
        if no file matches the given patterns
    """
    import io
    from email.parser import Parser
    from jwzthreading import Message

    data_dir = os.path.normpath(data_dir)
    if not os.path.exists(data_dir):
        raise NotFound('data_dir={} does not exist'.format(data_dir))
    self.data_dir = data_dir

    # collect all the files to parse in the folder
    filenames = self._list_filenames(data_dir, dir_pattern, file_pattern)
    if not filenames:
        raise WrongParameter('No files to process were found!')
    filenames_rel = [os.path.relpath(el, data_dir) for el in filenames]

    self.dsid = dsid = generate_uuid()
    self.dsid_dir = dsid_dir = os.path.join(self.cache_dir, dsid)

    # identifier collision, should not happen: start from a clean directory
    if os.path.exists(dsid_dir):
        shutil.rmtree(dsid_dir)
    os.mkdir(dsid_dir)

    pars = {
        'filenames': filenames_rel,
        'data_dir': data_dir,
        'n_samples': len(filenames_rel),
        "encoding": encoding,
        'type': type(self).__name__,
    }
    self._pars = pars

    # Parser.parsestr builds a fresh feed parser on every call, so a single
    # Parser instance can be shared by all files.
    parser = Parser()
    features = []
    for idx, fname in enumerate(filenames):
        # Honour the requested encoding: previously the files were decoded
        # with the platform default while ``encoding`` was only recorded.
        with io.open(fname, 'rt', encoding=encoding) as fh:
            txt = fh.read()
        message = parser.parsestr(txt, headersonly=True)
        features.append(Message(message, message_idx=idx))

    joblib.dump(pars, os.path.join(dsid_dir, 'pars'), compress=9)
    joblib.dump(features, os.path.join(dsid_dir, 'features'), compress=9)
    return dsid
def test_sorting():
    """Sort root containers by ``message_id`` and by ``subject`` with a fill-in for missing values."""
    msg_b = Message(None)
    msg_b.subject = 'b'
    msg_b.message_id = 1

    msg_a = Message(None)
    msg_a.subject = 'a'
    msg_a.message_id = 2

    # no subject or message_id is assigned on this one
    blank = Message(None)

    containers = thread([msg_a, msg_b, blank], group_by_subject=False)

    # missing=-1 puts the message without an id before ids 1 and 2
    by_id = sort_threads(containers, key='message_id', missing=-1)
    assert by_id[0]['message'].message_id is None
    assert by_id[1]['message'].message_id == 1

    by_subject = sort_threads(containers, key='subject', missing='z')
    assert by_subject[0]['message'].message_id == 2
    assert by_subject[1]['message'].message_id == 1
def test_thread_two_missing_parent():
    """Two siblings whose common parent is absent hang under an empty container."""
    child_a = Message(None)
    child_a.subject = 'Child'
    child_a.message_id = 'First'
    child_a.references = ['parent']

    child_b = Message(None)
    child_b.subject = 'Child'
    child_b.message_id = 'Second'
    child_b.references = ['parent']

    roots = thread([child_a, child_b])
    root = roots[0]

    # The root stands in for the missing 'parent' message.
    assert root['message'] is None
    assert len(root.children) == 2
    assert root.children[0]['message'] == child_a
    assert root.tree_size == 3

    # check that collapsing the empty container works
    collapsed = root.collapse_empty()
    assert collapsed.tree_size == 2
    assert collapsed['message'] is not None
    assert collapsed['message'].message_id == 'First'
    assert collapsed.parent is None
def test_encoded_message():
    """An encoded UTF-8 Subject header is decoded when ``decode_header=True``."""
    raw_email = textwrap.dedent("""\
        Subject: =?UTF-8?B?0L/QtdGA0LXQutC70LDQtA==?=
        Message-ID: <message1>
        References: <ref1> <ref2> <ref1>
        In-Reply-To: <reply>

        Body.""")
    parsed = message_from_string(raw_email)

    message = Message(parsed, decode_header=True)

    assert repr(message)
    # make sure that we can decode a UTF8 encoded subject
    assert message.subject == 'переклад'
def test_basic_message(decode_header):
    """Subject and de-duplicated references are extracted from a plain email."""
    raw_email = textwrap.dedent("""\
        Subject: random
        Message-ID: <message1>
        References: <ref1> <ref2> <ref1>
        In-Reply-To: <reply>

        Body.""")
    parsed = message_from_string(raw_email)

    message = Message(parsed, decode_header=decode_header)

    assert repr(message)
    assert message.subject == 'random'
    # <ref1> appears twice in the header and <reply> comes from In-Reply-To
    assert sorted(message.references) == ['ref1', 'ref2', 'reply']

    # Verify that repr() works
    repr(message)
def main():
    """Thread the mbox file named by the first command-line argument and print the threads."""
    import mailbox
    import sys

    print('Reading input file...')
    mbox = mailbox.mbox(sys.argv[1])

    msglist = []
    for message in mbox:
        try:
            parsed_msg = Message(message)
        except ValueError:
            # messages that cannot be converted are skipped
            pass
        else:
            msglist.append(parsed_msg)

    print('Threading...')
    threads = thread(msglist)

    print('Output...')
    for container in threads:
        print_container(container)
def test_empty_collapsing_fedora_June2010():
    """Check that ``collapse_empty`` removes the single empty root container.

    The messages come from the mailing list archive ``2010-January.txt.gz``
    in ``DATA_DIR``.
    """
    try:
        import lxml
    except ImportError:
        raise SkipTest

    # NOTE(review): numpy and NUMPY_PRESENT are not used further down in
    # this test.
    try:
        import numpy as np
        from numpy.testing import assert_array_equal
    except ImportError:
        NUMPY_PRESENT = False
    else:
        NUMPY_PRESENT = True

    mbox_path = os.path.join(DATA_DIR, '2010-January.txt.gz')
    msglist = parse_mailbox(mbox_path, encoding='latin1', headersonly=True)
    assert len(msglist) == N_EMAILS_JUNE2010

    # NOTE(review): the reference threads are parsed and sorted but never
    # compared against in this test.
    html_path = os.path.join(DATA_DIR, '2010-January_thread.html.gz')
    threads_ref = parse_mailman_htmlthread(html_path)
    threads_ref = sort_threads(threads_ref, key='subject', missing='Z')

    messages = [Message(el, message_idx=idx) for idx, el in enumerate(msglist)]
    threads = thread(messages, group_by_subject=False)

    # There is one single "empty root container"
    n_empty_before = sum(1 for el in threads if el.get('message') is None)
    assert n_empty_before == 1

    threads = [el.collapse_empty() for el in threads]

    # The empty container was removed
    n_empty_after = sum(1 for el in threads if el.get('message') is None)
    assert n_empty_after == 0

    # every collapsed container is still a root
    n_roots = sum(1 for el in threads if el.parent is None)
    assert n_roots == len(threads)
def get_jwz_message(message, uuid):
    """Build a ``Message`` from ``message['data']``, using ``uuid`` as its ``message_idx``."""
    return Message(msg=message['data'], message_idx=uuid)
def test_thread_lying_message():
    """A three-message chain is threaded correctly despite a message with misleading references."""
    dummy_parent = Message(None)
    dummy_parent.subject = dummy_parent.message_id = 'Dummy parent'

    # Lists the chain in an order that contradicts the real one.
    lying_before = Message(None)
    lying_before.subject = lying_before.message_id = 'Lying before'
    lying_before.references = ['Dummy parent', 'Second', 'First', 'Third']

    first = Message(None)
    first.subject = first.message_id = 'First'

    second = Message(None)
    second.subject = second.message_id = 'Second'
    second.references = ['First']

    third = Message(None)
    third.subject = third.message_id = 'Third'
    third.references = ['First', 'Second']

    # No references are assigned on this trailing message.
    lying_after = Message(None)
    lying_after.subject = lying_after.message_id = 'Lying after'

    ordered_input = [dummy_parent, lying_before, first, second, third,
                     lying_after]
    roots = thread(ordered_input, group_by_subject=False)

    # Expected chain under the second root: First -> Second -> Third
    chain_root = roots[1]
    assert chain_root['message'] == first
    assert len(chain_root.children) == 1

    middle = chain_root.children[0]
    assert middle['message'] == second
    assert len(middle.children) == 1
    assert middle.children[0]['message'] == third
def test_threading_fedora_June2010():
    """Compare JWZ threading with the Mailman HTML thread index.

    The messages come from ``2010-January.txt.gz`` and the reference threads
    from ``2010-January_thread.html.gz``, both located in ``DATA_DIR``.
    """
    try:
        import lxml
    except ImportError:
        raise SkipTest

    try:
        import numpy as np
        from numpy.testing import assert_array_equal
    except ImportError:
        NUMPY_PRESENT = False
    else:
        NUMPY_PRESENT = True

    mbox_path = os.path.join(DATA_DIR, '2010-January.txt.gz')
    msglist = parse_mailbox(mbox_path, encoding='latin1', headersonly=True)
    assert len(msglist) == N_EMAILS_JUNE2010

    html_path = os.path.join(DATA_DIR, '2010-January_thread.html.gz')
    threads_ref = parse_mailman_htmlthread(html_path)
    threads_ref = sort_threads(threads_ref, key='subject', missing='Z')

    messages = [Message(el, message_idx=idx) for idx, el in enumerate(msglist)]
    threads = thread(messages, group_by_subject=False)
    threads = sort_threads(threads, key='subject', missing='Z')

    # There is one single "empty root container", which
    # corresponds to "Re: ABRT considered painful, drago01" (idx==39, id=3) at
    # https://www.redhat.com/archives/fedora-devel-list/2010-January/thread.html
    # Mailman handles this differently, which is visually better but
    # JWZ is technically more correct IMO, just removing this case for now.
    assert sum(1 for el in threads if el.get('message') is None) == 1

    # remove the problematic thread (cf above)
    threads = [el for el in threads if el.get('message') is not None]
    threads_ref = [el for el in threads_ref
                   if el['message'].message_idx != 3]

    # JWZ currently incorrectly threads <Possible follow up> of the
    # "Common Lisp apps in Fedora," thread, remove the wrongly threaded
    # containers
    threads = [el for el in threads
               if el.get('message').message_idx not in [153, 285]]

    assert len(threads) == len(threads_ref)  # we deleted one thread

    # Lengths were just asserted equal, so pairing by position is safe.
    for container_ref, container in zip(threads_ref, threads):
        if container.get('message') is None:
            subject = None
            message_idx = None
        else:
            subject = container['message'].subject
            message_idx = container['message'].message_idx

        assert container_ref['message'].message_idx == message_idx

        if message_idx == 55:
            # This is the "Common Lisp apps in Fedora" thread that has
            # incorrectly threaded <Possible follow up>
            continue

        assert container_ref.tree_size == container.tree_size

        # check that we have the same messages in threads
        if NUMPY_PRESENT:
            ref_indices = [el['message'].message_idx
                           for el in container_ref.flatten()]
            got_indices = [el['message'].message_idx
                           for el in container.flatten()]
            assert_array_equal(ref_indices, got_indices)

            # the computed depths are capped at MAILMAN_MAX_DEPTH before
            # being compared with the reference depths
            ref_depths = [el.current_depth for el in container_ref.flatten()]
            got_depths = [el.current_depth for el in container.flatten()]
            assert_array_equal(ref_depths,
                               np.fmin(got_depths, MAILMAN_MAX_DEPTH))
def test_thread_single():
    """A lone message becomes the only root container."""
    only = Message(None)
    only.subject = only.message_id = 'Single'

    roots = thread([only])

    assert roots[0]['message'] == only
def test_prune_promote():
    """Pruning an empty parent with one non-empty child returns that child."""
    parent = Container()

    child = Container()
    child['message'] = Message()
    parent.add_child(child)

    pruned = prune_container(parent)
    assert pruned == [child]