Esempio n. 1
0
def test_thread_unrelated():
    """Two messages without references come back as two separate roots."""
    first = Message(None)
    first.subject = first.message_id = 'First'
    second = Message(None)
    second.subject = second.message_id = 'Second'

    roots = thread([first, second], group_by_subject=False)

    # Input order is expected to be preserved and neither root has replies.
    first_root, second_root = roots[0], roots[1]
    assert first_root['message'] == first
    assert second_root.children == []
    assert second_root['message'] == second
Esempio n. 2
0
def test_thread_two():
    """Thread a parent message and a reply that references it."""
    parent = Message(None)
    parent.subject = parent.message_id = 'First'
    reply = Message(None)
    reply.subject = reply.message_id = 'Second'
    reply.references = ['First']

    root = thread([parent, reply])[0]

    # The reply must hang below the parent as its single child.
    assert root['message'] == parent
    assert len(root.children) == 1
    assert root.children[0]['message'] == reply
Esempio n. 3
0
def test_thread_two_reverse():
    """Thread a parent and its reply when the reply is listed first."""
    parent = Message(None)
    parent.subject = parent.message_id = 'First'
    reply = Message(None)
    reply.subject = reply.message_id = 'Second'
    reply.references = ['First']

    # The child is deliberately passed before its parent.
    root = thread([reply, parent], group_by_subject=False)[0]

    assert root['message'] == parent
    assert len(root.children) == 1
    assert root.children[0]['message'] == reply
Esempio n. 4
0
def test_to_dict():
    """Check the nested dict produced by to_dict() for a root with two replies."""
    raw_emails = [
        """\
            Subject: random
            Message-ID: <message1>
            References:

            Body.""",
        """\
            Subject: Re: random
            Message-ID: <message2>
            References: <message1>

            Body.""",
        """\
            Subject: Re: random
            Message-ID: <message3>
            References: <message1>

            Body.""",
    ]

    # Parse each raw email and tag it with its position in the list.
    messages = [
        Message(message_from_string(textwrap.dedent(raw)), message_idx=idx)
        for idx, raw in enumerate(raw_emails)
    ]

    roots = thread(messages, group_by_subject=False)

    first_reply = {'id': 1, 'parent': 0, 'children': []}
    second_reply = {'id': 2, 'parent': 0, 'children': []}
    expected_tree = {
        'id': 0,
        'parent': None,
        'children': [first_reply, second_reply],
    }

    assert roots[0].to_dict() == expected_tree
Esempio n. 5
0
def test_email_make_message():
    """An email carrying only a Subject header is rejected with ValueError."""
    raw = textwrap.dedent("""\
        Subject: random

        Body.""")
    parsed = message_from_string(raw)

    with pytest.raises(ValueError):
        Message(parsed)
Esempio n. 6
0
    def transform(self,
                  data_dir,
                  file_pattern='.*',
                  dir_pattern='.*',
                  encoding='latin-1'):
        """Parse all emails in data_dir and cache the parsed messages.

        Every file returned by ``self._list_filenames`` is read, its headers
        are parsed with ``email.parser.Parser`` and the result is wrapped in
        a ``jwzthreading.Message`` whose ``message_idx`` is the position of
        the file in that listing. The run parameters and the list of
        messages are then dumped with joblib into a freshly created
        directory ``<self.cache_dir>/<dsid>``.

        Parameters
        ----------
        data_dir : str
            directory holding the emails; normalized with os.path.normpath
        file_pattern : str
            forwarded to ``self._list_filenames``
        dir_pattern : str
            forwarded to ``self._list_filenames``
        encoding : str
            stored in the saved parameters.
            NOTE(review): the value is not passed to ``open`` when the files
            are read -- confirm whether that is intended.

        Returns
        -------
        the dataset id obtained from ``generate_uuid()``

        Raises
        ------
        NotFound
            if data_dir does not exist
        WrongParameter
            if the file listing is empty
        """
        from email.parser import Parser
        from jwzthreading import Message

        data_dir = os.path.normpath(data_dir)
        if not os.path.exists(data_dir):
            raise NotFound('data_dir={} does not exist'.format(data_dir))
        self.data_dir = data_dir

        # Collect every file to parse; an empty listing is an error.
        filenames = self._list_filenames(data_dir, dir_pattern, file_pattern)
        if not filenames:
            raise WrongParameter('No files to process were found!')

        relative_names = [os.path.relpath(path, data_dir)
                          for path in filenames]

        dsid = generate_uuid()
        self.dsid = dsid
        dsid_dir = os.path.join(self.cache_dir, dsid)
        self.dsid_dir = dsid_dir

        # An existing directory would mean an id collision (should not
        # happen); start from a clean directory in that case.
        if os.path.exists(dsid_dir):
            shutil.rmtree(dsid_dir)
        os.mkdir(dsid_dir)

        pars = {
            'filenames': relative_names,
            'data_dir': data_dir,
            'n_samples': len(relative_names),
            "encoding": encoding,
            'type': type(self).__name__,
        }
        self._pars = pars

        features = []
        for idx, fname in enumerate(filenames):
            with open(fname, 'rt') as email_file:
                raw_text = email_file.read()
            # Only the headers are parsed.
            parsed = Parser().parsestr(raw_text, headersonly=True)
            features.append(Message(parsed, message_idx=idx))

        for payload, name in ((pars, 'pars'), (features, 'features')):
            joblib.dump(payload, os.path.join(dsid_dir, name), compress=9)

        return dsid
Esempio n. 7
0
def test_sorting():
    """Sort root containers by message id and by subject.

    A third message with no subject or message id set is included so that
    the ``missing`` replacement value of sort_threads is exercised.
    """
    msg_b = Message(None)
    msg_b.subject = 'b'
    msg_b.message_id = 1
    msg_a = Message(None)
    msg_a.subject = 'a'
    msg_a.message_id = 2
    blank = Message(None)

    roots = thread([msg_a, msg_b, blank], group_by_subject=False)

    # With missing=-1 the message lacking an id is expected to come first.
    by_id = sort_threads(roots, key='message_id', missing=-1)
    assert by_id[0]['message'].message_id is None
    assert by_id[1]['message'].message_id == 1

    # With missing='z' the two messages that have a subject lead, 'a' first.
    by_subject = sort_threads(roots, key='subject', missing='z')
    assert by_subject[0]['message'].message_id == 2
    assert by_subject[1]['message'].message_id == 1
Esempio n. 8
0
def test_thread_two_missing_parent():
    """Thread two siblings whose common parent message is not in the input."""
    siblings = []
    for message_id in ('First', 'Second'):
        child = Message(None)
        child.subject = 'Child'
        child.message_id = message_id
        child.references = ['parent']
        siblings.append(child)

    root = thread(siblings)[0]

    # The root is a placeholder for the missing parent: it has no message
    # but still counts towards the tree size.
    assert root['message'] is None
    assert len(root.children) == 2
    assert root.children[0]['message'] == siblings[0]
    assert root.tree_size == 3

    # Collapsing the empty container must leave a parentless root that
    # holds the first child.
    collapsed = root.collapse_empty()
    assert collapsed.tree_size == 2
    assert collapsed['message'] is not None
    assert collapsed['message'].message_id == 'First'
    assert collapsed.parent is None
Esempio n. 9
0
def test_encoded_message():
    """A base64/UTF-8 encoded Subject header is decoded by Message."""
    raw = textwrap.dedent("""\
        Subject: =?UTF-8?B?0L/QtdGA0LXQutC70LDQtA==?=
        Message-ID: <message1>
        References: <ref1> <ref2> <ref1>
        In-Reply-To: <reply>

        Body.""")
    parsed = Message(message_from_string(raw), decode_header=True)

    # repr() has to work and be non-empty for a message with such a subject.
    assert repr(parsed)
    # The encoded subject must come back as readable text.
    assert parsed.subject == 'переклад'
Esempio n. 10
0
def test_basic_message(decode_header):
    """Parse a plain email and check its subject and collected references.

    ``decode_header`` is forwarded to Message unchanged; it is presumably
    supplied by a pytest fixture or parametrization defined outside this
    function.
    """
    raw = textwrap.dedent("""\
        Subject: random
        Message-ID: <message1>
        References: <ref1> <ref2> <ref1>
        In-Reply-To: <reply>

        Body.""")
    parsed = Message(message_from_string(raw), decode_header=decode_header)

    assert repr(parsed)
    assert parsed.subject == 'random'
    # The duplicated <ref1> appears once and the In-Reply-To id is included.
    assert sorted(parsed.references) == ['ref1', 'ref2', 'reply']

    # repr() must keep working after the attributes were accessed.
    repr(parsed)
Esempio n. 11
0
def main():
    """Thread the messages of an mbox file and print the resulting trees.

    The path of the mbox file is taken from ``sys.argv[1]``. Messages for
    which ``Message`` raises ``ValueError`` are skipped. Every root
    container returned by ``thread`` is handed to ``print_container``.

    Raises
    ------
    SystemExit
        with an error message (non-zero exit status) when no mbox path was
        given on the command line.
    """
    import mailbox
    import sys

    # Fail early with a readable message instead of an IndexError traceback.
    if len(sys.argv) < 2:
        sys.exit('error: expected the path of an mbox file as first argument')

    msglist = []

    print('Reading input file...')
    mbox = mailbox.mbox(sys.argv[1])
    for message in mbox:
        try:
            parsed_msg = Message(message)
        except ValueError:
            # Message refused this entry; leave it out of the threading.
            continue
        msglist.append(parsed_msg)

    print('Threading...')
    threads = thread(msglist)

    print('Output...')
    for container in threads:
        print_container(container)
Esempio n. 12
0
def test_empty_collapsing_fedora_June2010():
    """Check that collapse_empty() removes the empty root container.

    Runs on the fedora-devel mailing list archive stored in
    ``2010-January.txt.gz`` (the data files are from January 2010, despite
    ``June2010`` in the test and constant names).
    """
    try:
        # Only availability matters here; presumably needed by the HTML
        # reference parser used below.
        import lxml  # noqa: F401
    except ImportError:
        raise SkipTest

    msglist = parse_mailbox(os.path.join(DATA_DIR, '2010-January.txt.gz'),
                            encoding='latin1', headersonly=True)

    assert len(msglist) == N_EMAILS_JUNE2010

    # NOTE(review): the reference threads are parsed and sorted but never
    # compared against in this test -- kept so the reference parser is still
    # exercised; confirm whether these two statements can be dropped.
    threads_ref = parse_mailman_htmlthread(os.path.join(DATA_DIR,
                                           '2010-January_thread.html.gz'))
    threads_ref = sort_threads(threads_ref, key='subject', missing='Z')

    threads = thread([Message(el, message_idx=idx)
                      for idx, el in enumerate(msglist)],
                     group_by_subject=False)
    # There is one single "empty root container"
    assert sum(el.get('message') is None for el in threads) == 1

    threads = [el.collapse_empty() for el in threads]

    # The empty container was removed
    assert sum(el.get('message') is None for el in threads) == 0

    # Every collapsed container is still a root.
    assert all(el.parent is None for el in threads)
Esempio n. 13
0
def get_jwz_message(message, uuid):
    """Build a ``Message`` from ``message['data']`` with ``uuid`` as its index."""
    return Message(msg=message['data'], message_idx=uuid)
Esempio n. 14
0
def test_thread_lying_message():
    """Thread a three-message chain despite a message with bogus references.

    One extra message lists the chain members in its references in an order
    that contradicts the real chain; the First -> Second -> Third nesting
    must still be produced.
    """
    def make_message(name, references=None):
        # Subject and message id share the same value, as in the sibling
        # tests; references are only set when given.
        msg = Message(None)
        msg.subject = msg.message_id = name
        if references is not None:
            msg.references = references
        return msg

    dummy_parent = make_message('Dummy parent')
    lying_before = make_message(
        'Lying before', ['Dummy parent', 'Second', 'First', 'Third'])
    first = make_message('First')
    second = make_message('Second', ['First'])
    third = make_message('Third', ['First', 'Second'])
    # No references are set on the trailing message.
    lying_after = make_message('Lying after')

    roots = thread([dummy_parent, lying_before,
                    first, second, third, lying_after],
                   group_by_subject=False)

    chain_root = roots[1]
    assert chain_root['message'] == first
    assert len(chain_root.children) == 1

    middle = chain_root.children[0]
    assert middle['message'] == second
    assert len(middle.children) == 1
    assert middle.children[0]['message'] == third
Esempio n. 15
0
def test_threading_fedora_June2010():
    """Compare JWZ threading with the Mailman reference threading.

    Uses the fedora-devel mailing list archive ``2010-January.txt.gz`` and
    the matching Mailman thread index ``2010-January_thread.html.gz`` (the
    data files are from January 2010, despite ``June2010`` in the test and
    constant names). The per-message comparison only runs when numpy is
    installed.
    """
    try:
        # Only availability matters here; presumably needed by the HTML
        # reference parser used below.
        import lxml  # noqa: F401
    except ImportError:
        raise SkipTest

    try:
        import numpy as np
        from numpy.testing import assert_array_equal
        numpy_present = True
    except ImportError:
        numpy_present = False

    msglist = parse_mailbox(os.path.join(DATA_DIR, '2010-January.txt.gz'),
                            encoding='latin1', headersonly=True)

    assert len(msglist) == N_EMAILS_JUNE2010

    threads_ref = parse_mailman_htmlthread(os.path.join(DATA_DIR,
                                           '2010-January_thread.html.gz'))
    threads_ref = sort_threads(threads_ref, key='subject', missing='Z')

    threads = thread([Message(el, message_idx=idx)
                      for idx, el in enumerate(msglist)],
                     group_by_subject=False)
    threads = sort_threads(threads, key='subject', missing='Z')

    # There is one single "empty root container", which
    # corresponds to "Re: ABRT considered painful, drago01" (idx==39, id=3) at
    # https://www.redhat.com/archives/fedora-devel-list/2010-January/thread.html
    # Mailman handles this differently, which is visually better but
    # JWZ is technically more correct IMO, just removing this case for now.
    assert sum(el.get('message') is None for el in threads) == 1

    # remove the problematic thread (cf above)
    threads = [el for el in threads if el.get('message') is not None]
    threads_ref = [el for el in threads_ref if el['message'].message_idx != 3]

    # JWZ currently incorrectly threads <Possible follow up> of the
    # "Common Lisp apps in Fedora," thread, remove the wrongly threaded
    # containers
    threads = [el for el in threads
               if el.get('message').message_idx not in [153, 285]]

    assert len(threads) == len(threads_ref)  # we deleted one thread

    # Both lists have the same length (asserted above), so they can be
    # walked in lockstep.
    for container_ref, container in zip(threads_ref, threads):
        message = container.get('message')
        message_idx = message.message_idx if message is not None else None

        assert container_ref['message'].message_idx == message_idx

        if message_idx == 55:
            # This is the "Common Lisp apps in Fedora" thread that has
            # incorrectly threaded <Possible follow up>
            continue

        assert container_ref.tree_size == container.tree_size

        # check that we have the same messages in threads
        if numpy_present:
            # Materialize once so each tree can be walked twice.
            nodes_ref = list(container_ref.flatten())
            nodes = list(container.flatten())

            assert_array_equal(
                [el['message'].message_idx for el in nodes_ref],
                [el['message'].message_idx for el in nodes])
            # Depths from JWZ are capped at MAILMAN_MAX_DEPTH before being
            # compared with the Mailman reference.
            assert_array_equal(
                [el.current_depth for el in nodes_ref],
                np.fmin([el.current_depth for el in nodes],
                        MAILMAN_MAX_DEPTH))
Esempio n. 16
0
def test_thread_single():
    """A lone message is returned as the first root container."""
    only = Message(None)
    only.subject = only.message_id = 'Single'

    roots = thread([only])

    assert roots[0]['message'] == only
Esempio n. 17
0
def test_prune_promote():
    """Pruning an empty parent with one non-empty child yields that child."""
    parent = Container()
    child = Container()
    child['message'] = Message()
    parent.add_child(child)

    # The message-less parent is dropped and its child takes its place.
    assert prune_container(parent) == [child]