def test_ignore_messages(self):
    """Test if it ignores some messages without mandatory fields"""

    backend = MBox('http://example.com/', self.tmp_error_path)
    messages = list(backend.fetch())

    # Only two of the messages in the mbox carry every mandatory field
    self.assertEqual(len(messages), 2)

    expected = {
        'From': 'goran at domain.com ( Göran Lastname )',
        'Date': 'Wed, 01 Dec 2010 14:26:40 +0100',
        'Subject': '[List-name] Protocol Buffers anyone?',
        'Message-ID': '<*****@*****.**>',
        'unixfrom': 'goran at domain.com Wed Dec 1 08:26:40 2010',
        'body': {
            'plain': "Hi!\n\nA message in English, with a signature "
                     "with a different encoding.\n\nregards, G?ran\n",
        }
    }

    # Both messages hold exactly the same data; their raw headers only
    # differ in the spelling of the id header ('Message-id' vs
    # 'Message-ID'), which the backend normalizes
    for message in messages:
        self.assertDictEqual(message['data'], expected)
def test_fetch(self):
    """Test whether it parses a set of mbox files"""

    backend = MBox('http://example.com/', self.tmp_path)
    messages = [m for m in backend.fetch()]

    # (Message-ID, uuid, updated_on) triples, in fetch order
    expected = [
        ('<*****@*****.**>', '86315b479b4debe320b59c881c1e375216cbf333', 1291210000.0),
        ('<*****@*****.**>', '86315b479b4debe320b59c881c1e375216cbf333', 1291210000.0),
        ('<*****@*****.**>', 'bd0185317b013beb21ad3ea04635de3db72496ad', 1095843820.0),
        ('<*****@*****.**>', '51535703010a3e63d5272202942c283394cdebca', 1205746505.0),
        ('<019801ca633f$f4376140$dca623c0$@[email protected]>', '302e314c07242bb4750351286862f49e758f3e17', 1257992964.0),
        ('<*****@*****.**>', 'ddda42422c55d08d56c017a6f128fcd7447484ea', 1043881350.0),
        ('<*****@*****.**>', '86315b479b4debe320b59c881c1e375216cbf333', 1291210000.0),
        ('<*****@*****.**>', 'ad3116ae93c0df50436f7c84bfc94000e990996c', 1421328145.0),
        ('<*****@*****.**>', '4e255acab6442424ecbf05cb0feb1eccb587f7de', 1030123489.0),
    ]

    self.assertEqual(len(messages), len(expected))

    # zip is safe here because the lengths were asserted equal above
    for message, (message_id, uuid, updated_on) in zip(messages, expected):
        self.assertEqual(message['data']['Message-ID'], message_id)
        self.assertEqual(message['origin'], 'http://example.com/')
        self.assertEqual(message['uuid'], uuid)
        self.assertEqual(message['updated_on'], updated_on)
        self.assertEqual(message['category'], 'message')
        self.assertEqual(message['tag'], 'http://example.com/')
def test_fetch_from_date(self):
    """Test whether a list of messages is returned since a given date"""

    from_date = datetime.datetime(2008, 1, 1)

    backend = MBox('http://example.com/', self.tmp_path)
    messages = [m for m in backend.fetch(from_date=from_date)]

    # (Message-ID, uuid, updated_on) triples, in fetch order; only
    # messages dated on or after 2008-01-01 are expected
    expected = [
        ('<*****@*****.**>', '86315b479b4debe320b59c881c1e375216cbf333', 1291210000.0),
        ('<*****@*****.**>', '86315b479b4debe320b59c881c1e375216cbf333', 1291210000.0),
        ('<*****@*****.**>', '51535703010a3e63d5272202942c283394cdebca', 1205746505.0),
        ('<019801ca633f$f4376140$dca623c0$@[email protected]>', '302e314c07242bb4750351286862f49e758f3e17', 1257992964.0),
        ('<*****@*****.**>', '86315b479b4debe320b59c881c1e375216cbf333', 1291210000.0),
        ('<*****@*****.**>', 'ad3116ae93c0df50436f7c84bfc94000e990996c', 1421328145.0)
    ]

    self.assertEqual(len(messages), len(expected))

    # zip is safe here because the lengths were asserted equal above
    for message, (message_id, uuid, updated_on) in zip(messages, expected):
        self.assertEqual(message['data']['Message-ID'], message_id)
        self.assertEqual(message['origin'], 'http://example.com/')
        self.assertEqual(message['uuid'], uuid)
        self.assertEqual(message['updated_on'], updated_on)
        self.assertEqual(message['category'], 'message')
        self.assertEqual(message['tag'], 'http://example.com/')
def test_ignore_file_errors(self):
    """Files with IO errors should be ignored"""

    tmp_path_ign = tempfile.mkdtemp(prefix='perceval_')
    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                            'data/mbox')
    unreadable = os.path.join(tmp_path_ign, 'mbox_multipart.mbox')

    try:
        shutil.copy(os.path.join(data_dir, 'mbox_single.mbox'), tmp_path_ign)
        shutil.copy(os.path.join(data_dir, 'mbox_multipart.mbox'), tmp_path_ign)

        # Update file mode to make it unable to access
        os.chmod(unreadable, 0o000)

        backend = MBox('http://example.com/', tmp_path_ign)
        messages = [m for m in backend.fetch()]

        # Only one message is read; the unreadable file is skipped
        self.assertEqual(len(messages), 1)
        self.assertEqual(messages[0]['data']['Message-ID'],
                         '<*****@*****.**>')
        self.assertEqual(messages[0]['data']['Date'],
                         'Wed, 01 Dec 2010 14:26:40 +0100')
    finally:
        # Clean up even when an assertion fails; restore the file mode
        # first so the 0o000 file can be removed on any platform
        try:
            os.chmod(unreadable, 0o644)
        except OSError:
            # The copy may have failed before the chmod; nothing to restore
            pass
        shutil.rmtree(tmp_path_ign, ignore_errors=True)
def list_mailers(self, url, directory="files/mbox"):
    """Collect the 'From' header of every message in the archives.

    :param url: origin URI used to label the mbox repository
    :param directory: path of the directory that holds the mbox files

    :returns: list with the 'From' header value of each fetched message
    """
    repo = MBox(uri=url, dirpath=directory)
    # Comprehension replaces the manual append loop; the original also
    # declared an unused 'count' local, removed here
    return [message['data']['From'] for message in repo.fetch()]
def test_search_fields(self):
    """Test whether the search_fields is properly set"""

    backend = MBox('http://example.com/', self.tmp_path)

    # Iterate the generator directly; every item's 'item_id' search
    # field must match the identifier computed from its raw data
    for item in backend.fetch(from_date=None):
        self.assertEqual(backend.metadata_id(item['data']),
                         item['search_fields']['item_id'])
def test_fetch_exception(self, mock_str_to_datetime):
    """Test whether an exception is thrown when the fetch_items method fails"""

    # Any call to the patched date parser blows up, which must
    # propagate out of fetch()
    mock_str_to_datetime.side_effect = Exception

    backend = MBox('http://example.com/', self.tmp_path)

    with self.assertRaises(Exception):
        for _ in backend.fetch(from_date=None):
            pass
def test_ignore_file_errors(self):
    """Files with IO errors should be ignored"""

    tmp_path_ign = tempfile.mkdtemp(prefix='perceval_')

    def copy_mbox_side_effect(*args, **kwargs):
        """Copy a mbox archive or raise IO error for 'mbox_multipart.mbox' archive"""

        error_file = os.path.join(tmp_path_ign, 'mbox_multipart.mbox')
        mbox = args[0]

        if mbox.filepath == error_file:
            raise OSError('Mock error')

        # mkstemp (not the race-prone, deprecated mktemp) creates the
        # file atomically and hands back an open descriptor
        fd, tmp_path = tempfile.mkstemp(prefix='perceval_')

        with mbox.container as f_in:
            with os.fdopen(fd, mode='wb') as f_out:
                for line in f_in:
                    f_out.write(line)

        return tmp_path

    try:
        data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                'data/mbox')
        shutil.copy(os.path.join(data_dir, 'mbox_single.mbox'), tmp_path_ign)
        shutil.copy(os.path.join(data_dir, 'mbox_multipart.mbox'), tmp_path_ign)

        # Mock 'copy_mbox' method for forcing to raise an OSError
        # with file 'data/mbox/mbox_multipart.mbox' to check if
        # the code ignores this file
        with unittest.mock.patch('perceval.backends.core.mbox.MBox._copy_mbox') as mock_copy_mbox:
            mock_copy_mbox.side_effect = copy_mbox_side_effect

            backend = MBox('http://example.com/', tmp_path_ign)
            messages = [m for m in backend.fetch()]

            # Only one message is read
            self.assertEqual(len(messages), 1)
            self.assertEqual(messages[0]['data']['Message-ID'],
                             '<*****@*****.**>')
            self.assertEqual(messages[0]['data']['Date'],
                             'Wed, 01 Dec 2010 14:26:40 +0100')
    finally:
        # Clean up the temp tree even when an assertion fails
        shutil.rmtree(tmp_path_ign, ignore_errors=True)
def get_content(self, max_items: Optional[int] = None) -> List[str]:
    """Return only the bodies of the messages in the MBOX mail archive,
    ignoring all headers.

    The HTML body is preferred when present; otherwise the plain-text
    body is used, falling back to an empty string.

    :param max_items: maximum number of messages to return;
        ``None`` means no limit
    :return: list with the bodies of the messages of the MBOX archive
    """
    repo = MBox(self.mbox_path, self.mbox_path)
    result = []
    for index, msg in enumerate(repo.fetch()):
        # Explicit 'is not None' check: the original truthiness test
        # silently treated max_items=0 as "no limit"
        if max_items is not None and index >= max_items:
            break
        body = msg['data']['body']
        result.append(body.get('html', body.get('plain', '')))
    return result
# You need to have the archives to analyze there before running the script
mbox_dir = 'archives'

# ElasticSearch instance (url)
es = elasticsearch.Elasticsearch(['http://localhost:9200/'])

# Create the 'messages' index in ElasticSearch; refuse to run against
# an existing index so documents from different runs are never mixed
try:
    es.indices.create('messages')
except elasticsearch.exceptions.RequestError:
    print('Index already exists, remove it before running this script again.')
    exit()

# Create a mbox object, using mbox_uri as label, mbox_dir as directory to scan
repo = MBox(uri=mbox_uri, dirpath=mbox_dir)

# Fetch all messages as an iterator, and iterate it uploading to ElasticSearch
print('Analyzing mbox archives...')

for message in repo.fetch():
    # Create the object (dictionary) to upload to ElasticSearch
    summary = {
        'from': message['data']['From'],
        'subject': message['data']['Subject'],
        'date': email.utils.parsedate_to_datetime(message['data']['Date'])
    }
    print('.', end='')
    # Upload the object to ElasticSearch
    es.index(index='messages', doc_type='summary', body=summary)

print('\nCreated new index with messages.')
def getmbox(self, mbox_files):
    """Build a MBox backend over *mbox_files* and return its message iterator."""
    backend = MBox(uri=mbox_files, dirpath='./mboxes')
    return backend.fetch()
def numMails(self, url, directory="files/mbox"):
    """Count the messages stored in the mbox archives.

    :param url: origin URI used to label the mbox repository
    :param directory: path of the directory that holds the mbox files

    :returns: number of messages fetched from the archives
    """
    repo = MBox(uri=url, dirpath=directory)
    # sum over a generator counts the items without the manual
    # counter loop and without materializing the messages
    return sum(1 for _ in repo.fetch())