def test_remove_next_part_from_content(self):
    """The pipermail "next part" separator must not be in the scrubbed text."""
    with open(get_test_file("pipermail_nextpart.txt")) as email_file:
        msg = message_from_file(email_file)
    scrubber = Scrubber("*****@*****.**", msg)
    contents = scrubber.scrub()[0]
    # assertNotIn replaces the deprecated failIf alias and shows the
    # offending content when the check fails.
    self.assertNotIn("-------------- next part --------------", contents)
def test_html_only_email(self):
    """Scrubbed content of an HTML-only email must be unicode.

    This email only has an HTML part, thus the scrubbed content will be
    empty. It should be an empty unicode string, not str.
    """
    with open(get_test_file("html-email-2.txt")) as email_file:
        msg = message_from_file(email_file)
    scrubber = Scrubber("*****@*****.**", msg)
    contents = scrubber.scrub()[0]
    # assertIsInstance reports the actual type on failure, unlike
    # assertTrue(isinstance(...)).
    self.assertIsInstance(
        contents, unicode, u"Scrubbed content should always be unicode")
def test_non_ascii_payload(self):
    """Scrubber must handle non-ascii messages"""
    # Both payload files are expected to scrub to the same unicode text.
    expected = (
        u'This message contains non-ascii '
        u'characters:\n\xe9 \xe8 \xe7 \xe0 \xee \xef \xeb \u20ac\n')
    for enc in ["utf8", "iso8859"]:
        with open(get_test_file("payload-%s.txt" % enc)) as email_file:
            msg = message_from_file(email_file)
        scrubber = Scrubber("*****@*****.**", msg)
        contents = scrubber.scrub()[0]
        # assertIsInstance reports the actual type on failure.
        self.assertIsInstance(contents, unicode)
        self.assertEqual(contents, expected)
def test_name_unicode(self):
    """Every attachment name returned by the scrubber must be unicode.

    Runs over the fixture files attachment-1.txt to attachment-5.txt.
    """
    for num in range(1, 6):
        with open(get_test_file("attachment-%d.txt" % num)) as email_file:
            msg = message_from_file(email_file)
        scrubber = Scrubber("*****@*****.**", msg)
        attachments = scrubber.scrub()[1]
        for attachment in attachments:
            # The name is the second item of each attachment tuple.
            name = attachment[1]
            # assertIsInstance reports the actual type on failure.
            self.assertIsInstance(
                name, unicode, "attachment %r must be unicode" % name)
def test_bad_content_type(self):
    """Scrubber must handle unknown content-types"""
    import traceback

    with open(get_test_file("payload-unknown.txt")) as email_file:
        msg = message_from_file(email_file)
    scrubber = Scrubber("*****@*****.**", msg)
    try:
        # Only the absence of a LookupError matters; the result is unused.
        scrubber.scrub()
    except LookupError as e:
        # "except ... as" is valid on Python 2.6+ and Python 3, unlike the
        # comma form.
        print(traceback.format_exc())
        self.fail(e)  # codec not found
def test_non_ascii_payload(self):
    """Scrubber must handle non-ascii messages"""
    # The expected text does not depend on the source encoding, so it is
    # built once outside the loop.
    expected_text = (
        u'This message contains non-ascii '
        u'characters:\n\xe9 \xe8 \xe7 \xe0 \xee \xef \xeb \u20ac\n')
    for enc in ["utf8", "iso8859"]:
        with open(get_test_file("payload-%s.txt" % enc)) as email_file:
            msg = message_from_file(email_file)
        contents = Scrubber("*****@*****.**", msg).scrub()[0]
        # assertIsInstance gives a clearer failure message than
        # assertTrue(isinstance(...)).
        self.assertIsInstance(contents, unicode)
        self.assertEqual(contents, expected_text)
def test_attachment_1(self):
    """Scrub attachment-1.txt and check its single attachment and its body."""
    expected_attachment = (
        2, u'puntogil.vcf', u'text/x-vcard', u"utf-8",
        'begin:vcard\r\nfn:gil\r\nn:;gil\r\nversion:2.1\r\n'
        'end:vcard\r\n\r\n')
    expected_body = (
        "This is a test message.\r\n\r\n"
        "\n-- \ndevel mailing list\[email protected]\n"
        "https://admin.fedoraproject.org/mailman/listinfo/devel\n")
    with open(get_test_file("attachment-1.txt")) as source:
        parsed = message_from_file(source)
    body, extracted = Scrubber("*****@*****.**", parsed).scrub()
    self.assertEqual(len(extracted), 1)
    self.assertEqual(extracted[0], expected_attachment)
    self.assertEqual(body, expected_body)
def test_html_email_1(self):
    """Scrub html-email-1.txt and check the HTML attachment and text body."""
    with open(get_test_file("html-email-1.txt")) as source:
        parsed = message_from_file(source)
    body, extracted = Scrubber("*****@*****.**", parsed).scrub()
    self.assertEqual(len(extracted), 1)
    # HTML part
    html_part = extracted[0]
    self._check_html_attachment(
        html_part, (2, u"attachment.html", "text/html", "iso-8859-1"))
    self.assertEqual(len(html_part[4]), 2723)
    # Scrubbed content
    expected_body = (
        u"This is a test message\r\n"
        u"Non-ASCII chars: r\xe9ponse fran\xe7ais \n")
    self.assertEqual(body, expected_body)
def test_html_email_1(self):
    """Check the scrubber output for the html-email-1.txt fixture."""
    html_meta = (2, u"attachment.html", "text/html", "iso-8859-1")
    text_body = (
        u"This is a test message\r\n"
        u"Non-ASCII chars: r\xe9ponse fran\xe7ais \n")
    with open(get_test_file("html-email-1.txt")) as fixture:
        mail = message_from_file(fixture)
    result = Scrubber("*****@*****.**", mail).scrub()
    contents, attachments = result
    self.assertEqual(len(attachments), 1)
    # HTML part: metadata first, then the payload size
    self._check_html_attachment(attachments[0], html_meta)
    self.assertEqual(len(attachments[0][4]), 2723)
    # Scrubbed content
    self.assertEqual(contents, text_body)
def test_attachment_5(self):
    """Scrub attachment-5.txt and check its text attachment and its body."""
    with open(get_test_file("attachment-5.txt")) as source:
        parsed = message_from_file(source)
    body, extracted = Scrubber("*****@*****.**", parsed).scrub()
    self.assertEqual(len(extracted), 1)
    # text attachment
    text_part = extracted[0]
    self.assertEqual(
        text_part[0:4],
        (2, u"todo-djeuner.txt", "text/plain", "utf-8"))
    self.assertEqual(len(text_part[4]), 112)
    # Scrubbed content
    expected_body = (
        u'This is a test, HTML message with '
        u'accented letters : \xe9 \xe8 \xe7 \xe0.\r\nAnd an '
        u'attachment with an accented filename\r\n\r\n\r\n\r\n')
    self.assertEqual(body, expected_body)
def test_attachment_name_badly_encoded(self):
    """An attachment filename with undecodable bytes must not break scrubbing."""
    mail = Message()
    mail["From"] = "*****@*****.**"
    mail["Message-ID"] = "<dummy>"
    mail.set_payload(b"Dummy content")
    mail.add_header(
        b'Content-Disposition', b'attachment',
        filename=b'non-ascii-\xb8\xb1\xb1\xbe.jpg')
    scrubber = Scrubber("*****@*****.**", mail)
    try:
        extracted = scrubber.scrub()[1]
    except UnicodeDecodeError:
        print(format_exc())
        self.fail("Could not decode the filename")
    else:
        # Only reached when scrubbing did not raise UnicodeDecodeError.
        expected = [
            (0, u'attachment.bin', 'text/plain', None, b'Dummy content')]
        self.assertEqual(extracted, expected)
def test_attachment_1(self):
    """Check the scrubber output for the attachment-1.txt fixture."""
    with open(get_test_file("attachment-1.txt")) as fixture:
        mail = message_from_file(fixture)
    result = Scrubber("*****@*****.**", mail).scrub()
    contents, attachments = result
    self.assertEqual(len(attachments), 1)
    # The only attachment is compared as a whole tuple.
    vcard = (
        2, u'puntogil.vcf', u'text/x-vcard', u"utf-8",
        'begin:vcard\r\nfn:gil\r\nn:;gil\r\nversion:2.1\r\n'
        'end:vcard\r\n\r\n')
    self.assertEqual(attachments[0], vcard)
    # Scrubbed content
    text_body = (
        "This is a test message.\r\n\r\n"
        "\n-- \ndevel mailing list\[email protected]\n"
        "https://admin.fedoraproject.org/mailman/listinfo/devel\n")
    self.assertEqual(contents, text_body)
def test_attachment_5(self):
    """Check the scrubber output for the attachment-5.txt fixture."""
    attachment_meta = (2, u"todo-djeuner.txt", "text/plain", "utf-8")
    text_body = (
        u'This is a test, HTML message with '
        u'accented letters : \xe9 \xe8 \xe7 \xe0.\r\nAnd an '
        u'attachment with an accented filename\r\n\r\n\r\n\r\n')
    with open(get_test_file("attachment-5.txt")) as fixture:
        mail = message_from_file(fixture)
    result = Scrubber("*****@*****.**", mail).scrub()
    contents, attachments = result
    self.assertEqual(len(attachments), 1)
    # text attachment: metadata first, then the payload size
    self.assertEqual(attachments[0][0:4], attachment_meta)
    self.assertEqual(len(attachments[0][4]), 112)
    # Scrubbed content
    self.assertEqual(contents, text_body)
def test_attachment_3(self):
    """Scrub attachment-3.txt and check its HTML and image attachments."""
    with open(get_test_file("attachment-3.txt")) as source:
        parsed = message_from_file(source)
    body, extracted = Scrubber("*****@*****.**", parsed).scrub()
    self.assertEqual(len(extracted), 2)
    html_part, image_part = extracted
    # HTML part
    self._check_html_attachment(
        html_part, (3, u"attachment.html", "text/html", "iso-8859-1"))
    self.assertEqual(len(html_part[4]), 3134)
    # Image attachment
    self.assertEqual(
        image_part[0:4],
        (4, u"GeoffreyRoucourt.jpg", "image/jpeg", None))
    self.assertEqual(len(image_part[4]), 282180)
    # Scrubbed content
    self.assertEqual(body, u"This is a test message\r\n")
def test_attachment_3(self):
    """Check the scrubber output for the attachment-3.txt fixture."""
    html_meta = (3, u"attachment.html", "text/html", "iso-8859-1")
    image_meta = (4, u"GeoffreyRoucourt.jpg", "image/jpeg", None)
    with open(get_test_file("attachment-3.txt")) as fixture:
        mail = message_from_file(fixture)
    result = Scrubber("*****@*****.**", mail).scrub()
    contents, attachments = result
    self.assertEqual(len(attachments), 2)
    # HTML part
    self._check_html_attachment(attachments[0], html_meta)
    self.assertEqual(len(attachments[0][4]), 3134)
    # Image attachment
    self.assertEqual(attachments[1][0:4], image_meta)
    self.assertEqual(len(attachments[1][4]), 282180)
    # Scrubbed content
    self.assertEqual(contents, u"This is a test message\r\n")
def test_attachment_name_badly_encoded(self):
    """Scrubbing must not raise UnicodeDecodeError on a non-ascii filename."""
    bad_filename = b'non-ascii-\xb8\xb1\xb1\xbe.jpg'
    expected_attachments = [
        (0, u'attachment.bin', 'text/plain', None, b'Dummy content')]

    # Build the message by hand instead of reading a fixture file.
    message = Message()
    message["From"] = "*****@*****.**"
    message["Message-ID"] = "<dummy>"
    message.set_payload(b"Dummy content")
    message.add_header(
        b'Content-Disposition', b'attachment', filename=bad_filename)

    scrubber = Scrubber("*****@*****.**", message)
    try:
        found = scrubber.scrub()[1]
    except UnicodeDecodeError:
        print(format_exc())
        self.fail("Could not decode the filename")
    self.assertEqual(found, expected_attachments)
def test_attachment_2(self):
    """Scrub attachment-2.txt and check its signature attachment and body."""
    expected_attachment = (
        3, u'signature.asc', u'application/pgp-signature', None,
        '-----BEGIN PGP SIGNATURE-----\r\nVersion: GnuPG v1.4.12 '
        '(GNU/Linux)\r\nComment: Using GnuPG with Mozilla - '
        'http://www.enigmail.net/\r\n\r\niEYEARECAAYFAlBhm3oACgkQhmBj'
        'z394AnmMnQCcC+6tWcqE1dPQmIdRbLXgKGVp\r\nEeUAn2OqtaXaXaQV7rx+'
        'SmOldmSzcFw4\r\n=OEJv\r\n-----END PGP SIGNATURE-----\r\n')
    expected_body = (
        u"This is a test message\r\nNon-ascii chars: Hofm\xfchlgasse\r\n"
        u"\n-- \ndevel mailing list\[email protected]\n"
        u"https://admin.fedoraproject.org/mailman/listinfo/devel\n")
    with open(get_test_file("attachment-2.txt")) as source:
        parsed = message_from_file(source)
    body, extracted = Scrubber("*****@*****.**", parsed).scrub()
    self.assertEqual(len(extracted), 1)
    self.assertEqual(extracted[0], expected_attachment)
    self.assertEqual(body, expected_body)
def test_attachment_4(self):
    """Scrub attachment-4.txt and check its HTML and text attachments."""
    with open(get_test_file("attachment-4.txt")) as source:
        parsed = message_from_file(source)
    body, extracted = Scrubber("*****@*****.**", parsed).scrub()
    self.assertEqual(len(extracted), 2)
    html_part, text_part = extracted
    # HTML part
    self._check_html_attachment(
        html_part, (3, u"attachment.html", "text/html", "iso-8859-1"))
    self.assertEqual(len(html_part[4]), 114)
    # text attachment
    # NOTE(review): an earlier expected name, kept here as a comment,
    # spelled the filename with an accented "e" ("todo-d<e-acute>jeuner.txt");
    # the active expectation has no such character -- presumably the
    # scrubber drops it; confirm.
    self.assertEqual(
        text_part[0:4],
        (4, u"todo-djeuner.txt", "text/plain", "utf-8"))
    self.assertEqual(len(text_part[4]), 112)
    # Scrubbed content
    expected_body = (
        u'This is a test, HTML message with '
        u'accented letters : \xe9 \xe8 \xe7 \xe0.\r\nAnd an '
        u'attachment with an accented filename\r\n')
    self.assertEqual(body, expected_body)
def test_attachment_2(self):
    """Check the scrubber output for the attachment-2.txt fixture."""
    with open(get_test_file("attachment-2.txt")) as fixture:
        mail = message_from_file(fixture)
    result = Scrubber("*****@*****.**", mail).scrub()
    contents, attachments = result
    self.assertEqual(len(attachments), 1)
    # The only attachment is compared as a whole tuple.
    signature = (
        3, u'signature.asc', u'application/pgp-signature', None,
        '-----BEGIN PGP SIGNATURE-----\r\nVersion: GnuPG v1.4.12 '
        '(GNU/Linux)\r\nComment: Using GnuPG with Mozilla - '
        'http://www.enigmail.net/\r\n\r\niEYEARECAAYFAlBhm3oACgkQhmBj'
        'z394AnmMnQCcC+6tWcqE1dPQmIdRbLXgKGVp\r\nEeUAn2OqtaXaXaQV7rx+'
        'SmOldmSzcFw4\r\n=OEJv\r\n-----END PGP SIGNATURE-----\r\n')
    self.assertEqual(attachments[0], signature)
    # Scrubbed content
    text_body = (
        u"This is a test message\r\nNon-ascii chars: Hofm\xfchlgasse\r\n"
        u"\n-- \ndevel mailing list\[email protected]\n"
        u"https://admin.fedoraproject.org/mailman/listinfo/devel\n")
    self.assertEqual(contents, text_body)
def test_attachment_4(self):
    """Check the scrubber output for the attachment-4.txt fixture."""
    html_meta = (3, u"attachment.html", "text/html", "iso-8859-1")
    # NOTE(review): a previous expectation used an accented "e" in the
    # filename; the value checked today has no accent -- presumably the
    # scrubber strips it; confirm.
    text_meta = (4, u"todo-djeuner.txt", "text/plain", "utf-8")
    text_body = (
        u'This is a test, HTML message with '
        u'accented letters : \xe9 \xe8 \xe7 \xe0.\r\nAnd an '
        u'attachment with an accented filename\r\n')
    with open(get_test_file("attachment-4.txt")) as fixture:
        mail = message_from_file(fixture)
    result = Scrubber("*****@*****.**", mail).scrub()
    contents, attachments = result
    self.assertEqual(len(attachments), 2)
    # HTML part
    self._check_html_attachment(attachments[0], html_meta)
    self.assertEqual(len(attachments[0][4]), 114)
    # text attachment
    self.assertEqual(attachments[1][0:4], text_meta)
    self.assertEqual(len(attachments[1][4]), 112)
    # Scrubbed content
    self.assertEqual(contents, text_body)
def add_to_list(list_name, message):
    """Archive an email message in the mailing list named ``list_name``.

    The mailing list, sender and thread rows are fetched or created as
    needed, the message is scrubbed, and the resulting email and its
    attachments are stored.

    Args:
        list_name: name of the mailing list (created if it does not exist).
        message: the parsed email; it must have a "Message-Id" header.
            Warning: scrubbing modifies this message in-place.

    Returns:
        The ``message_id_hash`` of the stored email, or None when the
        list's archive policy is "never".

    Raises:
        ValueError: the message has no "Message-Id" header, or the sender
            address is not ASCII.
        DuplicateMessage: an email with this message id is already stored
            for this list.
    """
    # Read once: batch mode skips the Mailman lookups and the thread
    # order/depth computation below.
    batch_mode = getattr(settings, "HYPERKITTY_BATCH_MODE", False)

    mlist = MailingList.objects.get_or_create(name=list_name)[0]
    if not batch_mode:
        mlist.update_from_mailman()
    mlist.save()
    if mlist.archive_policy == ArchivePolicy.never.value:
        logger.info("Archiving disabled by list policy for %s", list_name)
        return
    # "in" replaces the deprecated has_key() (removed in Python 3).
    if "Message-Id" not in message:
        raise ValueError("No 'Message-Id' header in email", message)
    msg_id = get_message_id(message)
    if Email.objects.filter(mailinglist=mlist, message_id=msg_id).exists():
        raise DuplicateMessage(msg_id)
    email = Email(mailinglist=mlist, message_id=msg_id)
    email.in_reply_to = get_ref(message)  # Find thread id

    # Sender
    try:
        from_name, from_email = parseaddr(message["From"])
        from_name = header_to_unicode(from_name).strip()
        sender_address = from_email.decode("ascii").strip()
    except (UnicodeDecodeError, UnicodeEncodeError):
        raise ValueError("Non-ascii sender address", message)
    if not sender_address:
        # No usable address: derive a placeholder from the display name.
        if from_name:
            sender_address = re.sub("[^a-z0-9]", "", from_name.lower())
            if not sender_address:
                sender_address = "unknown"
            sender_address = "{}@example.com".format(sender_address)
        else:
            sender_address = "*****@*****.**"
    sender = Sender.objects.get_or_create(address=sender_address)[0]
    sender.name = from_name  # update the name if needed
    sender.save()
    email.sender = sender
    if not batch_mode:
        set_sender_mailman_id(sender)

    # Headers
    email.subject = header_to_unicode(message.get("Subject"))
    if email.subject is not None:
        # limit subject size to 512, it's a varchar field
        email.subject = email.subject[:512]
    msg_date = parsedate(message.get("Date"))
    if msg_date is None:
        # Absent or unparseable date
        msg_date = timezone.now()
    # Capture the original offset before converting the date to UTC.
    utcoffset = msg_date.utcoffset()
    if msg_date.tzinfo is not None:
        msg_date = msg_date.astimezone(timezone.utc)  # store in UTC
    email.date = msg_date
    if utcoffset is None:
        email.timezone = 0
    else:
        # in minutes
        email.timezone = int(
            ((utcoffset.days * 24 * 60 * 60) + utcoffset.seconds) / 60)

    # Content
    scrubber = Scrubber(list_name, message)
    # warning: scrubbing modifies the msg in-place
    email.content, attachments = scrubber.scrub()

    # TODO: detect category?

    # Set or create the Thread
    if email.in_reply_to is not None:
        try:
            ref_msg = Email.objects.get(
                mailinglist=email.mailinglist,
                message_id=email.in_reply_to)
        except Email.DoesNotExist:
            # the parent may not be archived (on partial imports), create a
            # new thread for now.
            pass
        else:
            # re-use parent's thread-id
            email.parent = ref_msg
            email.thread_id = ref_msg.thread_id
            ref_msg.thread.date_active = email.date
            ref_msg.thread.save()
    thread_created = False
    if email.thread_id is None:
        # Create the thread if not found
        thread = Thread.objects.create(
            mailinglist=email.mailinglist,
            thread_id=email.message_id_hash,
            date_active=email.date)
        thread_created = True
        email.thread = thread
    email.save()  # must save before setting the thread.starting_email
    if thread_created:
        thread.starting_email = email
        thread.save()
        # NOTE(review): send() is used rather than send_robust();
        # presumably these are Django signals, in which case a receiver
        # exception propagates to the caller -- confirm this is intended.
        new_thread.send("Mailman", thread=thread)

    # Signals
    new_email.send("Mailman", email=email)

    # compute thread props here because email must have been saved before
    # (there will be DB queries in this function)
    if not batch_mode:
        compute_thread_order_and_depth(email.thread)

    # Attachments (email must have been saved before)
    for attachment in attachments:
        counter, name, content_type, encoding, content = attachment
        # Skip attachments already stored for this email and counter.
        if Attachment.objects.filter(email=email, counter=counter).exists():
            continue
        Attachment.objects.create(
            email=email, counter=counter, name=name,
            content_type=content_type, encoding=encoding, content=content)

    return email.message_id_hash
def add_to_list(list_name, message):
    """Store ``message`` in the archive of the mailing list ``list_name``.

    Fetches or creates the mailing list and sender, scrubs the message,
    attaches the email to its parent's thread (or creates a new thread),
    sends the ``new_thread`` / ``new_email`` signals and stores the
    attachments.

    Args:
        list_name: name of the mailing list (created if it does not exist).
        message: the parsed email; it must have a "Message-Id" header.
            Warning: scrubbing modifies this message in-place.

    Returns:
        The ``message_id_hash`` of the stored email, or None when the
        list's archive policy is "never".

    Raises:
        ValueError: the message has no "Message-Id" header, or the sender
            address is not ASCII.
        DuplicateMessage: an email with this message id is already stored
            for this list.
    """
    # Looked up once; in batch mode the Mailman lookups and the thread
    # order/depth computation are skipped.
    batch_mode = getattr(settings, "HYPERKITTY_BATCH_MODE", False)

    mlist = MailingList.objects.get_or_create(name=list_name)[0]
    if not batch_mode:
        mlist.update_from_mailman()
    mlist.save()
    if mlist.archive_policy == ArchivePolicy.never.value:
        logger.info("Archiving disabled by list policy for %s", list_name)
        return
    # Membership test instead of the deprecated has_key(), which no longer
    # exists in Python 3.
    if "Message-Id" not in message:
        raise ValueError("No 'Message-Id' header in email", message)
    msg_id = get_message_id(message)
    if Email.objects.filter(mailinglist=mlist, message_id=msg_id).exists():
        raise DuplicateMessage(msg_id)
    email = Email(mailinglist=mlist, message_id=msg_id)
    email.in_reply_to = get_ref(message)  # Find thread id

    # Sender
    try:
        from_name, from_email = parseaddr(message['From'])
        from_name = header_to_unicode(from_name).strip()
        sender_address = from_email.decode("ascii").strip()
    except (UnicodeDecodeError, UnicodeEncodeError):
        raise ValueError("Non-ascii sender address", message)
    if not sender_address:
        # Empty address: build a placeholder from the display name, if any.
        if from_name:
            sender_address = re.sub("[^a-z0-9]", "", from_name.lower())
            if not sender_address:
                sender_address = "unknown"
            sender_address = "{}@example.com".format(sender_address)
        else:
            sender_address = "*****@*****.**"
    sender = Sender.objects.get_or_create(address=sender_address)[0]
    sender.name = from_name  # update the name if needed
    sender.save()
    email.sender = sender
    if not batch_mode:
        set_sender_mailman_id(sender)

    # Headers
    email.subject = header_to_unicode(message.get('Subject'))
    if email.subject is not None:
        # limit subject size to 512, it's a varchar field
        email.subject = email.subject[:512]
    msg_date = parsedate(message.get("Date"))
    if msg_date is None:
        # Absent or unparseable date
        msg_date = timezone.now()
    # The offset must be read before the date is converted to UTC.
    utcoffset = msg_date.utcoffset()
    if msg_date.tzinfo is not None:
        msg_date = msg_date.astimezone(timezone.utc)  # store in UTC
    email.date = msg_date
    if utcoffset is None:
        email.timezone = 0
    else:
        # in minutes
        email.timezone = int(
            ((utcoffset.days * 24 * 60 * 60) + utcoffset.seconds) / 60
        )

    # Content
    scrubber = Scrubber(list_name, message)
    # warning: scrubbing modifies the msg in-place
    email.content, attachments = scrubber.scrub()

    # TODO: detect category?

    # Set or create the Thread
    if email.in_reply_to is not None:
        try:
            ref_msg = Email.objects.get(
                mailinglist=email.mailinglist,
                message_id=email.in_reply_to)
        except Email.DoesNotExist:
            # the parent may not be archived (on partial imports), create a
            # new thread for now.
            pass
        else:
            # re-use parent's thread-id
            email.parent = ref_msg
            email.thread_id = ref_msg.thread_id
            ref_msg.thread.date_active = email.date
            ref_msg.thread.save()
    thread_created = False
    if email.thread_id is None:
        # Create the thread if not found
        thread = Thread.objects.create(
            mailinglist=email.mailinglist,
            thread_id=email.message_id_hash,
            date_active=email.date)
        thread_created = True
        email.thread = thread
    email.save()  # must save before setting the thread.starting_email
    if thread_created:
        thread.starting_email = email
        thread.save()
        # NOTE(review): plain send() is used, not send_robust(); if these
        # are Django signals, an exception in a receiver will propagate
        # out of this function -- confirm this is the desired behavior.
        new_thread.send("Mailman", thread=thread)

    # Signals
    new_email.send("Mailman", email=email)

    # compute thread props here because email must have been saved before
    # (there will be DB queries in this function)
    if not batch_mode:
        compute_thread_order_and_depth(email.thread)

    # Attachments (email must have been saved before)
    for attachment in attachments:
        counter, name, content_type, encoding, content = attachment
        # An attachment with the same counter is already stored: skip it.
        if Attachment.objects.filter(email=email, counter=counter).exists():
            continue
        Attachment.objects.create(
            email=email, counter=counter, name=name,
            content_type=content_type, encoding=encoding, content=content)

    return email.message_id_hash