def from_mbox(self, mbfile):
    """Import every email contained in an mbox file into the database.

    :arg mbfile: path to a mailbox file
    """
    mbox = mailbox.mbox(mbfile)
    marker = ProgressMarker(self.verbose, self.stdout)
    if not self.since:
        marker.total = len(mbox)
    for msg in mbox:
        if self._is_too_old(msg):
            continue
        marker.tick(msg["Message-Id"])
        # Un-wrap the subject line if necessary
        if msg["subject"]:
            msg.replace_header(
                "subject", TEXTWRAP_RE.sub(" ", msg["subject"]))
        if msg.get_from():
            msg.set_unixfrom(msg.get_from())
        # Now insert the message
        try:
            with transaction.atomic():
                add_to_list(self.list_address, msg)
        except DuplicateMessage as e:
            if self.verbose:
                self.stderr.write(
                    "Duplicate email with message-id '%s'" % e.args[0])
            continue
        except ValueError as e:
            self.stderr.write("Failed adding message %s: %s"
                              % (msg.get("Message-ID"), e))
            if len(e.args) != 2:
                raise  # Regular ValueError exception
            try:
                self.stderr.write(
                    "%s from %s about %s"
                    % (e.args[0], e.args[1].get("From"),
                       e.args[1].get("Subject")))
            except UnicodeDecodeError:
                pass
            continue
        except DatabaseError:
            try:
                print_exc(file=self.stderr)
            except UnicodeError:
                pass
            self.stderr.write("Message %s failed to import, skipping"
                              % unquote(msg["Message-Id"]))
            continue
        email = Email.objects.get(mailinglist__name=self.list_address,
                                  message_id=get_message_id(msg))
        # Remember which threads were touched so that thread_order and
        # thread_depth can be recomputed afterwards.
        self.impacted_thread_ids.add(email.thread_id)
        marker.count_imported += 1
    marker.finish()
def from_mbox(self, mbfile): """ Insert all the emails contained in an mbox file into the database. :arg mbfile: a mailbox file """ #self.store.search_index = make_delayed(self.store.search_index) mbox = mailbox.mbox(mbfile) progress_marker = ProgressMarker(self.verbose, self.stdout) if not self.since: progress_marker.total = len(mbox) for message in mbox: if self._is_too_old(message): continue progress_marker.tick(message["Message-Id"]) # Un-wrap the subject line if necessary if message["subject"]: message.replace_header("subject", TEXTWRAP_RE.sub(" ", message["subject"])) # Now insert the message try: with transaction.atomic(): add_to_list(self.list_address, message) except DuplicateMessage as e: if self.verbose: self.stderr.write( "Duplicate email with message-id '%s'" % e.args[0]) continue except ValueError as e: if len(e.args) != 2: raise # Regular ValueError exception try: self.stderr.write("%s from %s about %s" % (e.args[0], e.args[1].get("From"), e.args[1].get("Subject"))) except UnicodeDecodeError: self.stderr.write("%s with message-id %s" % (e.args[0], e.args[1].get("Message-ID"))) continue except DatabaseError: try: print_exc(file=self.stderr) except UnicodeError: pass self.stderr.write("Message %s failed to import, skipping" % unquote(message["Message-Id"])) continue email = Email.objects.get( mailinglist__name=self.list_address, message_id=get_message_id(message)) ## Commit every time to be able to rollback on error #if not transaction.get_autocommit(): # transaction.commit() # Store the list of impacted threads to be able to compute the # thread_order and thread_depth values self.impacted_thread_ids.add(email.thread_id) progress_marker.count_imported += 1 #self.store.search_index.flush() # Now commit to the search index progress_marker.finish()
def test_get_folded_message_id(self):
    # A Message-ID header folded over two lines must still be extracted
    # cleanly (no surrounding whitespace or angle brackets).
    raw = """\
From: [email protected]
To: [email protected]
Subject: Test message
Message-ID:
 <a.folded.message.id>

Dummy Message
"""
    msg = message_from_string(raw)
    self.assertEqual(utils.get_message_id(msg), 'a.folded.message.id')
def add_to_list(list_name, message):
    """Archive *message* in the mailing-list *list_name*.

    Creates the list on first use, stores the email, its sender, thread
    membership and attachments, and fires the ``new_thread`` /
    ``new_email`` signals.

    :arg list_name: name of the mailing-list to archive into.
    :arg message: the email message to archive.
    :returns: the ``message_id_hash`` of the stored email, or ``None``
        when the list's archive policy forbids archiving.
    :raises ValueError: missing Message-Id header or non-ascii sender.
    :raises DuplicateMessage: the message is already archived.
    """
    # timeit("1 start")
    mlist = MailingList.objects.get_or_create(name=list_name)[0]
    if not getattr(settings, "HYPERKITTY_BATCH_MODE", False):
        mlist.update_from_mailman()
    mlist.save()
    if mlist.archive_policy == ArchivePolicy.never.value:
        logger.info("Archiving disabled by list policy for %s", list_name)
        return
    if "Message-Id" not in message:
        raise ValueError("No 'Message-Id' header in email", message)
    # timeit("2 after ml, before checking email & sender")
    msg_id = get_message_id(message)
    if Email.objects.filter(mailinglist=mlist, message_id=msg_id).exists():
        raise DuplicateMessage(msg_id)
    email = Email(mailinglist=mlist, message_id=msg_id)
    email.in_reply_to = get_ref(message)  # Find thread id
    # The mbox "From " separator line carries the original archived date.
    if message.get_unixfrom() is not None:
        mo = UNIXFROM_DATE_RE.match(message.get_unixfrom())
        if mo:
            archived_date = parsedate(mo.group(1))
            if archived_date is not None:
                email.archived_date = archived_date
    # Sender
    try:
        from_name, from_email = parseaddr(message['From'])
        from_name = header_to_unicode(from_name).strip()
        # BUGFIX: the original called from_email.decode("ascii"), a
        # Python 2 idiom; Python 3 str has no .decode(), so non-ascii
        # addresses raised AttributeError instead of being converted to
        # the ValueError below.  Round-trip through ascii bytes instead:
        # a non-ascii address now raises UnicodeEncodeError, which is
        # caught by the except clause (same approach as the newer
        # version of this function).
        sender_address = from_email.encode("ascii").decode("ascii").strip()
    except (UnicodeDecodeError, UnicodeEncodeError):
        raise ValueError("Non-ascii sender address", message)
    if not sender_address:
        # Synthesize a placeholder address from the sender name (or a
        # generic fallback) so the Sender row can still be created.
        if from_name:
            sender_address = re.sub("[^a-z0-9]", "", from_name.lower())
            if not sender_address:
                sender_address = "unknown"
            sender_address = "{}@example.com".format(sender_address)
        else:
            sender_address = "*****@*****.**"
    email.sender_name = from_name
    sender = Sender.objects.get_or_create(address=sender_address)[0]
    email.sender = sender
    if not getattr(settings, "HYPERKITTY_BATCH_MODE", False):
        try:
            sender.set_mailman_id()
        except MailmanConnectionError:
            # Best-effort: Mailman being down must not block archiving.
            pass
    # timeit("3 after sender, before email content")
    # Headers
    email.subject = header_to_unicode(message.get('Subject'))
    if email.subject is not None:
        # limit subject size to 512, it's a varchar field
        email.subject = email.subject[:512]
    msg_date = parsedate(message.get("Date"))
    if msg_date is None:
        # Absent or unparseable date
        msg_date = timezone.now()
    utcoffset = msg_date.utcoffset()
    if msg_date.tzinfo is not None:
        msg_date = msg_date.astimezone(timezone.utc)  # store in UTC
    email.date = msg_date
    if utcoffset is None:
        email.timezone = 0
    else:
        # in minutes
        email.timezone = int(
            ((utcoffset.days * 24 * 60 * 60) + utcoffset.seconds) / 60)
    # Content
    scrubber = Scrubber(message)  # warning: scrubbing modifies the msg in-place
    email.content, attachments = scrubber.scrub()
    # timeit("4 after email content, before signals")
    # TODO: detect category?
    # Set or create the Thread
    if email.in_reply_to is not None:
        try:
            ref_msg = Email.objects.get(mailinglist=email.mailinglist,
                                        message_id=email.in_reply_to)
        except Email.DoesNotExist:
            # the parent may not be archived (on partial imports), create a new
            # thread for now.
            pass
        else:
            # re-use parent's thread-id
            email.parent = ref_msg
            email.thread_id = ref_msg.thread_id
            thread = ref_msg.thread
            thread_created = False
    if email.thread_id is None:
        # Create the thread if not found
        thread, thread_created = Thread.objects.get_or_create(
            mailinglist=email.mailinglist,
            thread_id=email.message_id_hash)
    email.thread = thread
    email.save()  # must save before setting the thread.starting_email
    thread.date_active = email.date
    if thread_created:
        thread.starting_email = email
    thread.save()
    if thread_created:
        new_thread.send("Mailman", thread=thread)
    # Signals
    new_email.send("Mailman", email=email)
    # timeit("5 after signals, before save")
    # timeit("6 after save")
    # compute thread props here because email must have been saved before
    # (there will be DB queries in this function)
    if not getattr(settings, "HYPERKITTY_BATCH_MODE", False):
        compute_thread_order_and_depth(email.thread)
    # Attachments (email must have been saved before)
    for attachment in attachments:
        counter, name, content_type, encoding, content = attachment
        if Attachment.objects.filter(email=email, counter=counter).exists():
            continue
        Attachment.objects.create(email=email, counter=counter, name=name,
                                  content_type=content_type,
                                  encoding=encoding, content=content)
    return email.message_id_hash
def test_get_message_id(self):
    # An over-long message-id must be truncated to 254 characters so it
    # fits in the database column.
    msg = Message()
    raw_id = 'x' * 300
    msg["Message-Id"] = '<%s>' % raw_id
    self.assertEqual(utils.get_message_id(msg), raw_id[:254])
def add_to_list(list_name, message):
    """Archive *message* in the mailing-list *list_name*.

    Stores the email, its sender and its attachments, and attaches it to
    its parent's thread when the parent is already archived.

    :arg list_name: name of the mailing-list to archive into.
    :arg message: an ``email.message.EmailMessage`` instance.
    :returns: the ``message_id_hash`` of the stored email, or ``None``
        when the list's archive policy forbids archiving.
    :raises ValueError: missing Message-Id, non-ascii sender address, or
        a database ``DataError`` while saving.
    :raises DuplicateMessage: the message is already archived.
    """
    assert isinstance(message, EmailMessage)
    # timeit("1 start")
    mlist = MailingList.objects.get_or_create(name=list_name)[0]
    if not getattr(settings, "HYPERKITTY_BATCH_MODE", False):
        update_from_mailman(mlist.name)
    mlist.save()
    if mlist.archive_policy == ArchivePolicy.never.value:
        logger.info("Archiving disabled by list policy for %s", list_name)
        return
    if "Message-Id" not in message:
        raise ValueError("No 'Message-Id' header in email", message)
    # timeit("2 after ml, before checking email & sender")
    msg_id = get_message_id(message)
    if Email.objects.filter(mailinglist=mlist, message_id=msg_id).exists():
        raise DuplicateMessage(msg_id)
    email = Email(mailinglist=mlist, message_id=msg_id)
    email.in_reply_to = get_ref(message)  # Find thread id
    # The mbox "From " separator line carries the original archived date.
    if message.get_unixfrom() is not None:
        mo = UNIXFROM_DATE_RE.match(message.get_unixfrom())
        if mo:
            archived_date = parsedate(mo.group(1))
            if archived_date is not None:
                email.archived_date = archived_date
    # Sender
    try:
        from_str = header_to_unicode(message['From'])
        from_name, from_email = parseaddr(from_str)
        from_name = from_name.strip()
        # Round-trip through ascii bytes: a non-ascii address raises
        # UnicodeEncodeError and is reported as a ValueError below.
        sender_address = from_email.encode('ascii').decode("ascii").strip()
    except (UnicodeDecodeError, UnicodeEncodeError):
        raise ValueError("Non-ascii sender address", message)
    if not sender_address:
        # Synthesize a placeholder address from the sender name (or a
        # generic fallback) so the Sender row can still be created.
        if from_name:
            sender_address = re.sub("[^a-z0-9]", "", from_name.lower())
            if not sender_address:
                sender_address = "unknown"
            sender_address = "{}@example.com".format(sender_address)
        else:
            sender_address = "*****@*****.**"
    email.sender_name = from_name
    sender = Sender.objects.get_or_create(address=sender_address)[0]
    email.sender = sender
    if not getattr(settings, "HYPERKITTY_BATCH_MODE", False):
        # NOTE(review): presumably schedules an async lookup of the
        # sender's Mailman id — confirm against the task definition.
        sender_mailman_id(sender.pk)
    # timeit("3 after sender, before email content")
    # Headers
    email.subject = header_to_unicode(message.get('Subject'))
    if email.subject is not None:
        # limit subject size to 512, it's a varchar field
        email.subject = email.subject[:512]
    msg_date = parsedate(message.get("Date"))
    if msg_date is None:
        # Absent or unparseable date
        msg_date = timezone.now()
    utcoffset = msg_date.utcoffset()
    if msg_date.tzinfo is not None:
        msg_date = msg_date.astimezone(timezone.utc)  # store in UTC
    email.date = msg_date
    if utcoffset is None:
        email.timezone = 0
    else:
        # in minutes
        email.timezone = int(
            ((utcoffset.days * 24 * 60 * 60) + utcoffset.seconds) / 60)
    # Content
    scrubber = Scrubber(message)  # warning: scrubbing modifies the msg in-place
    email.content, attachments = scrubber.scrub()
    # timeit("4 after email content, before signals")
    # TODO: detect category?
    # Find the parent email.
    # This can't be moved to Email.on_pre_save() because Email.set_parent()
    # needs to be free to change the parent independently from the in_reply_to
    # property, and will save() the instance.
    # This, along with some of the work done in Email.on_pre_save(), could be
    # moved to an async task, but the rest of the app must be able to cope with
    # emails lacking this data, and email being process randomly (child before
    # parent). The work in Email.on_post_created() also depends on it, so be
    # careful with task dependencies if you ever do this.
    # Plus, it has "premature optimization" written all over it.
    if email.in_reply_to is not None:
        try:
            ref_msg = Email.objects.get(mailinglist=email.mailinglist,
                                        message_id=email.in_reply_to)
        except Email.DoesNotExist:
            # the parent may not be archived (on partial imports), create a new
            # thread for now.
            pass
        else:
            # re-use parent's thread-id
            email.parent = ref_msg
            email.thread_id = ref_msg.thread_id
    try:
        email.save()
    except DataError as e:
        # Surface database-level value problems as the 1-arg ValueError
        # the importer treats as a regular error.
        raise ValueError(str(e))
    # Attachments (email must have been saved before)
    for attachment in attachments:
        counter, name, content_type, encoding, content = attachment
        if Attachment.objects.filter(email=email, counter=counter).exists():
            continue
        att = Attachment.objects.create(email=email, counter=counter,
                                        name=name, content_type=content_type,
                                        encoding=encoding)
        att.set_content(content)
        att.save()
    return email.message_id_hash
def add_to_list(list_name, message):
    """Archive *message* in the mailing-list *list_name*.

    Creates the list on first use, stores the email, its sender, thread
    membership and attachments, and fires the ``new_thread`` /
    ``new_email`` signals.

    :arg list_name: name of the mailing-list to archive into.
    :arg message: the email message to archive.
    :returns: the ``message_id_hash`` of the stored email, or ``None``
        when the list's archive policy forbids archiving.
    :raises ValueError: missing Message-Id header or non-ascii sender.
    :raises DuplicateMessage: the message is already archived.
    """
    # timeit("1 start")
    mlist = MailingList.objects.get_or_create(name=list_name)[0]
    if not getattr(settings, "HYPERKITTY_BATCH_MODE", False):
        mlist.update_from_mailman()
    mlist.save()
    if mlist.archive_policy == ArchivePolicy.never.value:
        logger.info("Archiving disabled by list policy for %s", list_name)
        return
    # BUGFIX: was `if not message.has_key("Message-Id")` — dict.has_key()
    # was removed in Python 3; use the `in` operator, as the rest of the
    # codebase does.
    if "Message-Id" not in message:
        raise ValueError("No 'Message-Id' header in email", message)
    # timeit("2 after ml, before checking email & sender")
    msg_id = get_message_id(message)
    if Email.objects.filter(mailinglist=mlist, message_id=msg_id).exists():
        raise DuplicateMessage(msg_id)
    email = Email(mailinglist=mlist, message_id=msg_id)
    email.in_reply_to = get_ref(message)  # Find thread id
    # Sender
    try:
        from_name, from_email = parseaddr(message["From"])
        from_name = header_to_unicode(from_name).strip()
        # BUGFIX: was from_email.decode("ascii"), a Python 2 idiom;
        # Python 3 str has no .decode(), so non-ascii addresses raised
        # AttributeError instead of the intended ValueError.  Round-trip
        # through ascii bytes so non-ascii raises UnicodeEncodeError,
        # which the except clause below converts.
        sender_address = from_email.encode("ascii").decode("ascii").strip()
    except (UnicodeDecodeError, UnicodeEncodeError):
        raise ValueError("Non-ascii sender address", message)
    if not sender_address:
        # Synthesize a placeholder address from the sender name (or a
        # generic fallback) so the Sender row can still be created.
        if from_name:
            sender_address = re.sub("[^a-z0-9]", "", from_name.lower())
            if not sender_address:
                sender_address = "unknown"
            sender_address = "{}@example.com".format(sender_address)
        else:
            sender_address = "*****@*****.**"
    sender = Sender.objects.get_or_create(address=sender_address)[0]
    sender.name = from_name  # update the name if needed
    sender.save()
    email.sender = sender
    if not getattr(settings, "HYPERKITTY_BATCH_MODE", False):
        set_sender_mailman_id(sender)
    # timeit("3 after sender, before email content")
    # Headers
    email.subject = header_to_unicode(message.get("Subject"))
    if email.subject is not None:
        # limit subject size to 512, it's a varchar field
        email.subject = email.subject[:512]
    msg_date = parsedate(message.get("Date"))
    if msg_date is None:
        # Absent or unparseable date
        msg_date = timezone.now()
    utcoffset = msg_date.utcoffset()
    if msg_date.tzinfo is not None:
        msg_date = msg_date.astimezone(timezone.utc)  # store in UTC
    email.date = msg_date
    if utcoffset is None:
        email.timezone = 0
    else:
        # in minutes
        email.timezone = int(
            ((utcoffset.days * 24 * 60 * 60) + utcoffset.seconds) / 60)
    # Content
    scrubber = Scrubber(list_name, message)  # warning: scrubbing modifies the msg in-place
    email.content, attachments = scrubber.scrub()
    # timeit("4 after email content, before signals")
    # TODO: detect category?
    # Set or create the Thread
    if email.in_reply_to is not None:
        try:
            ref_msg = Email.objects.get(mailinglist=email.mailinglist,
                                        message_id=email.in_reply_to)
        except Email.DoesNotExist:
            # the parent may not be archived (on partial imports), create a new
            # thread for now.
            pass
        else:
            # re-use parent's thread-id
            email.parent = ref_msg
            email.thread_id = ref_msg.thread_id
            ref_msg.thread.date_active = email.date
            ref_msg.thread.save()
    thread_created = False
    if email.thread_id is None:
        # Create the thread if not found
        thread = Thread.objects.create(
            mailinglist=email.mailinglist,
            thread_id=email.message_id_hash,
            date_active=email.date)
        thread_created = True
        email.thread = thread
    email.save()  # must save before setting the thread.starting_email
    if thread_created:
        thread.starting_email = email
        thread.save()
        new_thread.send("Mailman", thread=thread)
    # Signals
    new_email.send("Mailman", email=email)
    # timeit("5 after signals, before save")
    # timeit("6 after save")
    # compute thread props here because email must have been saved before
    # (there will be DB queries in this function)
    if not getattr(settings, "HYPERKITTY_BATCH_MODE", False):
        compute_thread_order_and_depth(email.thread)
    # Attachments (email must have been saved before)
    for attachment in attachments:
        counter, name, content_type, encoding, content = attachment
        if Attachment.objects.filter(email=email, counter=counter).exists():
            continue
        Attachment.objects.create(
            email=email, counter=counter, name=name,
            content_type=content_type, encoding=encoding, content=content)
    return email.message_id_hash
def from_mbox(self, mbfile):
    """ Insert all the emails contained in an mbox file into the database.

    Each mbox message is re-parsed into an ``email.message.EmailMessage``,
    its Date header is repaired when missing or unparseable, and any
    failure short of a duplicate is logged and skipped so the rest of the
    archive still imports.

    :arg mbfile: a mailbox file
    """
    mbox = mailbox.mbox(mbfile)
    progress_marker = ProgressMarker(self.verbose, self.stdout)
    if not self.since:
        progress_marker.total = len(mbox)
    for msg in mbox:
        # FIXME: this converts mailbox.mboxMessage to
        # email.message.EmailMessage
        msg_raw = msg.as_bytes(unixfrom=False)
        unixfrom = msg.get_from()
        message = message_from_bytes(msg_raw, policy=policy.default)
        # Fix missing and wierd Date: headers.
        date = (self._get_date(message, "date")
                or self._get_date(message, "resent-date"))
        if unixfrom and not date:
            # Fall back to the date embedded in the mbox "From " line.
            date = " ".join(unixfrom.split()[1:])
        if date:
            # Make sure this date can be parsed before setting it as as the
            # header. If not, a TypeError is raised and we just keep the
            # old Header.
            with suppress(TypeError):
                del message['Date']
                message['Date'] = date
        if self._is_too_old(message):
            continue
        progress_marker.tick(message["Message-Id"])
        # Un-wrap the subject line if necessary
        if message["subject"]:
            message.replace_header(
                "subject", TEXTWRAP_RE.sub(" ", message["subject"]))
        if unixfrom:
            message.set_unixfrom(unixfrom)
        if message['message-id'] is None:
            # Some archives contain messages without a Message-ID;
            # generate one so the email can still be stored.
            message['Message-ID'] = make_msgid('generated')
        # Now insert the message
        try:
            with transaction.atomic():
                add_to_list(self.list_address, message)
        except DuplicateMessage as e:
            if self.verbose:
                self.stderr.write("Duplicate email with message-id '%s'"
                                  % e.args[0])
            continue
        except (LookupError, UnicodeError, ValueError) as e:
            self.stderr.write("Failed adding message %s: %s"
                              % (message.get("Message-ID"), e))
            if len(e.args) == 2:
                # 2-arg errors carry (reason, message): report the
                # offending message's From/Subject when decodable.
                try:
                    self.stderr.write(
                        "%s from %s about %s"
                        % (e.args[0], e.args[1].get("From"),
                           e.args[1].get("Subject")))
                except UnicodeDecodeError:
                    pass
            # Don't reraise the exception
            continue
        except DatabaseError:
            try:
                print_exc(file=self.stderr)
            except UnicodeError:
                pass
            self.stderr.write("Message %s failed to import, skipping"
                              % unquote(message["Message-Id"]))
            continue
        except Exception as e:
            # In case of *any* exception, log and continue to import the
            # rest of the archive.
            self.stderr.write(
                "Message {} failed to import, skipping".format(
                    unquote(message["Message-ID"])))
            # NOTE(review): passes the exception object itself to
            # stderr.write — assumes the writer coerces it to str; confirm.
            self.stderr.write(e)
            continue
        email = Email.objects.get(mailinglist__name=self.list_address,
                                  message_id=get_message_id(message))
        # # Commit every time to be able to rollback on error
        # if not transaction.get_autocommit():
        #     transaction.commit()
        # Store the list of impacted threads to be able to compute the
        # thread_order and thread_depth values
        self.impacted_thread_ids.add(email.thread_id)
        progress_marker.count_imported += 1
    # self.store.search_index.flush()  # Now commit to the search index
    progress_marker.finish()
    mbox.close()
def test_get_message_id(self):
    # get_message_id() must strip the angle brackets and cap the result
    # at 254 characters.
    msg = Message()
    raw_id = "x" * 300
    msg["Message-Id"] = "<%s>" % raw_id
    self.assertEqual(utils.get_message_id(msg), raw_id[:254])