Example #1
0
    def from_mbox(self, mbfile):
        """
        Insert all the emails contained in an mbox file into the database.

        Messages for which ``self._is_too_old()`` is true are skipped. Every
        other message is handed to ``add_to_list()`` inside its own
        ``transaction.atomic()`` block. The thread id of each imported email
        is recorded in ``self.impacted_thread_ids`` so that thread_order and
        thread_depth can be computed afterwards.

        :arg mbfile: a mailbox file
        :raises ValueError: when ``add_to_list()`` raises a ``ValueError``
            that does not carry exactly two arguments
        """
        mbox = mailbox.mbox(mbfile)
        try:
            progress_marker = ProgressMarker(self.verbose, self.stdout)
            if not self.since:
                # The total is only set when no "since" limit is configured
                progress_marker.total = len(mbox)
            for message in mbox:
                if self._is_too_old(message):
                    continue
                progress_marker.tick(message["Message-Id"])
                # Un-wrap the subject line if necessary
                if message["subject"]:
                    message.replace_header(
                        "subject", TEXTWRAP_RE.sub(" ", message["subject"]))
                # Copy the mbox "From " line onto the message's unixfrom
                if message.get_from():
                    message.set_unixfrom(message.get_from())
                # Now insert the message
                try:
                    with transaction.atomic():
                        add_to_list(self.list_address, message)
                except DuplicateMessage as e:
                    if self.verbose:
                        self.stderr.write(
                            "Duplicate email with message-id '%s'" %
                            e.args[0])
                    continue
                except ValueError as e:
                    self.stderr.write("Failed adding message %s: %s" %
                                      (message.get("Message-ID"), e))
                    if len(e.args) != 2:
                        raise  # Regular ValueError exception
                    # Two arguments: (reason, offending message)
                    try:
                        self.stderr.write("%s from %s about %s" %
                                          (e.args[0], e.args[1].get("From"),
                                           e.args[1].get("Subject")))
                    except UnicodeDecodeError:
                        # Best effort: the first line above was already
                        # written, skip the detailed one.
                        pass
                    continue
                except DatabaseError:
                    try:
                        print_exc(file=self.stderr)
                    except UnicodeError:
                        # The traceback itself can't be written; still
                        # report the skipped message below.
                        pass
                    self.stderr.write(
                        "Message %s failed to import, skipping" %
                        unquote(message["Message-Id"]))
                    continue
                email = Email.objects.get(
                    mailinglist__name=self.list_address,
                    message_id=get_message_id(message))
                # Store the list of impacted threads to be able to compute
                # the thread_order and thread_depth values
                self.impacted_thread_ids.add(email.thread_id)
                progress_marker.count_imported += 1
            progress_marker.finish()
        finally:
            # Always release the mailbox file, even when the import aborts
            mbox.close()
    def from_mbox(self, mbfile):
        """
        Insert all the emails contained in an mbox file into the database.

        Messages for which ``self._is_too_old()`` is true are skipped. Every
        other message is handed to ``add_to_list()`` inside its own
        ``transaction.atomic()`` block. The thread id of each imported email
        is recorded in ``self.impacted_thread_ids`` so that thread_order and
        thread_depth can be computed afterwards.

        :arg mbfile: a mailbox file
        :raises ValueError: when ``add_to_list()`` raises a ``ValueError``
            that does not carry exactly two arguments
        """
        mbox = mailbox.mbox(mbfile)
        try:
            progress_marker = ProgressMarker(self.verbose, self.stdout)
            if not self.since:
                # The total is only set when no "since" limit is configured
                progress_marker.total = len(mbox)
            for message in mbox:
                if self._is_too_old(message):
                    continue
                progress_marker.tick(message["Message-Id"])
                # Un-wrap the subject line if necessary
                if message["subject"]:
                    message.replace_header(
                        "subject", TEXTWRAP_RE.sub(" ", message["subject"]))
                # Now insert the message
                try:
                    with transaction.atomic():
                        add_to_list(self.list_address, message)
                except DuplicateMessage as e:
                    if self.verbose:
                        self.stderr.write(
                            "Duplicate email with message-id '%s'"
                            % e.args[0])
                    continue
                except ValueError as e:
                    if len(e.args) != 2:
                        raise  # Regular ValueError exception
                    # Two arguments: (reason, offending message)
                    try:
                        self.stderr.write(
                            "%s from %s about %s"
                            % (e.args[0], e.args[1].get("From"),
                               e.args[1].get("Subject")))
                    except UnicodeDecodeError:
                        # Fall back to identifying the message by its id
                        self.stderr.write(
                            "%s with message-id %s"
                            % (e.args[0], e.args[1].get("Message-ID")))
                    continue
                except DatabaseError:
                    try:
                        print_exc(file=self.stderr)
                    except UnicodeError:
                        # The traceback itself can't be written; still
                        # report the skipped message below.
                        pass
                    self.stderr.write(
                        "Message %s failed to import, skipping"
                        % unquote(message["Message-Id"]))
                    continue
                email = Email.objects.get(
                    mailinglist__name=self.list_address,
                    message_id=get_message_id(message))
                # Store the list of impacted threads to be able to compute
                # the thread_order and thread_depth values
                self.impacted_thread_ids.add(email.thread_id)
                progress_marker.count_imported += 1
            progress_marker.finish()
        finally:
            # Always release the mailbox file, even when the import aborts
            mbox.close()
Example #3
0
    def test_get_folded_message_id(self):
        msg = message_from_string("""\
From: [email protected]
To: [email protected]
Subject: Test message
Message-ID:
 <a.folded.message.id>

Dummy Message
""")
        self.assertEqual(utils.get_message_id(msg), 'a.folded.message.id')
Example #4
0
def add_to_list(list_name, message):
    """
    Archive an email message into the mailing list named ``list_name``.

    The list is fetched or created, then an ``Email`` is built from the
    message (sender, subject, date, scrubbed content), attached to its
    parent's thread or to a new ``Thread``, and saved along with its
    attachments. The ``new_thread`` (for a newly created thread) and
    ``new_email`` signals are sent after the rows are saved.

    Calls to Mailman and the thread order/depth computation are skipped when
    ``settings.HYPERKITTY_BATCH_MODE`` is true.

    :arg list_name: name of the mailing list the message belongs to
    :arg message: the email message to archive; scrubbing modifies it
        in place
    :returns: the ``message_id_hash`` of the stored email, or ``None`` when
        the list's archive policy is ``never``
    :raises ValueError: if the message has no ``Message-Id`` header or its
        sender address is not ASCII; the message is passed as the second
        exception argument
    :raises DuplicateMessage: if the list already holds an email with the
        same message-id
    """
    mlist = MailingList.objects.get_or_create(name=list_name)[0]
    if not getattr(settings, "HYPERKITTY_BATCH_MODE", False):
        mlist.update_from_mailman()
    mlist.save()
    if mlist.archive_policy == ArchivePolicy.never.value:
        logger.info("Archiving disabled by list policy for %s", list_name)
        return
    if "Message-Id" not in message:
        raise ValueError("No 'Message-Id' header in email", message)
    msg_id = get_message_id(message)
    if Email.objects.filter(mailinglist=mlist, message_id=msg_id).exists():
        raise DuplicateMessage(msg_id)
    email = Email(mailinglist=mlist, message_id=msg_id)
    email.in_reply_to = get_ref(message)  # Find thread id
    # Take the archived date from the mbox "From " line when it has one
    if message.get_unixfrom() is not None:
        mo = UNIXFROM_DATE_RE.match(message.get_unixfrom())
        if mo:
            archived_date = parsedate(mo.group(1))
            if archived_date is not None:
                email.archived_date = archived_date

    # Sender
    try:
        from_name, from_email = parseaddr(message['From'])
        from_name = header_to_unicode(from_name).strip()
        # encode() then decode() rejects non-ASCII addresses on both byte
        # and text strings; a bare .decode() does not exist on Python 3 str.
        sender_address = from_email.encode("ascii").decode("ascii").strip()
    except (UnicodeDecodeError, UnicodeEncodeError):
        raise ValueError("Non-ascii sender address", message)
    if not sender_address:
        # No usable address: make one up from the display name
        if from_name:
            sender_address = re.sub("[^a-z0-9]", "", from_name.lower())
            if not sender_address:
                sender_address = "unknown"
            sender_address = "{}@example.com".format(sender_address)
        else:
            sender_address = "*****@*****.**"
    email.sender_name = from_name
    sender = Sender.objects.get_or_create(address=sender_address)[0]
    email.sender = sender
    if not getattr(settings, "HYPERKITTY_BATCH_MODE", False):
        try:
            sender.set_mailman_id()
        except MailmanConnectionError:
            # Mailman is unreachable: archive the email anyway
            pass

    # Headers
    email.subject = header_to_unicode(message.get('Subject'))
    if email.subject is not None:
        # limit subject size to 512, it's a varchar field
        email.subject = email.subject[:512]
    msg_date = parsedate(message.get("Date"))
    if msg_date is None:
        # Absent or unparseable date
        msg_date = timezone.now()
    # Read the offset before the conversion to UTC below discards it
    utcoffset = msg_date.utcoffset()
    if msg_date.tzinfo is not None:
        msg_date = msg_date.astimezone(timezone.utc)  # store in UTC
    email.date = msg_date
    if utcoffset is None:
        email.timezone = 0
    else:
        # in minutes
        email.timezone = int(
            ((utcoffset.days * 24 * 60 * 60) + utcoffset.seconds) / 60)

    # Content
    scrubber = Scrubber(message)
    # warning: scrubbing modifies the msg in-place
    email.content, attachments = scrubber.scrub()

    # TODO: detect category?

    # Set or create the Thread
    if email.in_reply_to is not None:
        try:
            ref_msg = Email.objects.get(mailinglist=email.mailinglist,
                                        message_id=email.in_reply_to)
        except Email.DoesNotExist:
            # the parent may not be archived (on partial imports), create a new
            # thread for now.
            pass
        else:
            # re-use parent's thread-id
            email.parent = ref_msg
            email.thread_id = ref_msg.thread_id
            thread = ref_msg.thread

    thread_created = False
    if email.thread_id is None:
        # Create the thread if not found
        thread, thread_created = Thread.objects.get_or_create(
            mailinglist=email.mailinglist, thread_id=email.message_id_hash)
        email.thread = thread

    email.save()  # must save before setting the thread.starting_email

    thread.date_active = email.date
    if thread_created:
        thread.starting_email = email
    thread.save()
    if thread_created:
        new_thread.send("Mailman", thread=thread)

    # Signals
    new_email.send("Mailman", email=email)
    # compute thread props here because email must have been saved before
    # (there will be DB queries in this function)
    if not getattr(settings, "HYPERKITTY_BATCH_MODE", False):
        compute_thread_order_and_depth(email.thread)

    # Attachments (email must have been saved before)
    for attachment in attachments:
        counter, name, content_type, encoding, content = attachment
        if Attachment.objects.filter(email=email, counter=counter).exists():
            continue
        Attachment.objects.create(email=email,
                                  counter=counter,
                                  name=name,
                                  content_type=content_type,
                                  encoding=encoding,
                                  content=content)

    return email.message_id_hash
Example #5
0
 def test_get_message_id(self):
     """A 300-character message-id comes back cut to 254 characters."""
     oversized_id = 'x' * 300
     msg = Message()
     msg["Message-Id"] = '<%s>' % oversized_id
     extracted_id = utils.get_message_id(msg)
     self.assertEqual(extracted_id, 'x' * 254)
def add_to_list(list_name, message):
    """
    Archive an email message into the mailing list named ``list_name``.

    The list is fetched or created, then an ``Email`` is built from the
    message (sender, subject, date, scrubbed content), linked to its parent
    email when that one is archived, and saved along with its attachments.

    Calls to ``update_from_mailman`` and ``sender_mailman_id`` are skipped
    when ``settings.HYPERKITTY_BATCH_MODE`` is true.

    :arg list_name: name of the mailing list the message belongs to
    :arg message: the email to archive, an ``EmailMessage`` instance
        (checked with an assertion); scrubbing modifies it in place
    :returns: the ``message_id_hash`` of the stored email, or ``None`` when
        the list's archive policy is ``never``
    :raises ValueError: if the message has no ``Message-Id`` header or its
        sender address is not ASCII (the message is then the second
        exception argument), or if saving the email raises ``DataError``
    :raises DuplicateMessage: if the list already holds an email with the
        same message-id
    """
    assert isinstance(message, EmailMessage)
    mlist = MailingList.objects.get_or_create(name=list_name)[0]
    if not getattr(settings, "HYPERKITTY_BATCH_MODE", False):
        update_from_mailman(mlist.name)
    mlist.save()
    if mlist.archive_policy == ArchivePolicy.never.value:
        logger.info("Archiving disabled by list policy for %s", list_name)
        return
    if "Message-Id" not in message:
        raise ValueError("No 'Message-Id' header in email", message)
    msg_id = get_message_id(message)
    if Email.objects.filter(mailinglist=mlist, message_id=msg_id).exists():
        raise DuplicateMessage(msg_id)
    email = Email(mailinglist=mlist, message_id=msg_id)
    email.in_reply_to = get_ref(message)  # Find thread id
    # Take the archived date from the mbox "From " line when it has one
    if message.get_unixfrom() is not None:
        mo = UNIXFROM_DATE_RE.match(message.get_unixfrom())
        if mo:
            archived_date = parsedate(mo.group(1))
            if archived_date is not None:
                email.archived_date = archived_date

    # Sender
    try:
        from_str = header_to_unicode(message['From'])
        from_name, from_email = parseaddr(from_str)
        from_name = from_name.strip()
        # The encode/decode round-trip rejects non-ASCII addresses
        sender_address = from_email.encode('ascii').decode("ascii").strip()
    except (UnicodeDecodeError, UnicodeEncodeError) as e:
        # Keep the encoding error as the explicit cause
        raise ValueError("Non-ascii sender address", message) from e
    if not sender_address:
        # No usable address: make one up from the display name
        if from_name:
            sender_address = re.sub("[^a-z0-9]", "", from_name.lower())
            if not sender_address:
                sender_address = "unknown"
            sender_address = "{}@example.com".format(sender_address)
        else:
            sender_address = "*****@*****.**"
    email.sender_name = from_name
    sender = Sender.objects.get_or_create(address=sender_address)[0]
    email.sender = sender
    if not getattr(settings, "HYPERKITTY_BATCH_MODE", False):
        sender_mailman_id(sender.pk)

    # Headers
    email.subject = header_to_unicode(message.get('Subject'))
    if email.subject is not None:
        # limit subject size to 512, it's a varchar field
        email.subject = email.subject[:512]
    msg_date = parsedate(message.get("Date"))
    if msg_date is None:
        # Absent or unparseable date
        msg_date = timezone.now()
    # Read the offset before the conversion to UTC below discards it
    utcoffset = msg_date.utcoffset()
    if msg_date.tzinfo is not None:
        msg_date = msg_date.astimezone(timezone.utc)  # store in UTC
    email.date = msg_date
    if utcoffset is None:
        email.timezone = 0
    else:
        # in minutes
        email.timezone = int(
            ((utcoffset.days * 24 * 60 * 60) + utcoffset.seconds) / 60)

    # Content
    scrubber = Scrubber(message)
    # warning: scrubbing modifies the msg in-place
    email.content, attachments = scrubber.scrub()

    # TODO: detect category?

    # Find the parent email.
    # This can't be moved to Email.on_pre_save() because Email.set_parent()
    # needs to be free to change the parent independently from the in_reply_to
    # property, and will save() the instance.
    # This, along with some of the work done in Email.on_pre_save(), could be
    # moved to an async task, but the rest of the app must be able to cope with
    # emails lacking this data, and emails being processed in random order
    # (child before parent). The work in Email.on_post_created() also depends
    # on it, so be careful with task dependencies if you ever do this.
    # Plus, it has "premature optimization" written all over it.
    if email.in_reply_to is not None:
        try:
            ref_msg = Email.objects.get(mailinglist=email.mailinglist,
                                        message_id=email.in_reply_to)
        except Email.DoesNotExist:
            # the parent may not be archived (on partial imports), create a new
            # thread for now.
            pass
        else:
            # re-use parent's thread-id
            email.parent = ref_msg
            email.thread_id = ref_msg.thread_id

    try:
        email.save()
    except DataError as e:
        # Surface database data errors as ValueError, keeping the cause
        raise ValueError(str(e)) from e

    # Attachments (email must have been saved before)
    for attachment in attachments:
        counter, name, content_type, encoding, content = attachment
        if Attachment.objects.filter(email=email, counter=counter).exists():
            continue
        att = Attachment.objects.create(email=email,
                                        counter=counter,
                                        name=name,
                                        content_type=content_type,
                                        encoding=encoding)
        att.set_content(content)
        att.save()

    return email.message_id_hash
Example #7
0
 def test_get_message_id(self):
     """A 300-character message-id comes back cut to 254 characters."""
     oversized_id = 'x' * 300
     msg = Message()
     msg["Message-Id"] = '<%s>' % oversized_id
     extracted_id = utils.get_message_id(msg)
     self.assertEqual(extracted_id, 'x' * 254)
Example #8
0
def add_to_list(list_name, message):
    """
    Archive an email message into the mailing list named ``list_name``.

    The list is fetched or created, then an ``Email`` is built from the
    message (sender, subject, date, scrubbed content), attached to its
    parent's thread or to a new ``Thread``, and saved along with its
    attachments. The ``new_thread`` (for a newly created thread) and
    ``new_email`` signals are sent after the rows are saved.

    Calls to Mailman and the thread order/depth computation are skipped when
    ``settings.HYPERKITTY_BATCH_MODE`` is true.

    :arg list_name: name of the mailing list the message belongs to
    :arg message: the email message to archive; scrubbing modifies it
        in place
    :returns: the ``message_id_hash`` of the stored email, or ``None`` when
        the list's archive policy is ``never``
    :raises ValueError: if the message has no ``Message-Id`` header or its
        sender address is not ASCII; the message is passed as the second
        exception argument
    :raises DuplicateMessage: if the list already holds an email with the
        same message-id
    """
    mlist = MailingList.objects.get_or_create(name=list_name)[0]
    if not getattr(settings, "HYPERKITTY_BATCH_MODE", False):
        mlist.update_from_mailman()
    mlist.save()
    if mlist.archive_policy == ArchivePolicy.never.value:
        logger.info("Archiving disabled by list policy for %s", list_name)
        return
    # "in" replaces Message.has_key(), which Python 3 no longer provides
    if "Message-Id" not in message:
        raise ValueError("No 'Message-Id' header in email", message)
    msg_id = get_message_id(message)
    if Email.objects.filter(mailinglist=mlist, message_id=msg_id).exists():
        raise DuplicateMessage(msg_id)
    email = Email(mailinglist=mlist, message_id=msg_id)
    email.in_reply_to = get_ref(message)  # Find thread id

    # Sender
    try:
        from_name, from_email = parseaddr(message["From"])
        from_name = header_to_unicode(from_name).strip()
        # encode() then decode() rejects non-ASCII addresses on both byte
        # and text strings; a bare .decode() does not exist on Python 3 str.
        sender_address = from_email.encode("ascii").decode("ascii").strip()
    except (UnicodeDecodeError, UnicodeEncodeError):
        raise ValueError("Non-ascii sender address", message)
    if not sender_address:
        # No usable address: make one up from the display name
        if from_name:
            sender_address = re.sub("[^a-z0-9]", "", from_name.lower())
            if not sender_address:
                sender_address = "unknown"
            sender_address = "{}@example.com".format(sender_address)
        else:
            sender_address = "*****@*****.**"
    sender = Sender.objects.get_or_create(address=sender_address)[0]
    sender.name = from_name  # update the name if needed
    sender.save()
    email.sender = sender
    if not getattr(settings, "HYPERKITTY_BATCH_MODE", False):
        set_sender_mailman_id(sender)

    # Headers
    email.subject = header_to_unicode(message.get("Subject"))
    if email.subject is not None:
        # limit subject size to 512, it's a varchar field
        email.subject = email.subject[:512]
    msg_date = parsedate(message.get("Date"))
    if msg_date is None:
        # Absent or unparseable date
        msg_date = timezone.now()
    # Read the offset before the conversion to UTC below discards it
    utcoffset = msg_date.utcoffset()
    if msg_date.tzinfo is not None:
        msg_date = msg_date.astimezone(timezone.utc)  # store in UTC
    email.date = msg_date
    if utcoffset is None:
        email.timezone = 0
    else:
        # in minutes
        email.timezone = int(
            ((utcoffset.days * 24 * 60 * 60) + utcoffset.seconds) / 60)

    # Content
    scrubber = Scrubber(list_name, message)
    # warning: scrubbing modifies the msg in-place
    email.content, attachments = scrubber.scrub()

    # TODO: detect category?

    # Set or create the Thread
    if email.in_reply_to is not None:
        try:
            ref_msg = Email.objects.get(
                mailinglist=email.mailinglist, message_id=email.in_reply_to)
        except Email.DoesNotExist:
            # the parent may not be archived (on partial imports), create a new
            # thread for now.
            pass
        else:
            # re-use parent's thread-id
            email.parent = ref_msg
            email.thread_id = ref_msg.thread_id
            ref_msg.thread.date_active = email.date
            ref_msg.thread.save()

    thread_created = False
    if email.thread_id is None:
        # Create the thread if not found
        thread = Thread.objects.create(
            mailinglist=email.mailinglist,
            thread_id=email.message_id_hash,
            date_active=email.date,
        )
        thread_created = True
        email.thread = thread

    email.save()  # must save before setting the thread.starting_email

    if thread_created:
        thread.starting_email = email
        thread.save()
        new_thread.send("Mailman", thread=thread)

    # Signals
    new_email.send("Mailman", email=email)
    # compute thread props here because email must have been saved before
    # (there will be DB queries in this function)
    if not getattr(settings, "HYPERKITTY_BATCH_MODE", False):
        compute_thread_order_and_depth(email.thread)

    # Attachments (email must have been saved before)
    for attachment in attachments:
        counter, name, content_type, encoding, content = attachment
        if Attachment.objects.filter(email=email, counter=counter).exists():
            continue
        Attachment.objects.create(
            email=email,
            counter=counter,
            name=name,
            content_type=content_type,
            encoding=encoding,
            content=content,
        )

    return email.message_id_hash
Example #9
0
    def from_mbox(self, mbfile):
        """
        Insert all the emails contained in an mbox file into the database.

        Each mbox entry is re-parsed with ``policy.default``, given a
        ``Date`` header when a usable date is found, and a generated
        ``Message-ID`` when it has none. Messages for which
        ``self._is_too_old()`` is true are skipped. Every other message is
        handed to ``add_to_list()`` inside its own ``transaction.atomic()``
        block; any exception raised there is reported on ``self.stderr`` and
        the import continues with the next message.

        The thread id of each imported email is recorded in
        ``self.impacted_thread_ids`` so that thread_order and thread_depth
        can be computed afterwards.

        :arg mbfile: a mailbox file
        """
        mbox = mailbox.mbox(mbfile)
        try:
            progress_marker = ProgressMarker(self.verbose, self.stdout)
            if not self.since:
                # The total is only set when no "since" limit is configured
                progress_marker.total = len(mbox)
            for msg in mbox:
                # FIXME: this converts mailbox.mboxMessage to
                # email.message.EmailMessage
                msg_raw = msg.as_bytes(unixfrom=False)
                unixfrom = msg.get_from()
                message = message_from_bytes(msg_raw, policy=policy.default)
                # Fix missing and weird Date: headers.
                date = (self._get_date(message, "date")
                        or self._get_date(message, "resent-date"))
                if unixfrom and not date:
                    # Fall back to the "From " line minus its first word
                    date = " ".join(unixfrom.split()[1:])

                if date:
                    # Make sure this date can be parsed before setting it as
                    # the header. If not, a TypeError is raised and
                    # suppressed.
                    # NOTE(review): the old header is deleted before the
                    # assignment, so after a TypeError the message has no
                    # Date header at all -- confirm this is intended.
                    with suppress(TypeError):
                        del message['Date']
                        message['Date'] = date

                if self._is_too_old(message):
                    continue
                progress_marker.tick(message["Message-Id"])
                # Un-wrap the subject line if necessary
                if message["subject"]:
                    message.replace_header(
                        "subject", TEXTWRAP_RE.sub(" ", message["subject"]))
                if unixfrom:
                    message.set_unixfrom(unixfrom)
                if message['message-id'] is None:
                    message['Message-ID'] = make_msgid('generated')
                # Now insert the message
                try:
                    with transaction.atomic():
                        add_to_list(self.list_address, message)
                except DuplicateMessage as e:
                    if self.verbose:
                        self.stderr.write(
                            "Duplicate email with message-id '%s'" %
                            e.args[0])
                    continue
                except (LookupError, UnicodeError, ValueError) as e:
                    self.stderr.write("Failed adding message %s: %s" %
                                      (message.get("Message-ID"), e))
                    if len(e.args) == 2:
                        # Two arguments: (reason, offending message)
                        try:
                            self.stderr.write(
                                "%s from %s about %s" %
                                (e.args[0], e.args[1].get("From"),
                                 e.args[1].get("Subject")))
                        except UnicodeDecodeError:
                            # Best effort: the first line above was already
                            # written, skip the detailed one.
                            pass
                    # Don't reraise the exception
                    continue
                except DatabaseError:
                    try:
                        print_exc(file=self.stderr)
                    except UnicodeError:
                        # The traceback itself can't be written; still
                        # report the skipped message below.
                        pass
                    self.stderr.write(
                        "Message %s failed to import, skipping" %
                        unquote(message["Message-Id"]))
                    continue
                except Exception as e:
                    # In case of *any* exception, log and continue to import
                    # the rest of the archive.
                    self.stderr.write(
                        "Message {} failed to import, skipping".format(
                            unquote(message["Message-ID"])))
                    # write() needs text, not the exception object itself
                    self.stderr.write(str(e))
                    continue
                email = Email.objects.get(
                    mailinglist__name=self.list_address,
                    message_id=get_message_id(message))
                # Store the list of impacted threads to be able to compute
                # the thread_order and thread_depth values
                self.impacted_thread_ids.add(email.thread_id)
                progress_marker.count_imported += 1
            progress_marker.finish()
        finally:
            # Always release the mailbox file, even when the import aborts
            mbox.close()
Example #10
0
 def test_get_message_id(self):
     """A 300-character message-id comes back cut to 254 characters."""
     oversized_id = "x" * 300
     msg = Message()
     msg["Message-Id"] = "<%s>" % oversized_id
     extracted_id = utils.get_message_id(msg)
     self.assertEqual(extracted_id, "x" * 254)