def do_split(argv):
    path_name = argv[0]
    if not os.path.isfile(path_name):
        print("Error: file " + path_name + " not found.")
        return
    file_name = os.path.basename(path_name)[:-5]
    dir_name = file_name + "-split"
    if os.path.isdir(dir_name):
        shutil.rmtree(dir_name)
    os.makedirs(dir_name)
    print("Loading input mbox...", end="",flush=True)
    input_mbox = mailbox.mbox(path_name)
    num_emails = len(input_mbox)
    print("done.")
    num_parts = int(argv[1])
    if len(argv) == 3:
        num_emails_per_part = int(argv[2])
    else:
        num_emails_per_part = int(math.ceil(num_emails / num_parts))
    num_emails_scanned = 0
    for i in range(num_parts):
        start = num_emails_scanned
        end = start + num_emails_per_part - 1
        if end > num_emails - 1:
            end = num_emails - 1
        new_mbox_path = dir_name + "/" + file_name + str(start) + "-" + str(end) + ".mbox"
        print("Creating " + new_mbox_path + " for emails " + str(start) + "-" + str(end) + "...",end="", flush=True)
        new_mbox = mailbox.mbox(new_mbox_path, create=True)
        for j in range(start, end + 1):
            new_mbox.add(input_mbox[j])
            num_emails_scanned += 1
        new_mbox.flush()
        print("done.")
Example #2
def splitbox(boxfile, fmt, filtermsg=None, copy=True, dry_run=False):
    box = mailbox.mbox(boxfile)

    for k, m in box.iteritems():
        if filtermsg is None or not filtermsg(m):
            continue
        h = dict(m.items())
        t = email.utils.parsedate_tz(m.get('Date'))
        h['Date'] = datetime.utcfromtimestamp(email.utils.mktime_tz(t))
        f = fmt.format(**h)
        logger.info("Saving message %s in mailbox %s", k, f)
        if not dry_run:
            outbox = mailbox.mbox(f, create=True)
            outbox.lock()
            outbox.add(m)
            outbox.unlock()
            outbox.close()

        if not copy:
            logger.info("Removing message %s", k)
            if not dry_run:
                box.lock()
                box.discard(k)
                box.unlock()

    box.close()
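
A hedged usage sketch for splitbox (the input path and the filter predicate are illustrative assumptions). Because the Date header is replaced with a datetime before formatting, strftime-style fields work in the fmt pattern; also note that a filtermsg callable must be supplied or every message is skipped.

# Illustrative only: copy list mail into one mbox per month, previewing with
# dry_run=True before doing it for real.
splitbox(
    "inbox.mbox",                                      # assumed input file
    "archive-{Date:%Y-%m}.mbox",                       # Date is a datetime here
    filtermsg=lambda m: m.get("List-Id") is not None,  # keep only list mail
    copy=True,
    dry_run=True,
)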
def do_list(argv):
    path_name = argv[0]
    if not os.path.isfile(path_name):
        print("Error: file " + path_name + " not found.")
        return
    file_name = os.path.basename(path_name)[:-5]
    dir_name = file_name + "-split"
    if os.path.isdir(dir_name):
        shutil.rmtree(dir_name)
    os.makedirs(dir_name)
    print("Loading input mbox...", end="",flush=True)
    input_mbox = mailbox.mbox(path_name)
    num_emails = len(input_mbox)
    print("done.")
    groups = [int(a) for a in argv[1:]]
    num_emails_scanned = 0
    for i in range(len(groups)):
        start = num_emails_scanned
        end = start + groups[i] - 1
        last_group = False
        if end >= num_emails - 1:
            end = num_emails - 1
            last_group = True
        new_mbox_path = dir_name + "/" + file_name + str(start) + "-" + str(end) + ".mbox"
        print("Creating " + new_mbox_path + " for emails " + str(start) + "-" + str(end) + "...",end="", flush=True)
        new_mbox = mailbox.mbox(new_mbox_path, create=True)
        for j in range(start, end + 1):
            new_mbox.add(input_mbox[j])
        new_mbox.flush()
        print("done.")
        if last_group:
            break
        num_emails_scanned += groups[i]
Example #4
 def test_since_override(self):
     # When there's mail already and the "since" option is passed, it
     # overrides the default of using the last email's date
     msg1 = Message()
     msg1["From"] = "*****@*****.**"
     msg1["Message-ID"] = "<msg1>"
     msg1["Date"] = "2015-01-01 12:00:00"
     msg1.set_payload("msg1")
     add_to_list("*****@*****.**", msg1)
     mailbox.mbox(os.path.join(self.tmpdir, "test.mbox"))
     # do the import
     output = StringIO()
     with patch("hyperkitty.management.commands.hyperkitty_import.DbImporter"
         ) as DbImporterMock:
         instance = Mock()
         instance.impacted_thread_ids = []
         DbImporterMock.side_effect = lambda *a, **kw: instance
         self.command.execute(os.path.join(self.tmpdir, "test.mbox"),
             verbosity=2, stdout=output, stderr=output,
             list_address="*****@*****.**",
             since="2010-01-01 00:00:00 UTC",
             no_download=True, no_sync_mailman=True,
         )
     self.assertEqual(DbImporterMock.call_args[0][1]["since"],
                      datetime(2010, 1, 1, tzinfo=utc))
Example #5
def separate_old_from_new_chats(chats_all_mbox_file, chats_old_mbox_file, chats_new_mbox_file):
    # Somewhere around 2013-05-01 Google changed its chat format. Old chat is
    # custom XMPP-like XML, new chat is mail message-based text/html.
    print('Separating old-style from new-style chats... ', file=sys.stdout)
    sys.stdout.flush()
    chats_all_mbox = mailbox.mbox(chats_all_mbox_file)
    chats_old_mbox = mailbox.mbox(chats_old_mbox_file)
    chats_new_mbox = mailbox.mbox(chats_new_mbox_file)
    num_messages = 0
    num_old_chats = 0
    num_new_chats = 0

    for message in chats_all_mbox:
        num_messages += 1
        if message.is_multipart():
            # ALL old-style chats have the message in a 2-part multipart
            # payload: the first part contains the full XML chat, the second
            # contains a useless HTML representation of the chat
            num_old_chats += 1
            chats_old_mbox.add(message)
        else:
            # ALL new-style chats have the message in a non-multipart payload:
            # the payload is just a string containing the chat content
            num_new_chats += 1
            chats_new_mbox.add(message)

    print('    Chat messages: {0}'.format(num_messages), file=sys.stdout)
    print('    Old-style: {0} chat messages stored in \'{1}\''.format(num_old_chats, os.path.basename(chats_old_mbox_file)), file=sys.stdout)
    print('    New-style: {0} chat messages stored in \'{1}\''.format(num_new_chats, os.path.basename(chats_new_mbox_file)), file=sys.stdout)
    print('DONE', file=sys.stdout)
Example #6
def parse_mbox(filename=None, fileobj=None):
    'parse a mbox file'

    if not filename and not fileobj:
        raise ValueError('one of "filename" or "fileobj" is required')

    if filename:
        mbox = mailbox.mbox(filename)
        for message in mbox:
            yield simplify_message(message)

    else:
        # create a tempfile because mbox needs a path
        with NamedTemporaryFile() as tempfile:
            for chunk in iter(lambda: fileobj.read(BUFFER_SIZE), bytes()):
                tempfile.write(chunk)

            # make sure there is something to read
            tempfile.flush()

            mbox = mailbox.mbox(tempfile.name)
            for message in mbox:
                # skip corrupted messages
                if not message.get('Message-Id'): continue

                yield simplify_message(message)
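
A short usage sketch, assuming simplify_message and BUFFER_SIZE are defined elsewhere in the same module and that an archive.mbox file exists:

# Parse by path...
for summary in parse_mbox(filename="archive.mbox"):
    print(summary)

# ...or from an already-open binary stream (spooled to a temporary file internally).
with open("archive.mbox", "rb") as fh:
    summaries = list(parse_mbox(fileobj=fh))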
 def initialize(self, mbox_file):
     self.emails = []
     self.mbox_file = '%s.dsn' % mbox_file
     self.mbox = mailbox.mbox(self.mbox_file)
     self.mbox.clear()
     self.mbox_temp_file = '%s.dsn-temp' % mbox_file
     self.mbox_temp = mailbox.mbox(self.mbox_temp_file)
     self.mbox_temp.clear()
Example #8
    def __init__(
        self,
        dbman,
        address=None,
        aliases=None,
        realname=None,
        gpg_key=None,
        signature=None,
        signature_filename=None,
        signature_as_attachment=False,
        sent_box=None,
        sent_tags=["sent"],
        draft_box=None,
        draft_tags=["draft"],
        abook=None,
    ):
        self.dbman = dbman
        self.address = address
        self.abook = abook
        self.aliases = []
        if aliases:
            self.aliases = aliases.split(";")
        self.realname = realname
        self.gpg_key = gpg_key
        self.signature = signature
        self.signature_filename = signature_filename
        self.signature_as_attachment = signature_as_attachment

        self.sent_box = None
        if sent_box:
            mburl = urlparse(sent_box)
            if mburl.scheme == "mbox":
                self.sent_box = mailbox.mbox(mburl.path)
            elif mburl.scheme == "maildir":
                self.sent_box = mailbox.Maildir(mburl.path)
            elif mburl.scheme == "mh":
                self.sent_box = mailbox.MH(mburl.path)
            elif mburl.scheme == "babyl":
                self.sent_box = mailbox.Babyl(mburl.path)
            elif mburl.scheme == "mmdf":
                self.sent_box = mailbox.MMDF(mburl.path)
        self.sent_tags = sent_tags

        self.draft_box = None
        if draft_box:
            mburl = urlparse(draft_box)
            if mburl.scheme == "mbox":
                self.draft_box = mailbox.mbox(mburl.path)
            elif mburl.scheme == "maildir":
                self.draft_box = mailbox.Maildir(mburl.path)
            elif mburl.scheme == "mh":
                self.draft_box = mailbox.MH(mburl.path)
            elif mburl.scheme == "babyl":
                self.draft_box = mailbox.Babyl(mburl.path)
            elif mburl.scheme == "mmdf":
                self.draft_box = mailbox.MMDF(mburl.path)
        self.draft_tags = draft_tags
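
A hedged construction example: the enclosing class name (written here as Account), the dbman object and the paths are assumptions, since the snippet only shows the initializer. The point is that sent_box and draft_box are URLs whose scheme selects the mailbox flavour (mbox, maildir, mh, babyl or mmdf).

# Illustrative only: a maildir:// URL for sent mail, an mbox:// URL for drafts.
account = Account(
    dbman,                                         # assumed database manager
    address="user@example.com",
    realname="Example User",
    sent_box="maildir:///home/user/mail/Sent",
    draft_box="mbox:///home/user/mail/drafts.mbox",
)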
Example #9
    def read_comment_emails(cls, mbox):
        """
        Reads the mbox for emails and adds valid comment-reply emails into the database;
        if adding fails, the message is moved to the fail box.

        For email replies to comments only,

        use the subject:
        :param mbox:
        :return:
        """
        if mbox is None:
            return
        m = mailbox.mbox(mbox)
        m.lock()
        fail = mailbox.mbox(mbox + ".failed")
        fail.lock()
        processed = mailbox.mbox(mbox + ".processed")
        processed.lock()
        try:
            for key in m.iterkeys():
                try:
                    message = m[key]
                    comment, user = cls.parse_subject(message.get("subject"))
                    if comment is None or user is None:
                        fail_key = fail.add(message)  # key within the fail box, not in m
                        fail.flush()
                        m.discard(key)
                        m.flush()
                        logger.info("Failed to add comment-reply-email {subject} key:{key} added to ".format(subject=message.get("subject"), key=fail_key) + fail._file.name)
                        continue
                    new_comment = rt.models.comments.Comment()
                    new_comment.owner = comment.owner
                    new_comment.reply_to = comment
                    new_comment.user = user
                    new_comment.short_text = message.get_payload()
                    new_comment.full_clean()
                    new_comment.save()
                    processed_key = processed.add(message)  # key within the processed box, not in m
                    logger.info("New comment {} via email key: {}".format(new_comment.id, processed_key))
                    processed.flush()
                    m.discard(key)
                    m.flush()
                except Exception as e:
                    fail_key = fail.add(message)  # key within the fail box, not in m
                    fail.flush()
                    logger.exception("Failed to add comment-reply-email {subject} key:{key} added to ".format(subject=message.get("subject"), key=fail_key) + fail._file.name)
                    m.discard(key)
                    m.flush()
        except Exception as e:
            raise e
        finally:
            for b in (m, fail, processed):
                b.flush()
                b.unlock()
                b.close()
Example #10
 def __init__ (self, mbox_file, nmbox_file):
     '''
     Constructor
     '''
     self.src_mbox           = mailbox.mbox(mbox_file)
     self.dest_mbox          = mailbox.mbox(nmbox_file, create=True)
     self.faker              = Faker()
     self.emails_name        = {}
     self.domains            = {}
     self.re_email = re.compile(r'[\w\-][\w\-\.]+@[\w\-][\w\-\.]+[a-zA-Z]{1,4}')
def clone(mbox_file_path, num_times_size, out_file_path):
    if os.path.exists(out_file_path):
        os.remove(out_file_path)
    mbox_to_clone = mailbox.mbox(mbox_file_path)
    mbox_clone = mailbox.mbox(out_file_path, create=True)
    percentage = 0
    for i in range(len(mbox_to_clone)):
        if (i % max(1, len(mbox_to_clone) // 10)) == 0:  # avoid division by zero for small mailboxes
            print(str(percentage) + "%")
            percentage += 10
        for _ in range(num_times_size):
            mbox_clone.add(mbox_to_clone[i])
    mbox_clone.flush()
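
Usage is a single call; the file names below are illustrative.

# Make a test mailbox three times the size of the original.
clone("small.mbox", 3, "small-x3.mbox")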
Example #12
def _load_messages(course, stream, mailbox=None, input_=None, output=None,
                   continue_after_invalid_message=False,
                   trust_email_infrastructure=False, respond=None,
                   dry_run=False):
    if mailbox is None:
        _LOG.debug('loading message from {}'.format(stream))
        mbox = None
        messages = [(None,_message_from_file(stream))]
        if output is not None:
            ombox = _mailbox.Maildir(output, factory=None, create=True)
    elif mailbox == 'mbox':
        mbox = _mailbox.mbox(input_, factory=None, create=False)
        messages = mbox.items()
        if output is not None:
            ombox = _mailbox.mbox(output, factory=None, create=True)
    elif mailbox == 'maildir':
        mbox = _mailbox.Maildir(input_, factory=None, create=False)
        messages = []
        for key,msg in mbox.items():
            subpath = mbox._lookup(key)
            if subpath.endswith('.gitignore'):
                _LOG.debug('skipping non-message {}'.format(subpath))
                continue
            messages.append((key, msg))
        if output is not None:
            ombox = _mailbox.Maildir(output, factory=None, create=True)
    else:
        raise ValueError(mailbox)
    messages.sort(key=_get_message_time)
    for key,msg in messages:
        try:
            ret = _parse_message(
                course=course, message=msg,
                trust_email_infrastructure=trust_email_infrastructure)
        except _InvalidMessage as error:
            error.message = msg
            _LOG.warn('invalid message {}'.format(error.message_id()))
            if not continue_after_invalid_message:
                raise
            _LOG.warn('{}'.format(error))
            if respond:
                response = _get_error_response(error)
                if response is not None:
                    respond(response)
            continue
        if output is not None and dry_run is False:
            # move message from input mailbox to output mailbox
            ombox.add(msg)
            if mbox is not None:
                del mbox[key]
        yield ret
Example #13
def copyMboxFiles(opera,tbird):
    
    if not os.path.exists(tbird+"/Migration"):
        tBox = mailbox.mbox(tbird+"/Migration")
    else:
        print("The mailbox Migration exists! \nPlease remove it before starting the migration")
        sys.exit()
    
    for root, dirs, files in os.walk(opera):
        
        for fn in files:
            mb = mailbox.mbox(root+"/"+fn)
            for message in mb:
                tBox.add(message)
                tBox.flush()
Example #14
def main(args=None):
    try:
        # Setup locale
        # Set LC_TIME to "C" so that imaplib.Time2Internaldate() 
        # uses English month name.
        locale.setlocale(locale.LC_ALL, "")
        locale.setlocale(locale.LC_TIME, "C")
        #  Encoding of the sys.stderr
        enc = locale.getlocale()[1] or "utf_8"
        sys.stderr = codecs.lookup(enc)[-1](sys.stderr, errors="ignore")

        # Parse arguments
        if args is None:
            args = sys.argv[1:]
        parser = MyOptionParser()
        options = parser.parse_args(args)
        if len(str(options.user)) == 0:
            print "User name: ",
            options.user = sys.stdin.readline().rstrip("\n")
        if len(str(options.password)) == 0:
            options.password = getpass.getpass()
        options = options.__dict__
        src = options.pop("src")
        err = options.pop("error")
        time_fields = options.pop("time_fields")

        recurse = options.pop("r")

        # Connect to the server and login
        print >>sys.stderr, \
              "Connecting to %s:%s." % (options["host"], options["port"])
        uploader = IMAPUploader(**options)
        uploader.open()

        if(not recurse):
            # Prepare source and error mbox
            src = mailbox.mbox(src, create=False)
            if err:
                err = mailbox.mbox(err)
            upload(uploader, options["box"], src, err, time_fields)
        else:
            recursive_upload(uploader, "", src, err, time_fields)

        return 0

    except optparse.OptParseError, e:
        print >>sys.stderr, e
        return 2
Example #15
    def parse(self):

        logging.info('Email file: {}'.format(self.email_file))

        out = []
        i = 0
        for msg in mailbox.mbox(self.email_file):

            logging.info('--------- Parsing message {} ---------'.format(i))

            msg_data = dict(type='email')
            msg_data['date'] = self.decode_field(msg['date'])
            msg_data['date'] = format_basic(msg_data['date'])

            for field in ['from', 'to', 'cc', 'bcc']:
                msg_data[field] = self.decode_field(msg[field])
                msg_data[field] = format_address(msg_data[field])
                logging.debug('{}: {}'.format(field, msg_data[field]))

            try:
                payload = format_text(self.get_payload_text(msg))
            except Exception as e:
                raise e

            subject = self.decode_field(msg['subject'])
            subject = format_text(subject)
            text = ' '.join([subject, payload])
            msg_data['text'] = text
            logging.debug(text)

            out.append(msg_data)
            i += 1

        return out
Example #16
def open_list_archives(url,base_arc_dir="archives"):
    """
    Returns a list of all email messages contained in the specified directory.

    The argument *url* here is taken to be the name of a subdirectory
    of the directory specified in argument *base_arc_dir*.

    This directory is expected to contain files with extensions .txt,
    .mail, or .mbox. These files are all expected to be in mbox format--
    i.e. a series of blocks of text starting with headers (colon-separated
    key-value pairs) followed by an email body.
    """
    list_name = get_list_name(url)
    arc_dir = archive_directory(base_arc_dir,list_name)
    
    file_extensions = [".txt", ".mail", ".mbox"]

    txts = [os.path.join(arc_dir,fn) for fn
            in os.listdir(arc_dir)
            if any([fn.endswith(extension) for extension in file_extensions])]

    print 'Opening %d archive files' % (len(txts))
    arch = [mailbox.mbox(txt, create=False).values() for txt in txts]

    messages = [item for sublist in arch for item in sublist]
    return messages
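
A brief usage sketch under the docstring's assumptions (an archives/<list-name>/ directory already populated with .txt, .mail or .mbox files; get_list_name and archive_directory come from the same module):

# Illustrative only: load every archived message for a hypothetical list.
msgs = open_list_archives("http://example.org/pipermail/dev-list/",
                          base_arc_dir="archives")
print("%d messages loaded" % len(msgs))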
Example #17
def test_to_message_from_message_with_spam():
    mb = mailbox.mbox("tests/spam")
    fails = 0
    total = 0

    for msg in mb:
        try:
            m = encoding.from_message(msg)
            out = encoding.to_message(m)
            assert repr(out)

            m2 = encoding.from_message(out)

            for k in m:
                if '@' in m[k]:
                    assert_equal(parseaddr(m[k]), parseaddr(m2[k]))
                else:
                    assert m[k].strip() == m2[k].strip(), "%s: %r != %r" % (k, m[k], m2[k])

                assert not m[k].startswith(u"=?")
                assert not m2[k].startswith(u"=?")
                assert m.body == m2.body, "Bodies don't match" 

                assert_equal(len(m.parts), len(m2.parts), "Not the same number of parts.")

                for i, part in enumerate(m.parts):
                    assert part.body == m2.parts[i].body, "Part %d isn't the same: %r \nvs\n. %r" % (i, part.body, m2.parts[i].body)
            total += 1
        except encoding.EncodingError, exc:
            fails += 1
Example #18
File: stats.py Project: Debian/dak
def parse_prod(logdate):
    global stats
    global users
    maildate = ''.join([x[-2:] for x in logdate.split('-')])
    mailarchive = join(utils.get_conf()['Dir::Base'], 'mail/archive',
                       'mail-%s.xz' % maildate)
    if not isfile(mailarchive):
        return
    (fd, tmpfile) = utils.temp_filename(utils.get_conf()['Dir::TempPath'])
    system('xzcat %s > %s' % (mailarchive, tmpfile))
    for message in mbox(tmpfile):
        if (message['subject']
                and message['subject'].startswith('Comments regarding')):
            try:
                member = users[' '.join(message['From'].split()[:-1])]
            except KeyError:
                continue
            ts = mktime_tz(parsedate_tz(message['date']))
            timestamp = datetime.fromtimestamp(ts).strftime("%Y%m%d%H%M%S")
            date = parse_timestamp(timestamp)
            if date not in stats:
                stats[date] = {'stats': {'NEW': 0, 'ACCEPT': 0,
                                 'REJECT': 0, 'PROD': 0}, 'members': {}}
            if member not in stats[date]['members']:
                stats[date]['members'][member] = {'ACCEPT': 0, 'REJECT': 0,
                                                     'PROD': 0}
            if member not in stats['history']['members']:
                stats['history']['members'][member] = {'ACCEPT': 0,
                                                       'REJECT': 0, 'PROD': 0}
            stats[date]['stats']['PROD'] += 1
            stats[date]['members'][member]['PROD'] += 1
            stats['history']['stats']['PROD'] += 1
            stats['history']['members'][member]['PROD'] += 1
    unlink(tmpfile)
Example #19
def convert(db_path, mbox_path, device_name):
    mbox = mailbox.mbox(mbox_path, create=True)

    conn = sqlite3.connect(db_path)
    conn.row_factory = sqlite3.Row
    curs = conn.cursor()
    curs.execute("SELECT * FROM ZNOTE")
    for row in curs.fetchall():
        curs.execute("SELECT * FROM ZNOTEBODY WHERE Z_PK=?", (row['Z_PK'],))
        content = curs.fetchone()['ZCONTENT'].encode('utf8')
        msg = MIMEText(content, 'html', 'utf8')

        msg.set_unixfrom("From %s %s" % (device_name,
                                         shift_date(row['ZMODIFICATIONDATE']).ctime()))
        msg['From'] = device_name
        subject = row['ZTITLE']
        print subject
        print row['ZSUMMARY']
        try:
            subject.encode('ascii')
        except UnicodeEncodeError:
            try:
                subject = email.header.Header(subject.encode('GB2312'), 'GB2312')
            except UnicodeEncodeError:
                subject = email.header.Header(subject.encode('utf8'), 'UTF-8')
        msg['Subject'] = subject
        msg['X-Universally-Unique-Identifier'] = row['ZGUID']
        msg.set_charset('utf-8')
        msg['X-Uniform-Type-Identifier'] = 'com.apple.mail-note'
        msg['MIME-Version'] = "1.0 (Apple Message framework v1244.3)"
        msg['X-Mail-Created-Date'] = rfc_2822(shift_date(row['ZCREATIONDATE']))
        msg['Date'] = rfc_2822(shift_date(row['ZMODIFICATIONDATE']))
        msg['X-Mail-Generated-Subject'] = 'YES'

        mbox.add(msg)
Example #20
def parse_mbox(path, list_id):
    results = {models.Patch: 0, models.CoverLetter: 0, models.Comment: 0}
    duplicates = 0
    dropped = 0

    mbox = mailbox.mbox(path)
    for msg in mbox:
        try:
            obj = parsemail.parse_mail(msg, list_id)
            if obj:
                results[type(obj)] += 1
            else:
                dropped += 1
        except django.db.utils.IntegrityError:
            duplicates += 1
    print(
        "Processed %(total)d messages -->\n"
        "  %(covers)4d cover letters\n"
        "  %(patches)4d patches\n"
        "  %(comments)4d comments\n"
        "  %(duplicates)4d duplicates\n"
        "  %(dropped)4d dropped\n"
        "Total: %(new)s new entries"
        % {
            "total": len(mbox),
            "covers": results[models.CoverLetter],
            "patches": results[models.Patch],
            "comments": results[models.Comment],
            "duplicates": duplicates,
            "dropped": dropped,
            "new": len(mbox) - duplicates - dropped,
        }
    )
Example #21
def frontloader(*args):
    tid = args[0]
    tlocks[tid].acquire()
    c = LMTPClient("localhost", 10024)
    c.lhlo("host")
    mb = mailbox.mbox(MAILBOX, factory=None, create=False)
    i = 1
    while i < MESSAGES:
        for msg in mb.values():
            addr = string.split(msg.get_from())[0]
            c.send(addr, USERNAME, msg.as_string())
            if not i % RECONNECT:
                c.quit()
                c = LMTPClient("localhost", 10024)
                c.lhlo("host")
                sys.stdout.write("_")
            else:
                sys.stdout.write(".")
            sys.stdout.flush()
            i = i + 1
            if i >= MESSAGES:
                break

    c.quit()
    tlocks[tid].release()
Example #22
def send_mbox(mbox_filename, args):
    mbox = None  # ensure the name exists even if opening the mbox fails
    try:
        mbox = mailbox.mbox(mbox_filename)
        send_mailbox(mbox, args)
    finally:
        if mbox:
            mbox.close()
 def initialize(self, mbox_file):
     self.mbox_file = '%s.ooo' % mbox_file
     self.mbox = mailbox.mbox(self.mbox_file)
     self.mbox.clear()
     subject_re = [
         r'^Absen(t|ce)',
         r'^(AUTO: )?Out of (the )?office',
         r'^Auto( ?): ',
         r'^AutoRe( ?):',
         r'^Automatic reply: ',
         r'automatique d\'absence',
         r'Automated Reply',
         r'AutoReply',
         r'(est|is) absent',
         r'^En dehors du bureau',
         r'I am out of town',
         r'I am currently away',
         r'(am|is) out of (the )?office',
         r' n\'est pas joignable',
         r'Notification d\'absence',
         r'^Out of email reach',
         r'R.{1,2}ponse automatique( :)?',  # There may be encoding error of e acute
         r'^Respuesta de Estoy ausente:',
     ]
     self.subject_regexes = [re.compile(sre, re.I | re.U) for sre in subject_re]
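
A hedged sketch of how the compiled patterns might be applied later; the method name and message handling below are assumptions, not code from the original class.

 # Illustrative helper: True when a subject matches any out-of-office pattern.
 def looks_out_of_office(self, message):
     subject = message.get('Subject', '') or ''
     return any(regex.search(subject) for regex in self.subject_regexes)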
Example #24
def maildir2mailbox(maildirname, mboxfilename):
    """
    slightly adapted from maildir2mbox.py, 
    Nathan R. Yergler, 6 June 2010
    http://yergler.net/blog/2010/06/06/batteries-included-or-maildir-to-mbox-again/
    Port to Python 3 by Philippe Fremy
    """
    # open the existing maildir and the target mbox file
    maildir = mailbox.Maildir(maildirname, email.message_from_binary_file)
    mbox = mailbox.mbox(mboxfilename)

    # lock the mbox
    # mbox.lock()

    # iterate over messages in the maildir and add to the mbox
    n = len(maildir)
    for i, v in enumerate(maildir.iteritems()):
        key, msg = v
        if (i % 10) == 9:
            print( 'Progress: msg %d of %d' % (i+1,n))
        try:
            mbox.add(msg)
        except Exception:
            print( 'Exception while processing msg with key: %s' % key )
            traceback.print_exc()            

    # close and unlock
    mbox.close()
    maildir.close()
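
A minimal usage sketch; the paths are assumptions. This converts a local Maildir tree into a single mbox file.

maildir2mailbox('/home/user/Maildir', '/home/user/all-mail.mbox')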
Example #25
 def _get_otp(self):
     """Internal method to get the OTP, either interactively over the commandline or
     by checking a mailbox (mbox).
     """
     interactive = get_from_tconfig(['sms_token', 'interactive'], required=True)
     mbox_filepath = get_from_tconfig(['sms_token', 'mbox_filepath'],
                                      default="/var/mail/jenkins")
     otp = None
     if interactive.lower() == 'true':
         otp = raw_input("OTP (check your e-mail): ")
     else:
         time.sleep(10) # Wait for sms to arrive
         mybox = mailbox.mbox(mbox_filepath)
         mybox.lock()
         try:
             print "Mailbox length: " + str(len(mybox))
             def get_mail_delivery_date(key_mail_pair):
                 mail = key_mail_pair[1]
                 date_tuple = parsedate(mail['Delivery-date'])
                 return time.mktime(date_tuple)
             newest_mail_key, newest_mail = max(mybox.iteritems(), key=get_mail_delivery_date)
             self.assertTrue(newest_mail is not None, "No sms in mbox")
             payload = newest_mail.get_payload()
             matches = re.search(r"\d{6}", payload)
             self.assertTrue(matches is not None, "No OTP in sms message %r" % newest_mail)
             otp = matches.group(0)
             mybox.remove(newest_mail_key)
         except Exception as exc:
             raise exc
         finally:
             mybox.close()
             mybox.unlock()
     return otp
Example #26
def generate_mbox(messages, full_tags):
    mbox_dir = config.get_mbox_path()
    mid = escape_message_id(messages[0][0].get_message_id())

    tmp_mbox_path = "%s/tmp-%s" % (mbox_dir, mid)
    mbox_path = "%s/mbox-%s" % (mbox_dir, mid)

    mbox = mailbox.mbox(tmp_mbox_path, create=True)
    for message, tags in messages:
        new_payload = add_tags(message, merge_tags(full_tags, tags))

        msg = message.get_message_parts()[0]

        # Drop content transfer encoding so msg.set_payload() will re-encode
        if "content-transfer-encoding" in msg:
            del msg["content-transfer-encoding"]

        # Change charset to UTF-8 to guarantee that we can represent all
        # characters.  Think about the case where the patch email was ASCII and
        # a reviewer with a non-ASCII name replied with a Reviewed-by tag, now
        # the patch can no longer be represented by ASCII.
        msg.set_payload(new_payload.encode("utf-8"), "utf-8")
        mbox.add(msg)
    mbox.flush()
    mbox.close()

    os.rename(tmp_mbox_path, mbox_path)

    return config.get_mbox_prefix() + ("mbox-%s" % mid)
Example #27
def process(file, config, rcontext, columns=None):
    fullpath = file.fullpath
    try:
        with open(fullpath, "rb") as file:
            # Read the beginning of the file to check if it looks like
            # an mbox file. If not, stop.
            if not file.read(5).startswith("From "):
                return None

        messages = []
        mbox = mailbox.mbox(fullpath, create=False)
        for mboxmessage in mbox:
            message = {}
            message['flags'] = mboxmessage.get_flags()
            message['content'] = mboxmessage.get_payload()
            # Stores each line such as To:, Subject:, etc. (if present)
            for key, value in mboxmessage.items():
                message[key] = value
            messages.append(message)

        # Print some data that is stored in
        # the database if debug is true
        if config.DEBUG:
            print "\nMbox file data:"
            print "%-18s %s" % (columns[0], messages)
            print

        return [messages]
    except:
        traceback.print_exc(file=sys.stderr)

        return None
Example #28
def arch_month_mbox(request, list_name, year, month_name):
    store = get_store(request)
    mlist = get_list_by_name(list_name, store, request)
    if mlist is None:
        raise Http404("No archived mailing-list by that name.")
    month = month_name_to_num(month_name)
    year = int(year)
    begin_date = datetime.datetime(year, month, 1)
    if month != 12:
        end_month = month + 1
    else:
        end_month = 1
    end_date = datetime.datetime(year, end_month, 1)
    messages = store.get_messages(mlist.name, start=begin_date, end=end_date)
    messages.reverse() # they are sorted recent first by default
    mboxfile, mboxfilepath = tempfile.mkstemp(prefix="hyperkitty-",
                                              suffix=".mbox.gz")
    os.close(mboxfile)
    mbox = mailbox.mbox(mboxfilepath)
    for message in messages:
        mbox.add(message.full)
    mbox.close()
    content = StringIO()
    with gzip.GzipFile(fileobj=content, mode="wb") as zipped_content:
        with open(mboxfilepath, "rb") as mboxfile:
            zipped_content.write(mboxfile.read())
    response = HttpResponse(content.getvalue())
    content.close()
    response['Content-Type'] = "application/mbox+gz"
    response['Content-Disposition'] = 'attachment; filename=%d-%s.txt.gz' \
            % (year, month_name)
    response['Content-Length'] = len(response.content)
    os.remove(mboxfilepath)
    return response
Example #29
def check_mail(label, nomail, ignore, colors):
    """ Check mail in mailbox "label" and return report and color """
    name = label
    if not os.path.isabs(name):
        name = os.path.join(os.environ['HOME'], name)

    if not os.path.exists(name) or os.path.getsize(name) == 0:
        return nomail, colors[0]

    if os.path.isfile(name):
        import mailbox

        try:
            mbox = mailbox.mbox(name, create = False)
            messages = 0
            for msg in mbox:
                if msg.get_flags() == '':
                    messages += 1

            if messages > 0:
                return '{0}:{1}'.format(os.path.basename(name),
                        messages), colors[1]
            else:
                return nomail, colors[0]

        except IOError, exception:
            return '{0}: {1}'.format(name, exception.strerror), colors[2]

        except:
 def initialize(self, mbox_file):
     self.seen = 0
     self.bad_problems = 0
     self.emails = []
     self.mbox_file = '%s.bounced' % mbox_file
     self.mbox = mailbox.mbox(self.mbox_file)
     self.mbox.clear()
    def handle(self, *args, **options):
        results = {
            models.Patch: 0,
            models.CoverLetter: 0,
            models.Comment: 0,
        }
        duplicates = 0
        dropped = 0
        errors = 0

        # TODO(stephenfin): Support passing via stdin
        path = args and args[0] or options['infile']
        if not os.path.exists(path):
            self.stdout.write('Invalid path: %s' % path)
            sys.exit(1)

        # assume if <infile> is a directory, then we're passing a maildir
        if os.path.isfile(path):
            mbox = mailbox.mbox(path, create=False)
        else:
            mbox = mailbox.Maildir(path, create=False)

        count = len(mbox)

        # Iterate through the mbox. This will pick up exceptions that are only
        # thrown when a broken email is found part way through. Without this
        # block, we'd get the exception thrown in enumerate(mbox) below, which
        # is harder to catch. This is due to a bug in the Python 'email'
        # library, as described here:
        #
        #   https://lists.ozlabs.org/pipermail/patchwork/2017-July/004486.html
        #
        # The alternative is converting the mbox to a list of messages, but
        # that requires holding the entire thing in memory, which is wasteful.
        try:
            for m in mbox:
                pass
        except AttributeError:
            logger.warning('Broken mbox/Maildir, aborting')
            return

        logger.info('Parsing %d mails', count)
        for i, msg in enumerate(mbox):
            try:
                obj = parse_mail(msg, options['list_id'])
                if obj:
                    results[type(obj)] += 1
                else:
                    dropped += 1
            except django.db.utils.IntegrityError:
                duplicates += 1
            except ValueError:
                # TODO(stephenfin): Perhaps we should store the broken patch
                # somewhere for future reference?
                errors += 1

            if (i % 10) == 0:
                self.stdout.write('%06d/%06d\r' % (i, count), ending='')
                self.stdout.flush()

        self.stdout.write(
            'Processed %(total)d messages -->\n'
            '  %(covers)4d cover letters\n'
            '  %(patches)4d patches\n'
            '  %(comments)4d comments\n'
            '  %(duplicates)4d duplicates\n'
            '  %(dropped)4d dropped\n'
            '  %(errors)4d errors\n'
            'Total: %(new)s new entries' % {
                'total': count,
                'covers': results[models.CoverLetter],
                'patches': results[models.Patch],
                'comments': results[models.Comment],
                'duplicates': duplicates,
                'dropped': dropped,
                'errors': errors,
                'new': count - duplicates - dropped - errors,
            })
        mbox.close()
Example #32
def generate_kmeans_clustering(mbox_filename,
                               output_filename,
                               author_uid_filename,
                               json_filename,
                               top_n=None):
    """
	From the .MBOX file, this function extracts the email content is extracted using two predefined classes
	available in the Python Standard Library: Mailbox and Message. Feature vectors are created for all the authors
	by obtaining meaningful words from the mail content, after removing the stop words, using NLTK libraries.
	The words obtained are transformed using stemming or lemmatization before adding these words to the word list of
	the corresponding authors. A matrix is created out of these word lists such that row set is the union of terms of
	all the authors and the column set contains the authors. If a term does not appear in a document, the corresponding
	matrix entry would be zero. The resulting matrix is called term-document matrix. Then tf-idf analysis is performed
	on the term-document matrix. Finally the top-10 words of each author is listed by their weight values.
	Each entry corresponds to the tf-idf normalized coefficient of the keyword for a user. If a keyword is not present
	in the top-10 keywords of a user, then the corresponding matrix entry would be zero. Also returns the feature names.

	:param mbox_filename: Contains the absolute or relative address of the MBOX file to be opened.
	:return: Term Document Matrix: The columns of the matrix are the users and the rows of the matrix are the keywords.
	"""
    english_stopwords = set(
        stopwords.words('english')
    ) | custom_stopwords.common_words | custom_stopwords.custom_words
    email_re = re.compile(r'[\w\.-]+@[\w\.-]+')
    wnl = WordNetLemmatizer()

    print("Reading messages from MBOX file...")
    mailbox_obj = mailbox.mbox(mbox_filename)
    with open(author_uid_filename, 'r') as map_file:
        author_uid_map = json.load(map_file)
        map_file.close()
    top_n = min(len(author_uid_map), top_n)
    top_authors, top_authors_index = get_top_authors(top_n, json_filename)
    keywords_list = [list() for x in range(top_n + 1)]

    i = 0  # Number of emails processed
    for message in mailbox_obj:
        temp = email_re.search(str(message['From']))
        from_addr = temp.group(0) if temp is not None else message['From']
        if top_n is not None and from_addr not in top_authors:
            continue
        if top_n is None and from_addr not in author_uid_map.keys():
            continue

        msg_body = get_message_body(message)
        if from_addr is None:
            from_addr = message['From']
        msg_tokens = [
            x.lower() for x in re.sub('\W+', ' ', msg_body).split()
            if 2 < len(x) < 30
        ]
        # Toggle comment below if numbers and underscores should also be removed.
        # msg_tokens = [x for x in re.sub('[^a-zA-Z]+', ' ', msg_body).split() if 2 < len(x) < 30]

        msg_tokens = [
            wnl.lemmatize(x) for x in msg_tokens
            if not x.isdigit() and x not in from_addr
        ]
        msg_tokens = [x for x in msg_tokens if x not in english_stopwords]

        keywords_list[top_authors_index[from_addr]].extend(msg_tokens)

        i += 1
        if not i % 10000:
            print(i, "of", len(mailbox_obj), "messages processed.")

    for num in range(len(keywords_list)):
        keywords_list[num] = " ".join(keywords_list[num])

    print("Performing tf-idf analysis on the term-document matrix...")
    vectorizer = TfidfVectorizer(analyzer='word',
                                 stop_words=english_stopwords,
                                 max_features=200000,
                                 use_idf=True,
                                 ngram_range=(1, 4))
    tfidf_matrix = vectorizer.fit_transform(keywords_list).toarray()

    # with open("author_top_index.json", 'w') as json_file:
    #     json.dump(top_authors_index, json_file)
    # print(feature_names)

    kmeans_classifier = KMeans(n_clusters=8, n_init=4)
    labels = kmeans_classifier.fit_predict(tfidf_matrix)
    clustering = dict()

    for i in range(len(labels)):
        x = None
        for k, v in author_uid_map.items():
            if v == i:
                x = k
        if clustering.get(str(labels[i]), None) is None:
            clustering[str(labels[i])] = [x]
        else:
            clustering[str(labels[i])].append(x)

    with open(output_filename, 'w') as out_file:
        json.dump(clustering, out_file)
    out_file.close()
Example #33
 def startup(self):
     self.box = mailbox.mbox(self.filename)


def caption(origin):
    """Extracts: To, From, Subject and Date from email.Message() or mailbox.Message()
    origin -- Message() object
    Returns tuple(From, To, Subject, Date)
    If message doesn't contain one/more of them, the empty strings will be returned.
    """
    Date = ""
    if origin.has_key("date"): Date = origin["date"].strip()
    From = ""
    if origin.has_key("from"): From = origin["from"].strip()
    To = ""
    if origin.has_key("to"): To = origin["to"].strip()
    Subject = ""
    if origin.has_key("subject"): Subject = origin["subject"].strip()
    return From, To, Subject, Date


f = open("emailMailbox", "rb")
open("noEmailsToMailbox.txt", "w")
mailbox = mailbox.mbox(f.name)
ommitedEmailAdresses  = ["*****@*****.**", "*****@*****.**"]
for message in mailbox:
    msg = pullout(message, f.name)
    emailCaption = caption(message)
    if (not any(emailAddress in emailCaption[0] for emailAddress in ommitedEmailAdresses)) and ("*****@*****.**" in emailCaption[1]):
        file = open("noEmailsToMailbox.txt", "a")
        file.write(msg[0])
        file.close()


f.close()
def open_mbox_file():
    my_file = Path(args.mbox_path)
    if not my_file.is_file():
        print("path '%s' is not a file" % args.mbox_path)
        exit(0)
    return mailbox.mbox(args.mbox_path)
Example #36
# -*- coding: utf-8 -*-

import mailbox
from bs4 import BeautifulSoup
from dateutil import parser

mbox = mailbox.mbox('Indeed.mbox')

for message in mbox:
    if message.is_multipart():
        content = ''.join(
            part.get_payload(decode=True) for part in message.get_payload())
    else:
        content = message.get_payload(decode=True)

    dt = parser.parse(message['Date']).strftime("%Y-%m-%d")
    soup = BeautifulSoup(content, "lxml")
    for block in soup.select(".job_company_location_wrapper .nolink"):
        print dt + ";" + block.text.rstrip().lstrip().encode('utf-8')
    for block in soup.select('span.sg-paragraph-large.db'):
        print dt + ";" + block.text.replace(
            '-', '').rstrip().lstrip().encode('utf-8')
    for block in soup.select(".job-company-location"):
        print dt + ";" + block.text.replace(
            '-', '').rstrip().lstrip().encode('utf-8')
Example #37
 def clasificar_correo(self, fichero):
     correos = mailbox.mbox(fichero)
     return [self.clasificar_mensaje(mensaje) for mensaje in correos]
Example #38
def process_mbox_files(group_name, service):
  """Iterates over the mbox files found in the user's subdir and imports them.

  Args:
    group_name: The email address of the group to import into.
    service: A Gmail API service object.
  Returns:
    A tuple of: Number of labels imported without error,
                Number of labels imported with some errors,
                Number of labels that failed completely,
                Number of messages imported without error,
                Number of messages that failed.
  """
  number_of_labels_imported_without_error = 0
  number_of_labels_imported_with_some_errors = 0
  number_of_labels_failed = 0
  number_of_messages_imported_without_error = 0
  number_of_messages_failed = 0
  base_path = os.path.join(args.dir, group_name)
  for root, dirs, files in os.walk(base_path):
    for dir in dirs:
      try:
        labelname = os.path.join(root[len(base_path) + 1:], dir)
      except Exception:
        logging.error("Labels under '%s' may not nest correctly", dir)
    for file in files:
      filename = root[len(base_path) + 1:]
      if filename:
        filename += '/'
      filename += file
      labelname, ext = os.path.splitext(filename)
      full_filename = os.path.join(root, file)
      if labelname.endswith('.mbox/mbox'):
          logging.error("It's seem to be Apple Mail export. It's not handled by the script")
        # Assume this is an Apple Mail export, so there's an mbox file inside a
        # dir that ends with .mbox.
        # labelname = labelname[:-10]
        # logging.info("File '%s' looks like an Apple Mail export, importing it "
        #              "into label '%s'",
        #              full_filename,
        #              labelname)
      elif ext != '.mbox':
        logging.info("Skipping '%s' because it doesn't have a .mbox extension",
                     full_filename)
        continue
      if os.path.isdir(full_filename):
        # This "shouldn't happen" but it does, sometimes.
        # Assume this is an Apple Mail export, so there's an mbox file inside the dir.
        full_filename = os.path.join(full_filename, 'mbox')
        logging.info("Using '%s' instead of the directory", full_filename)
      logging.info("Starting processing of '%s'", full_filename)
      number_of_successes_in_label = 0
      number_of_failures_in_label = 0
      mbox = mailbox.mbox(full_filename)

      logging.info("Using label name '%s'", labelname)
      total = len(mbox)
      for index, message in enumerate(mbox):
        if index < args.from_message:
          continue
        logging.info("Processing message %d/%d in label '%s'", index, total, labelname)
        try:
          # Use media upload to allow messages more than 5mb.
          # See https://developers.google.com/api-client-library/python/guide/media_upload
          # and http://google-api-python-client.googlecode.com/hg/docs/epy/apiclient.http.MediaIoBaseUpload-class.html.
          if sys.version_info.major == 2:
            message_data = io.BytesIO(message.as_string())
          else:
            message_data = io.StringIO(message.as_string())
          media = MediaIoBaseUpload(message_data, mimetype='message/rfc822')
          service.archive().insert(
              groupId=group_name,
              media_body=media).execute()
          number_of_successes_in_label += 1

        except Exception:
          number_of_failures_in_label += 1
          logging.exception('Failed to import mbox message')
      logging.info("Finished processing '%s'. %d messages imported "
                   "successfully, %d messages failed.",
                   full_filename,
                   number_of_successes_in_label,
                   number_of_failures_in_label)
      if number_of_failures_in_label == 0:
        number_of_labels_imported_without_error += 1
      elif number_of_successes_in_label > 0:
        number_of_labels_imported_with_some_errors += 1
      else:
        number_of_labels_failed += 1
      number_of_messages_imported_without_error += number_of_successes_in_label
      number_of_messages_failed += number_of_failures_in_label
  return (number_of_labels_imported_without_error,     # 0
          number_of_labels_imported_with_some_errors,  # 1
          number_of_labels_failed,                     # 2
          number_of_messages_imported_without_error,   # 3
          number_of_messages_failed)                   # 4
Example #39
    # get mbox file
    mbox_file = raw_input("name of mbox file in current directory (ex. my_file.mbox): ")

    # get name to filter
    name_filter = raw_input("name of sender that you want to filter (ex. Jarrod Parkes): ")

    # get email to filter
    email_filter = raw_input("email of sender that you want to filter (ex. [email protected]): ")

    # create CSV file
    writer = csv.writer(open(export_file_name, "wb"))

    # create header row
    writer.writerow(["subject", "from", "date", "body"])

    # add rows based on mbox file
    for message in mailbox.mbox(mbox_file):
        contents = get_message(message)
        contents = html2text.html2text(contents)
        # does message contain name or email filter?
        if name_filter != "" and name_filter in message["from"]:
            writer.writerow([message["subject"], message["from"], message["date"], contents])
        elif email_filter != "" and email_filter in message["from"]:
            writer.writerow([message["subject"], message["from"], message["date"], contents])
        else:
            continue

    # print finish message
    print "generated csv file called " + export_file_name
Example #40
#!/usr/bin/env python
"""
    This example shows how to restore a backup created by backup_mailbox.py"
"""
from ProcImap.ImapMailbox import ImapMailbox
from ProcImap.ImapMessage import ImapMessage
from ProcImap.Utils.MailboxFactory import MailboxFactory
from mailbox import mbox
import sys

# usage: restore_mailbox.py backupmbox imapmailbox

mailboxes = MailboxFactory('/home/goerz/.procimap/mailboxes.cfg')
server = mailboxes.get_server('Gmail')
mailbox = ImapMailbox((server, sys.argv[2]))
backupsource = mbox(sys.argv[1], factory=ImapMessage)

for message in backupsource:
    if message.has_key("X-ProcImap-Imapflags"):
        message.flags_from_string(message["X-ProcImap-Imapflags"])
        del message["X-ProcImap-Imapflags"]
    if message.has_key("X-ProcImap-ImapInternalDate"):
        message.internaldate_from_string(
            message["X-ProcImap-ImapInternalDate"])
        del message["X-ProcImap-ImapInternalDate"]
    mailbox.add(message)

mailbox.close()
backupsource.close()
sys.exit(0)
Example #41
def collect_from_url(url, base_arch_dir="archives"):
    url = normalize_mailing_list_url(url)
    list_name = mailman.get_list_name(url)
    logging.info("Getting W3C list archive for %s" % list_name)

    response = urllib2.urlopen(url)
    html = response.read()
    soup = BeautifulSoup(html)

    time_period_indices = list()
    rows = soup.select('tbody tr')
    for row in rows:
        link = row.select('td:nth-of-type(1) a')[0].get('href')
        logging.info("Found time period archive page: %s" % link)
        time_period_indices.append(link)

    # directory for downloaded files
    arc_dir = mailman.archive_directory(base_arch_dir, list_name)

    for link in time_period_indices:
        link_url = urlparse.urljoin(url, link)
        response = urllib2.urlopen(link_url)
        html = response.read()
        soup = BeautifulSoup(html)

        end_date_string = soup.select(
            '#end')[0].parent.parent.select('em')[0].get_text()
        end_date = dateutil.parser.parse(end_date_string)
        year_month_mbox = end_date.strftime('%Y-%m') + '.mbox'
        mbox_path = os.path.join(arc_dir, year_month_mbox)

        # looks like we've already downloaded this timeperiod
        if os.path.isfile(mbox_path):
            logging.info(
                'Looks like %s already exists, moving on.' %
                mbox_path)
            continue
        logging.info('Downloading messages to archive to %s.' % mbox_path)

        message_links = list()
        messages = list()

        anchors = soup.select('div.messages-list a')
        for anchor in anchors:
            if anchor.get('href'):
                message_url = urlparse.urljoin(link_url, anchor.get('href'))
                message_links.append(message_url)

        for message_link in message_links:
            response = urllib2.urlopen(message_link)
            html = response.read()

            message = W3cMailingListArchivesParser().parsestr(html)
            messages.append(message)
            time.sleep(1)  # wait between loading messages, for politeness

        mbox = mailbox.mbox(mbox_path)
        mbox.lock()

        try:
            for message in messages:
                mbox.add(message)
            mbox.flush()
        finally:
            mbox.unlock()

        logging.info('Saved ' + year_month_mbox)
Example #42
import mailbox
import re
import csv

mbox_file = "data/BLOCKED.mbox"
mbox = mailbox.mbox(mbox_file)


def getbody(message):  #getting plain text 'email body'
    body = None
    if message.is_multipart():
        for part in message.walk():
            if part.is_multipart():
                for subpart in part.walk():
                    if subpart.get_content_type() == 'text/plain':
                        body = subpart.get_payload(decode=True)
            elif part.get_content_type() == 'text/plain':
                body = part.get_payload(decode=True)
    elif message.get_content_type() == 'text/plain':
        body = message.get_payload(decode=True)
    return body


def main():
    total_messages = 1
    claim_list_file = "data/block_list.csv"
    with open(claim_list_file, 'w', newline='', encoding='utf-8') as new_file:
        out_file_headers = [
            'channel_name', 'video_title', 'copyrighted_content', 'claimed_by',
            'claim_note', 'claim_url', 'claim_date'
        ]
Example #43
from detector import Detector
import mailbox
import os
import re

PATH_TO_INBOX_MBOX = "~/Documents/Fall15/research/Inbox.mbox"
PATH_TO_ARCHIVE_MBOX = "~/Documents/Fall15/research/Archived.mbox"

inbox = mailbox.mbox(os.path.expanduser(PATH_TO_INBOX_MBOX))  # expand "~", which mailbox does not do
#archive = mailbox.mbox(PATH_TO_ARCHIVE_MBOX)

#Has timezone abbrev
#att2 = re.compile("\(?[A-Z][A-Z][A-Z]\)?")
#Has timezone in offset
timezone_re = re.compile("([+-][0-9][0-9][0-9][0-9])")

sender_to_email_map = {}
sender_to_date_data = {}

num_emails = 0


class Timezone:
    def __init__(self, date_string):
        # Timezone
        self.timezone = Timezone.convert_to_timezone_string(date_string)

    def same_timezone(self, other):
        if (self.timezone != other.timezone):
            return False
        return True
Example #44
import mailbox
import uuid
import email.utils
import sqlite3

import config
# config.py file

from email_reply_parser import EmailReplyParser
# https://github.com/zapier/email-reply-parser

# Load config variables from config.py
mbox = mailbox.mbox(config.DWAYNE_CONFIG['mailbox'])
NQUESTIONS = config.DWAYNE_CONFIG['NQUESTIONS']
db_name = config.DWAYNE_CONFIG['db_name']
question_1 = config.DWAYNE_CONFIG['question_1']
question_2 = config.DWAYNE_CONFIG['question_2']
question_3 = config.DWAYNE_CONFIG['question_3']
len1 = len(question_1)
len2 = len(question_2)
len3 = len(question_3)
uidstring = config.DWAYNE_CONFIG['uidstring']
lenid = len(uidstring)


def get_shortid(subject_line):
    """
    Returns the short id from the email subject
    """
    shortid = None
    # TODO check parsing method
Example #45
 messages = []
 archived = []
 servername = hostname()
 msquarantine = get_config_option('QuarantineDir')
 cutoff = parse('01-01-05')
 qdirs = ["spam", "nonspam"]
 for (dirname, dirs, files) in os.walk(mboxdir):
     #print_ '*' * 100
     print 'Processing: %(d)s' % dict(d=dirname)
     for mail in files:
         if mail.startswith('.'):
             continue
         count = 0
         filename = os.path.join(dirname, mail)
         print "Processing mailbox: %s" % mail
         for message in mailbox.mbox(filename):
             try:
                 msgdatetime = parse(message['date'], ignoretz=True)
                 msgtimestamp = msgdatetime.strftime(
                     "%Y-%m-%d %H:%M:%S")
                 msgdate = msgdatetime.strftime("%Y-%m-%d")
                 msgtime = msgdatetime.strftime("%H:%M:%S")
                 dirdate = msgdate.replace('-', '')
                 # quarantinedir = os.path.join(basepath, 'data',
                 #                         'quarantine', dirdate)
                 quarantinedir = os.path.join(msquarantine, dirdate)
                 if not os.path.exists(quarantinedir):
                     os.mkdir(quarantinedir)
                     os.mkdir(os.path.join(quarantinedir, 'spam'))
                     os.mkdir(os.path.join(quarantinedir, 'nonspam'))
                 messageid = parseaddr(message['message-id'])[1]
Example #46
def parseMbox(file):
    for message in mailbox.mbox(file):
        msg = parseMessage(message, file)
        if (msg):
            parseContacts(message, msg)
            parseHeaders(message, msg)
Example #47
 def mbox(self, location): # FIXME: inconsistent with maildir()
     mboxfile = mailbox.mbox(location)
     mboxfile.lock()
     for item in self.items():
         mboxfile.add(item.eml())
     mboxfile.unlock()
Example #48
0
 def readmbox(self, location):
     for message in mailbox.mbox(location):
         if sys.hexversion >= 0x03000000:
             _item.Item(self, eml=message.as_bytes(unixfrom=True), create=True)
         else: # pragma: no cover
             _item.Item(self, eml=message.as_string(unixfrom=True), create=True)
Example #49
0

if __name__ == '__main__':

    import networkx as nx
    try:
        import matplotlib.pyplot as plt
    except ImportError:
        pass

    if len(sys.argv)==1:
        filePath = "unix_email.mbox"
    else:
        filePath = sys.argv[1]

    mbox = mailbox.mbox(filePath, msgfactory) # parse unix mailbox

    G=nx.MultiDiGraph() # create empty graph

    # parse each message and build graph
    for msg in mbox: # msg is python email.Message.Message object
        (source_name,source_addr) = parseaddr(msg['From']) # sender
        # get all recipients
        # see http://www.python.org/doc/current/lib/module-email.Utils.html
        tos = msg.get_all('to', [])
        ccs = msg.get_all('cc', [])
        resent_tos = msg.get_all('resent-to', [])
        resent_ccs = msg.get_all('resent-cc', [])
        all_recipients = getaddresses(tos + ccs + resent_tos + resent_ccs)
        # now add the edges for this mail message
        for (target_name,target_addr) in all_recipients:
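            # Hedged continuation (assumed from the surrounding loop): record a
            # directed sender -> recipient edge for this message.
            G.add_edge(source_addr, target_addr, message=msg)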
Example #50
0
def process_mbox(filename, stats):
    msgs = mailbox.mbox(filename)
    for (j, mail) in enumerate(msgs):
        process_mail(mail, filename, stats)
Example #51
0
                yield msg

    def _read_email_text(self, msg):
        content_type = 'NA' if isinstance(msg, str) else msg.get_content_type()
        encoding = 'NA' if isinstance(msg, str) else msg.get(
            'Content-Transfer-Encoding', 'NA')
        if 'text/plain' in content_type and 'base64' not in encoding:
            msg_text = msg.get_payload()
        elif 'text/html' in content_type and 'base64' not in encoding:
            msg_text = get_html_text(msg.get_payload())
        elif content_type == 'NA':
            msg_text = get_html_text(msg)
        else:
            msg_text = None
        return (content_type, encoding, msg_text)
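
# Hedged sketch (not part of the original code): a minimal stand-in for the
# get_html_text() helper assumed by _read_email_text() above, stripping tags
# from an HTML payload with only the standard library. _TextExtractor is a
# hypothetical name introduced here.
from html.parser import HTMLParser

class _TextExtractor(HTMLParser):
    def __init__(self):
        super().__init__()
        self.chunks = []

    def handle_data(self, data):
        self.chunks.append(data)

def get_html_text(html):
    extractor = _TextExtractor()
    extractor.feed(html)
    return ' '.join(extractor.chunks)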


######################### End of library, example of use below

mbox_obj = mailbox.mbox('data/input/personal_email/Personnel.mbox')

num_entries = len(mbox_obj)

message_list = []

for idx, email_obj in enumerate(mbox_obj):
    email_data = GmailMboxMessage(email_obj)
    message_list.append(email_data.parse_email().email_date)
    print('Parsing email {0} of {1}'.format(idx + 1, num_entries))

message_list[:3]
Example #52
0
    sys.exit(0)

filename = sys.argv[1]
directory = os.path.curdir

if not os.path.exists(filename):
    print "File doesn't exist:", filename
    sys.exit(1)

if len(sys.argv) == 3:
    directory = sys.argv[2]
    if not os.path.exists(directory) or not os.path.isdir(directory):
        print "Directory doesn't exist:", directory
        sys.exit(1)

mb = mailbox.mbox(filename)
nmes = len(mb)

os.chdir(directory)

for i in range(len(mb)):
    if (VERBOSE >= 2):
        print "Analyzing message number", i

    mes = mb.get_message(i)
    em = email.message_from_string(mes.as_string())

    subject = em.get('Subject')
    if subject and subject.find('=?') != -1:
        ll = email.header.decode_header(subject)
        subject = ""
Example #53
0
    def _get_mbox(name):
        """Open an mbox file.

        :param name: Name of mbox file
        """
        return mailbox.mbox(os.path.join(TEST_SERIES_DIR, name), create=False)
Example #54
0
def main():

    ### get the arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('-b', dest='needBash', action='store_true')
    parser.add_argument('-p', dest='needTxt', action='store_true')
    parser.add_argument('-m', action="store", dest="mboxFileName")
    args = parser.parse_args()

    mbox = mailbox.mbox(args.mboxFileName)
    writeBash = args.needBash
    writeParameter = args.needTxt

    ### if write to bash, open the bash script
    if writeBash:
        myBash = open("pbookFromMail.sh", "w")
        myBash.write("#!/bin/bash\n")
        myBash.write("pbook << EOF\n")

    #### preparing the output file name depending on the input file name
    fileInName = args.mboxFileName
    withoutSuffix = fileInName.split('.mbox')[0]
    brokenJobs = open(withoutSuffix + '.txt', 'w')
    brokenJob = 0
    unfinishedJob = 0
    totalJobCount = 0

    ### looping through each mail
    for message in mbox:
        ### count the number of mails
        totalJobCount = totalJobCount + 1
        #print "from   :", message['from']
        subjectLine = message['subject']
        print "subject:", subjectLine

        ### getting different parts of the subject line
        eachWord = subjectLine.split()
        taskId = eachWord[3].split(':')[1]
        status = eachWord[4].split('(')[1]
        doneJob = eachWord[4].split('(')[1].split('/')[0]
        totalJob = eachWord[4].split('(')[1].split('/')[1]
        if message.is_multipart():
            content = ''.join(
                part.get_payload(decode=True)
                for part in message.get_payload())
        else:
            content = message.get_payload(decode=True)

        ### if a job is broken there is no point retrying it; it needs to be submitted fresh.
        if ('Final Status : broken' in content):
            brokenJobs.write("\n")
            brokenJobs.write("*****mail number  " + str(brokenJob) +
                             " ************************\n")
            contentList = content.splitlines()
            print content
            for contentLine in contentList:
                if 'In  :' in contentLine:
                    print contentLine
                    brokenJobs.write(contentLine.split('In  :')[1] + '\n')
                if (writeParameter):
                    if 'Parameters :' in contentLine:
                        brokenJobs.write('----\n')
                        print contentLine
                        brokenJobs.write(contentLine + '\n')
            time.sleep(0)
            brokenJob += 1

        ### the job is not broken
        else:

            #### if doneJob is less than totalJob, the task can still be retried.
            if (int(doneJob) != int(totalJob)):
                print "Task ID: ", taskId
                print "status: ", status
                print "number done: ", doneJob
                print "number total: ", totalJob
                print taskId, ": ", status, ": ", doneJob, ": ", totalJob
                if writeBash:
                    myBash.write("retry(" + taskId + ")\n")
                unfinishedJob += 1

    ### close the bash script
    if writeBash:
        myBash.write("EOF")
        myBash.close()

    brokenJobs.close()
    print "Number of broken jobs: ", brokenJob
    print "Number of unfinished jobs: ", unfinishedJob
    print "Total number of jobs: ", totalJobCount
Example #55
0
    return datetime.fromtimestamp(
        email.utils.mktime_tz(email.utils.parsedate_tz(raw_timestamp)))


parser = argparse.ArgumentParser(
    description=
    'Get only the most recent files from an mbox, printing a new mbox file')
parser.add_argument('mboxs',
                    metavar="f",
                    type=str,
                    nargs='+',
                    help="an mbox file to search")
ns = parser.parse_args()

tmpf = tempfile.NamedTemporaryFile()
output = mailbox.mbox(tmpf.name, create=True)
for mbox in ns.mboxs:
    messages = mailbox.mbox(mbox)
    for msg in messages:
        # If there is a date header, use it; otherwise, use the first line, which
        # always contains a timestamp....
        raw_timestamp = msg['Date']
        if not raw_timestamp:
            raw_timestamp = re.sub(r"^From \w+? ", "",
                                   str(msg).partition("\n")[0])
        try:
            if parse_email_timestamp(
                    raw_timestamp) > datetime.now() - timedelta(hours=36):
                output.add(msg)
        # if parsing a timestamp fails, log it, but just ignore that message.
        # ignoring a few messages is fine, since we're just using this for training
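        # Hedged completion (assumed, not the original code): parsedate_tz()
        # returns None for an unparseable Date header, so mktime_tz() raises
        # TypeError; the message is simply skipped, as described above.
        except (TypeError, ValueError, OverflowError):
            pass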
Example #56
0
    for k in ['To', 'Cc', 'Bcc', 'Received']:
        if not json_msg.get(k):
            continue
        json_msg[k] = json_msg[k].replace('\n', '').replace('\t', '').replace(
            '\r', '').replace(' ', '').split(',')

    try:
        for part in msg.walk():
            if part.get_content_type() == 'text/plain':
                content = part.get_payload()
                json_msg['text'] = content
    except:
        sys.stderr.write('Skipping message - error encountered\n')
    finally:
        return json_msg


def gen_json_msgs(mb):
    li = []
    for msg in mb:
        if msg is None:
            break
        li.append(jsonifyMessage(msg))
    return li


mbox = mailbox.mbox(MBOX)
with open(OUT_FILE, 'w') as f:
    json.dump(gen_json_msgs(mbox), f, indent=4)
Example #57
0
#!/usr/bin/env python

import mailbox, random, string, os
from email.mime.text import MIMEText
from email.utils import formatdate

mbox_out = 'testmbox'
mbox_tmp = '/tmp/testmbox'

mbox = mailbox.mbox(mbox_tmp)

mailfrom = '*****@*****.**'
mailto = '*****@*****.**'
subject = 'Testmsg of %s kB'

# You might want to adjust your size_distribution dictionary according to your needs.
# The following will create an mbox with 5 mails of 10kB, 80kB, 150kB and 250kB each
# in a randomized order. Imaptest will go through the mbox sequentially so the
# randomness has to be in the mbox file.
#size_distribution = {}
#size_distribution[10] = 5
#size_distribution[80] = 5
#size_distribution[150] = 5
#size_distribution[250] = 5

size_distribution = {}
size_distribution[5] = 5
size_distribution[10] = 10
size_distribution[20] = 25
size_distribution[40] = 15
size_distribution[60] = 13
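
# Hedged continuation (not in the original snippet): build one message of
# roughly size_kb kilobytes per entry, shuffle them so the sizes are
# interleaved in the file, and write everything to the temporary mbox.
msgs = []
for size_kb, count in size_distribution.items():
    for _ in range(count):
        body = ''.join(random.choice(string.ascii_letters) for _ in range(size_kb * 1024))
        msg = MIMEText(body)
        msg['From'] = mailfrom
        msg['To'] = mailto
        msg['Subject'] = subject % size_kb
        msg['Date'] = formatdate()
        msgs.append(msg)
random.shuffle(msgs)
for msg in msgs:
    mbox.add(msg)
mbox.flush()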
Example #58
0
 def __init__(self, filename):
     self.mbox = mailbox.mbox(filename)
 
Example #59
0
This creates a "Candidate" key. Before using Google Takeout to download the
data, I applied a label to each candidate's emails; the label can then be
extracted from Gmail's X-label header and used to populate the Candidate key.

I signed up as "Victro Pala" because I figured it was easy to pick out and 
unlikely to actually be included anywhere in an email.

This outputs to a .json.

"""
import re
import mailbox
from mboxEmailParseAndScrub import emailMboxParser

mBox = mailbox.mbox('obj/ConsolidatedPoliticalEmails.mbox')
newParser = emailMboxParser(mBox)
newParser.AddRegexTouple(re.compile(r'\=\w\w'), '')
newParser.AddRegexTouple(re.compile(r'\[.*\]'))
newParser.declareDefaultRegex()
NEWLINE = '=\n'
NEWLINE2 = '\n'
RECIPIENT_FIRNAME = 'Victro'
RECIPIENT_FIRNAME1 = 'victro'
RECIPIENT_LASTNAME = 'Pala'
newParser.AddReplacementTouple(RECIPIENT_FIRNAME, 'recipientFirstName')
newParser.AddReplacementTouple(RECIPIENT_FIRNAME1, 'recipientFirstName')
newParser.AddReplacementTouple(RECIPIENT_LASTNAME, 'recipientLastName')
newParser.AddReplacementTouple(NEWLINE, '')
newParser.AddReplacementTouple(NEWLINE2, ' ')
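
# Hedged sketch (not from the original project): one way the "Candidate" key
# described in the docstring could be filled in. Gmail Takeout exports applied
# labels in an "X-Gmail-Labels" header (assumed here; the docstring calls it
# the X-label header); pick the first label that is not a built-in Gmail label.
# get_candidate_label is a hypothetical helper name.
def get_candidate_label(message):
    labels = message.get('X-Gmail-Labels', '')
    for label in labels.split(','):
        label = label.strip()
        if label and label not in ('Inbox', 'Archived', 'Unread', 'Important'):
            return label
    return None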
Example #60
0
                            "\\n", " ").replace("\\t", " ").strip()
    for name in names:
        if name in text:
            text = text.replace(name, " ")
    return text


mboxchunk_filename = "chunk_"
msgnum = 0
max = 50000
out = codecs.open("mail.csv", mode="w", encoding="utf-8", errors="ignore")
skipped = 0
for dirpath, dirnames, filenames in os.walk("graymail"):
    for file in filenames:
        if file.startswith(mboxchunk_filename):
            mbox = mailbox.mbox(os.path.join(dirpath, file))

            for message in mbox:
                body = getbody(message)
                skip = False
                if body is None:
                    print("no body!")
                    body = ""
                else:
                    try:
                        body = body.decode("utf-8")
                    except:
                        skip = True
                        pass
                if not skip:
                    emailout = codecs.open(os.path.join(