def do_split(argv):
    """Split an mbox file into consecutive parts of equal size.

    The parts are written into a freshly (re)created ``<name>-split``
    directory below the current working directory.  Each part is named
    ``<name><first>-<last>.mbox`` after the zero-based indices of the
    messages it holds.

    :param argv: ``[path, num_parts]`` or ``[path, num_parts, per_part]``.
        ``per_part`` is honoured only when exactly three arguments are
        given; otherwise the messages are spread evenly over ``num_parts``.
    """
    path_name = argv[0]
    if not os.path.isfile(path_name):
        print("Error: file " + path_name + " not found.")
        return

    # Only strip the extension when it really is ".mbox"; blindly chopping
    # five characters mangles any other file name.
    file_name = os.path.basename(path_name)
    if file_name.lower().endswith(".mbox"):
        file_name = file_name[:-5]

    dir_name = file_name + "-split"
    if os.path.isdir(dir_name):
        shutil.rmtree(dir_name)
    os.makedirs(dir_name)

    print("Loading input mbox...", end="", flush=True)
    input_mbox = mailbox.mbox(path_name)
    try:
        num_emails = len(input_mbox)
        print("done.")

        num_parts = int(argv[1])
        if len(argv) == 3:
            num_emails_per_part = int(argv[2])
        else:
            num_emails_per_part = int(math.ceil(num_emails / num_parts))

        start = 0
        for _ in range(num_parts):
            # Stop once every message has been written; otherwise empty
            # parts with nonsensical names such as "4-3" would be created.
            if start >= num_emails:
                break
            end = min(start + num_emails_per_part, num_emails) - 1
            span = str(start) + "-" + str(end)
            new_mbox_path = os.path.join(dir_name, file_name + span + ".mbox")
            print("Creating " + new_mbox_path + " for emails " + span + "...",
                  end="", flush=True)
            new_mbox = mailbox.mbox(new_mbox_path, create=True)
            try:
                for j in range(start, end + 1):
                    new_mbox.add(input_mbox[j])
                new_mbox.flush()
            finally:
                new_mbox.close()
            print("done.")
            start = end + 1
    finally:
        input_mbox.close()
def splitbox(boxfile, fmt, filtermsg=None, copy=True, dry_run=False):
    """Distribute the messages of an mbox over mailboxes named from headers.

    For every selected message the target path is ``fmt.format(**headers)``,
    where ``headers`` maps header names to values and ``Date`` is replaced
    by a ``datetime`` (UTC) parsed from the message's ``Date`` header.

    :param boxfile: path of the source mbox.
    :param fmt: ``str.format`` template producing the target mbox path.
    :param filtermsg: optional predicate; only messages for which it returns
        a true value are handled.  ``None`` selects every message.
    :param copy: when false, handled messages are removed from the source.
    :param dry_run: when true, only log what would be done.
    """
    box = mailbox.mbox(boxfile)
    try:
        # Work on a snapshot of the keys: discarding from the mailbox while
        # iterating its live key table raises RuntimeError on Python 3.
        for k in list(box.keys()):
            m = box[k]
            # The original test skipped *everything* when no filter was
            # given, which turned the default call into a no-op.
            if filtermsg is not None and not filtermsg(m):
                continue

            h = dict(m.items())
            t = email.utils.parsedate_tz(m.get('Date'))
            h['Date'] = datetime.utcfromtimestamp(email.utils.mktime_tz(t))
            f = fmt.format(**h)

            logger.info("Saving message %s in mailbox %s", k, f)
            if not dry_run:
                outbox = mailbox.mbox(f, create=True)
                outbox.lock()
                try:
                    outbox.add(m)
                finally:
                    # close() flushes and releases the lock as well.
                    outbox.close()

            if not copy:
                logger.info("Removing message %s", k)
                if not dry_run:
                    box.lock()
                    try:
                        box.discard(k)
                    finally:
                        box.unlock()
    finally:
        box.close()
def do_list(argv):
    """Split an mbox file into consecutive groups of caller-chosen sizes.

    The groups are written into a freshly (re)created ``<name>-split``
    directory below the current working directory.  Each group is named
    ``<name><first>-<last>.mbox`` after the zero-based indices of the
    messages it holds.  Messages beyond the sum of the group sizes are not
    written; a group that would run past the end is truncated.

    :param argv: ``[path, size_1, size_2, ...]``.
    """
    path_name = argv[0]
    if not os.path.isfile(path_name):
        print("Error: file " + path_name + " not found.")
        return

    # Only strip the extension when it really is ".mbox"; blindly chopping
    # five characters mangles any other file name.
    file_name = os.path.basename(path_name)
    if file_name.lower().endswith(".mbox"):
        file_name = file_name[:-5]

    dir_name = file_name + "-split"
    if os.path.isdir(dir_name):
        shutil.rmtree(dir_name)
    os.makedirs(dir_name)

    print("Loading input mbox...", end="", flush=True)
    input_mbox = mailbox.mbox(path_name)
    try:
        num_emails = len(input_mbox)
        print("done.")

        groups = [int(a) for a in argv[1:]]
        start = 0
        for size in groups:
            # Nothing left to write (also covers an empty input mailbox).
            if start >= num_emails:
                break
            end = min(start + size, num_emails) - 1
            span = str(start) + "-" + str(end)
            new_mbox_path = os.path.join(dir_name, file_name + span + ".mbox")
            print("Creating " + new_mbox_path + " for emails " + span + "...",
                  end="", flush=True)
            new_mbox = mailbox.mbox(new_mbox_path, create=True)
            try:
                for j in range(start, end + 1):
                    new_mbox.add(input_mbox[j])
                new_mbox.flush()
            finally:
                new_mbox.close()
            print("done.")
            start += size
    finally:
        input_mbox.close()
def test_since_override(self):
    # Even when a mail is already stored, an explicit "since" option must be
    # the value handed to the importer.
    stored = Message()
    stored["From"] = "*****@*****.**"
    stored["Message-ID"] = "<msg1>"
    stored["Date"] = "2015-01-01 12:00:00"
    stored.set_payload("msg1")
    add_to_list("*****@*****.**", stored)

    # Opening the mbox creates an empty archive for the command to read.
    mbox_path = os.path.join(self.tmpdir, "test.mbox")
    mailbox.mbox(mbox_path)

    # Run the import with the importer class replaced by a mock.
    output = StringIO()
    importer_path = "hyperkitty.management.commands.hyperkitty_import.DbImporter"
    with patch(importer_path) as DbImporterMock:
        instance = Mock()
        instance.impacted_thread_ids = []
        DbImporterMock.side_effect = lambda *a, **kw: instance
        self.command.execute(
            mbox_path,
            verbosity=2,
            stdout=output,
            stderr=output,
            list_address="*****@*****.**",
            since="2010-01-01 00:00:00 UTC",
            no_download=True,
            no_sync_mailman=True,
        )

    importer_options = DbImporterMock.call_args[0][1]
    self.assertEqual(importer_options["since"],
                     datetime(2010, 1, 1, tzinfo=utc))
def separate_old_from_new_chats(chats_all_mbox_file, chats_old_mbox_file, chats_new_mbox_file):
    """Sort chat messages into an old-style and a new-style mbox.

    Multipart messages are appended to ``chats_old_mbox_file``, all other
    messages to ``chats_new_mbox_file``.  A short summary is printed to
    standard output.

    :param chats_all_mbox_file: path of the mbox holding every chat message.
    :param chats_old_mbox_file: path of the mbox receiving multipart chats.
    :param chats_new_mbox_file: path of the mbox receiving the other chats.
    """
    # Somewhere around 2013-05-01 Google changed its chat format. Old chat is
    # custom XMPP-like XML, new chat is mail message-based text/html.
    print('Separating old-style from new-style chats... ', file=sys.stdout)
    sys.stdout.flush()

    chats_all_mbox = mailbox.mbox(chats_all_mbox_file)
    chats_old_mbox = mailbox.mbox(chats_old_mbox_file)
    chats_new_mbox = mailbox.mbox(chats_new_mbox_file)

    num_old_chats = 0
    num_new_chats = 0
    try:
        for message in chats_all_mbox:
            if message.is_multipart():
                # ALL old-style chats have the message in a 2-part multipart
                # payload: the first part contains the full XML chat, the
                # second contains a useless HTML representation of the chat
                num_old_chats += 1
                chats_old_mbox.add(message)
            else:
                # ALL new-style chats have the message in a non-multipart
                # payload: the payload is just a string containing the chat
                # content
                num_new_chats += 1
                chats_new_mbox.add(message)
    finally:
        # close() flushes pending additions; without it the written data is
        # only persisted if the interpreter happens to finalize the objects.
        for box in (chats_old_mbox, chats_new_mbox, chats_all_mbox):
            box.close()
    num_messages = num_old_chats + num_new_chats

    print(' Chat messages: {0}'.format(num_messages), file=sys.stdout)
    print(' Old-style: {0} chat messages stored in \'{1}\''.format(num_old_chats, os.path.basename(chats_old_mbox_file)), file=sys.stdout)
    print(' New-style: {0} chat messages stored in \'{1}\''.format(num_new_chats, os.path.basename(chats_new_mbox_file)), file=sys.stdout)
    print('DONE', file=sys.stdout)
def parse_mbox(filename=None, fileobj=None):
    """Yield ``simplify_message(message)`` for the messages of an mbox.

    Either a path (``filename``) or a binary stream (``fileobj``) must be
    given; ``filename`` wins when both are passed.  When reading from a
    stream, messages without a ``Message-Id`` header are skipped.

    Raises ``ValueError`` (on first iteration) when neither is supplied.
    """
    if not (filename or fileobj):
        raise ValueError('one of "filename" or "fileobj" is required')

    if filename:
        for msg in mailbox.mbox(filename):
            yield simplify_message(msg)
        return

    # mailbox.mbox only works on a path, so spool the stream to disk first
    with NamedTemporaryFile() as spool:
        while True:
            chunk = fileobj.read(BUFFER_SIZE)
            if chunk == bytes():
                break
            spool.write(chunk)
        # everything must be on disk before the mailbox reads it back
        spool.flush()
        for msg in mailbox.mbox(spool.name):
            # a message without an id is treated as corrupted
            if msg.get('Message-Id'):
                yield simplify_message(msg)
def initialize(self, mbox_file):
    """Reset the message list and open two emptied working mailboxes.

    ``<mbox_file>.dsn`` and ``<mbox_file>.dsn-temp`` are opened as mbox
    mailboxes and every message they contain is removed.
    """
    def emptied_box(path):
        # open (creating if needed) and wipe the mailbox at *path*
        box = mailbox.mbox(path)
        box.clear()
        return box

    self.emails = []

    self.mbox_file = '%s.dsn' % mbox_file
    self.mbox = emptied_box(self.mbox_file)

    self.mbox_temp_file = '%s.dsn-temp' % mbox_file
    self.mbox_temp = emptied_box(self.mbox_temp_file)
def __init__(
    self,
    dbman,
    address=None,
    aliases=None,
    realname=None,
    gpg_key=None,
    signature=None,
    signature_filename=None,
    signature_as_attachment=False,
    sent_box=None,
    sent_tags=None,
    draft_box=None,
    draft_tags=None,
    abook=None,
):
    """Store the account settings and open the sent/draft mailboxes.

    :param aliases: ``;``-separated string of alias addresses.
    :param sent_box: URL of the mailbox for sent mail; the scheme selects
        the format (``mbox``, ``maildir``, ``mh``, ``babyl`` or ``mmdf``)
        and the URL path is the mailbox location.  Any other scheme
        leaves ``self.sent_box`` as ``None``.
    :param sent_tags: tags for sent mail; defaults to a fresh ``["sent"]``.
    :param draft_box: like ``sent_box``, for drafts.
    :param draft_tags: tags for drafts; defaults to a fresh ``["draft"]``.

    The remaining arguments are stored unchanged on the instance.
    """
    mailbox_types = {
        "mbox": mailbox.mbox,
        "maildir": mailbox.Maildir,
        "mh": mailbox.MH,
        "babyl": mailbox.Babyl,
        "mmdf": mailbox.MMDF,
    }

    def open_box(url):
        # Return the mailbox addressed by *url*, or None when the URL is
        # empty or its scheme is not a supported mailbox format.
        if not url:
            return None
        mburl = urlparse(url)
        factory = mailbox_types.get(mburl.scheme)
        return factory(mburl.path) if factory is not None else None

    self.dbman = dbman
    self.address = address
    self.abook = abook
    self.aliases = aliases.split(";") if aliases else []
    self.realname = realname
    self.gpg_key = gpg_key
    self.signature = signature
    self.signature_filename = signature_filename
    self.signature_as_attachment = signature_as_attachment

    self.sent_box = open_box(sent_box)
    # A list literal in the signature would be shared by every instance
    # created without the argument; build a fresh one per instance instead.
    self.sent_tags = ["sent"] if sent_tags is None else sent_tags

    self.draft_box = open_box(draft_box)
    self.draft_tags = ["draft"] if draft_tags is None else draft_tags
def read_comment_emails(cls, mbox):
    """Turn the comment-reply emails of an mbox into comments.

    Every message of the mbox at path ``mbox`` is removed from it and
    appended either to ``<mbox>.processed`` (a comment was saved) or to
    ``<mbox>.failed`` (the subject could not be resolved by
    ``cls.parse_subject`` or saving raised).  All three mailboxes are locked
    for the duration of the call.

    :param mbox: path of the mbox to read, or ``None`` to do nothing.
    :return: ``None``
    """
    if mbox is None:
        return
    m = mailbox.mbox(mbox)
    m.lock()
    fail = mailbox.mbox(mbox + ".failed")
    fail.lock()
    processed = mailbox.mbox(mbox + ".processed")
    processed.lock()
    try:
        # Iterate over a snapshot: messages are discarded inside the loop,
        # and mutating the mailbox while iterating its keys fails on
        # Python 3.
        for key in list(m.keys()):
            message = None
            try:
                message = m[key]
                comment, user = cls.parse_subject(message.get("subject"))
                if comment is None or user is None:
                    # Keep the key of the copy separate from ``key``: the
                    # source message must be discarded by its *own* key.
                    fail_key = fail.add(message)
                    fail.flush()
                    m.discard(key)
                    m.flush()
                    logger.info("Failed to add comment-reply-email {subject} key:{key} added to ".format(subject=message.get("subject"), key=fail_key) + fail._file.name)
                    continue
                new_comment = rt.models.comments.Comment()
                new_comment.owner = comment.owner
                new_comment.reply_to = comment
                new_comment.user = user
                new_comment.short_text = message.get_payload()
                new_comment.full_clean()
                new_comment.save()
                processed_key = processed.add(message)
                logger.info("New comment {} via email key: {}".format(new_comment.id, processed_key))
                processed.flush()
                m.discard(key)
                m.flush()
            except Exception:
                if message is None:
                    # The message itself could not be read, so there is
                    # nothing to move; leave it in place.
                    logger.exception("Failed to read comment-reply-email key:{key}".format(key=key))
                    continue
                fail_key = fail.add(message)
                fail.flush()
                logger.exception("Failed to add comment-reply-email {subject} key:{key} added to ".format(subject=message.get("subject"), key=fail_key) + fail._file.name)
                m.discard(key)
                m.flush()
    finally:
        for b in (m, fail, processed):
            b.flush()
            b.unlock()
            b.close()
def __init__(self, mbox_file, nmbox_file):
    """Open the source and destination mailboxes and set up empty state.

    ``mbox_file`` is opened as the source mbox and ``nmbox_file`` as the
    destination mbox (created if missing).  A ``Faker`` instance, two empty
    lookup dictionaries and a compiled e-mail address pattern are stored on
    the instance.
    """
    email_pattern = r'[\w\-][\w\-\.]+@[\w\-][\w\-\.]+[a-zA-Z]{1,4}'

    self.src_mbox = mailbox.mbox(mbox_file)
    self.dest_mbox = mailbox.mbox(nmbox_file, create=True)
    self.faker = Faker()
    self.emails_name = dict()
    self.domains = dict()
    self.re_email = re.compile(email_pattern)
def clone(mbox_file_path, num_times_size, out_file_path):
    """Write an mbox in which every source message appears several times.

    :param mbox_file_path: path of the mbox to read.
    :param num_times_size: how many copies of each message to write.
    :param out_file_path: path of the mbox to write; an existing file is
        removed first.

    Progress is printed in steps of ten percent.
    """
    if os.path.exists(out_file_path):
        os.remove(out_file_path)

    mbox_to_clone = mailbox.mbox(mbox_file_path)
    mbox_clone = mailbox.mbox(out_file_path, create=True)
    try:
        total = len(mbox_to_clone)
        next_mark = 0
        for i, message in enumerate(mbox_to_clone):
            # Derive the percentage from the position instead of dividing
            # by ``total // 10``, which is zero for fewer than ten messages
            # and overshoots 100% when the total is not a multiple of ten.
            percent = i * 100 // total
            if percent >= next_mark:
                decile = percent - percent % 10
                print(str(decile) + "%")
                next_mark = decile + 10
            for _ in range(num_times_size):
                mbox_clone.add(message)
        mbox_clone.flush()
    finally:
        mbox_clone.close()
        mbox_to_clone.close()
def _load_messages(course, stream, mailbox=None, input_=None, output=None,
                   continue_after_invalid_message=False,
                   trust_email_infrastructure=False, respond=None,
                   dry_run=False):
    """Parse incoming messages and yield the result for each valid one.

    :param course: passed through to ``_parse_message``.
    :param stream: file to read a single message from when ``mailbox`` is
        ``None``.
    :param mailbox: ``None``, ``'mbox'`` or ``'maildir'``; selects how
        ``input_`` is read.  Any other value raises ``ValueError``.
    :param input_: path of the input mailbox.
    :param output: optional path of a mailbox that receives every
        successfully parsed message (unless ``dry_run``); the message is
        then also removed from the input mailbox.
    :param continue_after_invalid_message: when false, the first
        ``_InvalidMessage`` is re-raised; otherwise it is logged, optionally
        answered through ``respond`` and skipped.
    :param respond: optional callable receiving error responses.
    :param dry_run: when true, no message is moved.

    Messages are processed in the order given by ``_get_message_time``.
    """
    mbox = None
    ombox = None
    if mailbox is None:
        _LOG.debug('loading message from {}'.format(stream))
        messages = [(None, _message_from_file(stream))]
        if output is not None:
            ombox = _mailbox.Maildir(output, factory=None, create=True)
    elif mailbox == 'mbox':
        mbox = _mailbox.mbox(input_, factory=None, create=False)
        messages = mbox.items()
        if output is not None:
            ombox = _mailbox.mbox(output, factory=None, create=True)
    elif mailbox == 'maildir':
        mbox = _mailbox.Maildir(input_, factory=None, create=False)
        messages = []
        for key, msg in mbox.items():
            subpath = mbox._lookup(key)
            if subpath.endswith('.gitignore'):
                _LOG.debug('skipping non-message {}'.format(subpath))
                continue
            messages.append((key, msg))
        if output is not None:
            ombox = _mailbox.Maildir(output, factory=None, create=True)
    else:
        raise ValueError(mailbox)
    messages.sort(key=_get_message_time)
    try:
        for key, msg in messages:
            try:
                ret = _parse_message(
                    course=course, message=msg,
                    trust_email_infrastructure=trust_email_infrastructure)
            except _InvalidMessage as error:
                error.message = msg
                # Logger.warn is a deprecated alias of Logger.warning.
                _LOG.warning('invalid message {}'.format(error.message_id()))
                if not continue_after_invalid_message:
                    raise
                _LOG.warning('{}'.format(error))
                if respond:
                    response = _get_error_response(error)
                    if response is not None:
                        respond(response)
                continue
            if output is not None and dry_run is False:
                # move message from input mailbox to output mailbox
                ombox.add(msg)
                if mbox is not None:
                    del mbox[key]
            yield ret
    finally:
        # Single-file mailboxes only persist additions and deletions on
        # flush()/close(); without this the "move" above is never written
        # back for mbox input.  Compare with ``is not None``: an empty
        # mailbox is falsy.
        for box in (mbox, ombox):
            if box is not None:
                box.close()
def copyMboxFiles(opera, tbird):
    """Merge every file below ``opera`` into the mbox ``<tbird>/Migration``.

    Each file found while walking ``opera`` is opened as an mbox and its
    messages are appended to the target.  When the target already exists a
    notice is printed and the process exits with status 1.

    :param opera: directory tree containing the source mbox files.
    :param tbird: directory in which the ``Migration`` mbox is created.
    """
    target_path = os.path.join(tbird, "Migration")
    if os.path.exists(target_path):
        print("The mailbox Migration exists! \nPlease remove it before starting the migration")
        # Refusing to run is a failure; report it through the exit status.
        sys.exit(1)

    tBox = mailbox.mbox(target_path)
    try:
        for root, dirs, files in os.walk(opera):
            for fn in files:
                mb = mailbox.mbox(os.path.join(root, fn))
                try:
                    for message in mb:
                        tBox.add(message)
                finally:
                    mb.close()
                tBox.flush()
    finally:
        tBox.close()
def main(args=None):
    """Command-line entry point: upload mbox content through IMAPUploader.

    :param args: argument list; ``sys.argv[1:]`` is used when ``None``.
    :return: ``0`` after the upload, ``2`` when option parsing raised
        ``optparse.OptParseError``.

    An empty user name or password is asked for interactively.
    """
    try:
        # Use the user's locale, but pin LC_TIME to "C" so that
        # imaplib.Time2Internaldate() produces English month names.
        locale.setlocale(locale.LC_ALL, "")
        locale.setlocale(locale.LC_TIME, "C")

        # Re-wrap sys.stderr with a writer for the locale's encoding
        stderr_encoding = locale.getlocale()[1] or "utf_8"
        writer_factory = codecs.lookup(stderr_encoding)[-1]
        sys.stderr = writer_factory(sys.stderr, errors="ignore")

        # Command-line handling
        argv = sys.argv[1:] if args is None else args
        options = MyOptionParser().parse_args(argv)
        if len(str(options.user)) == 0:
            sys.stdout.write("User name: ")
            options.user = sys.stdin.readline().rstrip("\n")
        if len(str(options.password)) == 0:
            options.password = getpass.getpass()

        # What remains after the pops below are the uploader's arguments.
        options = options.__dict__
        src = options.pop("src")
        err = options.pop("error")
        time_fields = options.pop("time_fields")
        recurse = options.pop("r")

        # Log in to the server
        sys.stderr.write(
            "Connecting to %s:%s.\n" % (options["host"], options["port"]))
        uploader = IMAPUploader(**options)
        uploader.open()

        if recurse:
            recursive_upload(uploader, "", src, err, time_fields)
        else:
            # Open the source mbox and, if requested, the error mbox
            src = mailbox.mbox(src, create=False)
            if err:
                err = mailbox.mbox(err)
            upload(uploader, options["box"], src, err, time_fields)
        return 0
    except optparse.OptParseError as e:
        sys.stderr.write("%s\n" % e)
        return 2
def parse(self):
    """Return one dict per message of the mbox at ``self.email_file``.

    Each dict has ``type`` set to ``'email'``, a formatted ``date``, the
    formatted ``from``/``to``/``cc``/``bcc`` fields and ``text``, which is
    the formatted subject and payload text joined by a space.
    """
    logging.info('Email file: {}'.format(self.email_file))
    parsed = []
    for index, msg in enumerate(mailbox.mbox(self.email_file)):
        logging.info('--------- Parsing message {} ---------'.format(index))
        record = dict(type='email')
        record['date'] = format_basic(self.decode_field(msg['date']))

        for header in ['from', 'to', 'cc', 'bcc']:
            record[header] = format_address(self.decode_field(msg[header]))
            logging.debug('{}: {}'.format(header, record[header]))

        # the payload is extracted before the subject, as before
        body = format_text(self.get_payload_text(msg))
        title = format_text(self.decode_field(msg['subject']))
        record['text'] = ' '.join([title, body])
        logging.debug(record['text'])
        parsed.append(record)
    return parsed
def open_list_archives(url, base_arc_dir="archives"):
    """
    Return every email message found in a list's archive directory.

    *url* is turned into a list name with ``get_list_name`` and the
    directory is located with ``archive_directory(base_arc_dir, list_name)``.
    Every file in it ending in .txt, .mail or .mbox is read as an mbox file,
    and the messages of all files are returned in one flat list.
    """
    archive_suffixes = (".txt", ".mail", ".mbox")

    list_name = get_list_name(url)
    arc_dir = archive_directory(base_arc_dir, list_name)

    txts = []
    for entry in os.listdir(arc_dir):
        if entry.endswith(archive_suffixes):
            txts.append(os.path.join(arc_dir, entry))

    print('Opening %d archive files' % (len(txts)))

    messages = []
    for archive_path in txts:
        messages.extend(mailbox.mbox(archive_path, create=False).values())
    return messages
def test_to_message_from_message_with_spam():
    """Round-trip every message of ``tests/spam`` through ``encoding``.

    Each message is decoded, re-encoded and decoded again; headers, body and
    parts of both decoded forms must agree.  Messages that raise
    ``encoding.EncodingError`` are only counted.
    """
    spam_box = mailbox.mbox("tests/spam")
    fails = 0
    total = 0

    for raw in spam_box:
        try:
            m = encoding.from_message(raw)
            out = encoding.to_message(m)
            assert repr(out)
            m2 = encoding.from_message(out)

            for k in m:
                if '@' in m[k]:
                    # address headers are compared after parsing
                    assert_equal(parseaddr(m[k]), parseaddr(m2[k]))
                else:
                    assert m[k].strip() == m2[k].strip(), "%s: %r != %r" % (k, m[k], m2[k])
                    assert not m[k].startswith(u"=?")
                    assert not m2[k].startswith(u"=?")

            assert m.body == m2.body, "Bodies don't match"
            assert_equal(len(m.parts), len(m2.parts), "Not the same number of parts.")

            for index, first_part in enumerate(m.parts):
                second_part = m2.parts[index]
                assert first_part.body == second_part.body, "Part %d isn't the same: %r \nvs\n. %r" % (index, first_part.body, second_part.body)
            total += 1
        except encoding.EncodingError:
            fails += 1
def parse_prod(logdate):
    """Count 'Comments regarding' mails of one day in the global ``stats``.

    The archive ``mail-<yymmdd>.xz`` below ``Dir::Base``/mail/archive is
    decompressed into a temporary file and read as an mbox.  Each message
    whose subject starts with ``Comments regarding`` and whose sender is a
    key of the global ``users`` increments the ``PROD`` counters of that
    day and of ``stats['history']``.

    :param logdate: date string whose ``-``-separated fields are reduced to
        their last two characters to build the archive name.
    """
    global stats
    global users
    maildate = ''.join([x[-2:] for x in logdate.split('-')])
    mailarchive = join(utils.get_conf()['Dir::Base'], 'mail/archive',
                       'mail-%s.xz' % maildate)
    if not isfile(mailarchive):
        return
    # NOTE(review): ``fd`` is never closed here - confirm whether
    # utils.temp_filename() returns an open descriptor that must be closed.
    (fd, tmpfile) = utils.temp_filename(utils.get_conf()['Dir::TempPath'])
    try:
        # NOTE(review): both paths are interpolated into a shell command;
        # safe only as long as they contain no shell metacharacters.
        system('xzcat %s > %s' % (mailarchive, tmpfile))
        for message in mbox(tmpfile):
            subject = message['subject']
            if not (subject and subject.startswith('Comments regarding')):
                continue
            try:
                member = users[' '.join(message['From'].split()[:-1])]
            except KeyError:
                continue
            ts = mktime_tz(parsedate_tz(message['date']))
            timestamp = datetime.fromtimestamp(ts).strftime("%Y%m%d%H%M%S")
            date = parse_timestamp(timestamp)
            if date not in stats:
                stats[date] = {'stats': {'NEW': 0, 'ACCEPT': 0,
                                         'REJECT': 0, 'PROD': 0},
                               'members': {}}
            if member not in stats[date]['members']:
                stats[date]['members'][member] = {'ACCEPT': 0, 'REJECT': 0,
                                                  'PROD': 0}
            if member not in stats['history']['members']:
                stats['history']['members'][member] = {'ACCEPT': 0,
                                                       'REJECT': 0,
                                                       'PROD': 0}
            stats[date]['stats']['PROD'] += 1
            stats[date]['members'][member]['PROD'] += 1
            stats['history']['stats']['PROD'] += 1
            stats['history']['members'][member]['PROD'] += 1
    finally:
        # Remove the decompressed copy even when a message fails to parse.
        unlink(tmpfile)
def convert(db_path, mbox_path, device_name):
    """Export the notes of an SQLite database as messages of an mbox.

    Every row of table ``ZNOTE`` becomes one HTML message whose body is the
    ``ZCONTENT`` of the ``ZNOTEBODY`` row with the same ``Z_PK``.  Title and
    summary of each note are printed while converting.

    :param db_path: path of the SQLite database to read.
    :param mbox_path: path of the mbox to append to (created if missing).
    :param device_name: used as sender in the ``From`` header and in the
        mbox "From " line.
    """
    mbox = mailbox.mbox(mbox_path, create=True)
    try:
        conn = sqlite3.connect(db_path)
        try:
            conn.row_factory = sqlite3.Row
            curs = conn.cursor()
            curs.execute("SELECT * FROM ZNOTE")
            # fetchall() materializes the notes, so the cursor can be
            # reused for the per-note body lookup below.
            for row in curs.fetchall():
                curs.execute("SELECT * FROM ZNOTEBODY WHERE Z_PK=?",
                             (row['Z_PK'],))
                content = curs.fetchone()['ZCONTENT'].encode('utf8')
                msg = MIMEText(content, 'html', 'utf8')
                msg.set_unixfrom("From %s %s" % (device_name, shift_date(row['ZMODIFICATIONDATE']).ctime()))
                msg['From'] = device_name

                subject = row['ZTITLE']
                print(subject)
                print(row['ZSUMMARY'])
                # Plain ASCII titles are used as they are; anything else is
                # wrapped in an encoded header, preferring GB2312.
                try:
                    subject.encode('ascii')
                except UnicodeEncodeError:
                    try:
                        subject = email.header.Header(subject.encode('GB2312'), 'GB2312')
                    except UnicodeEncodeError:
                        subject = email.header.Header(subject.encode('utf8'), 'UTF-8')
                msg['Subject'] = subject

                msg['X-Universally-Unique-Identifier'] = row['ZGUID']
                msg.set_charset('utf-8')
                msg['X-Uniform-Type-Identifier'] = 'com.apple.mail-note'
                msg['MIME-Version'] = "1.0 (Apple Message framework v1244.3)"
                msg['X-Mail-Created-Date'] = rfc_2822(shift_date(row['ZCREATIONDATE']))
                msg['Date'] = rfc_2822(shift_date(row['ZMODIFICATIONDATE']))
                msg['X-Mail-Generated-Subject'] = 'YES'
                mbox.add(msg)
        finally:
            conn.close()
    finally:
        # close() flushes the appended messages to disk.
        mbox.close()
def parse_mbox(path, list_id):
    """Feed every message of an mbox to ``parsemail.parse_mail``.

    Prints a summary of how many cover letters, patches and comments were
    created, how many messages raised ``IntegrityError`` (counted as
    duplicates) and how many produced no object (counted as dropped).

    :param path: path of the mbox file.
    :param list_id: passed through to ``parsemail.parse_mail``.
    """
    results = {models.Patch: 0, models.CoverLetter: 0, models.Comment: 0}
    duplicates = 0
    dropped = 0

    mbox = mailbox.mbox(path)
    try:
        for msg in mbox:
            try:
                obj = parsemail.parse_mail(msg, list_id)
                if obj:
                    results[type(obj)] += 1
                else:
                    dropped += 1
            except django.db.utils.IntegrityError:
                duplicates += 1
        total = len(mbox)
    finally:
        # Release the file handle once the archive has been read.
        mbox.close()

    print(
        "Processed %(total)d messages -->\n"
        " %(covers)4d cover letters\n"
        " %(patches)4d patches\n"
        " %(comments)4d comments\n"
        " %(duplicates)4d duplicates\n"
        " %(dropped)4d dropped\n"
        "Total: %(new)s new entries"
        % {
            "total": total,
            "covers": results[models.CoverLetter],
            "patches": results[models.Patch],
            "comments": results[models.Comment],
            "duplicates": duplicates,
            "dropped": dropped,
            "new": total - duplicates - dropped,
        }
    )
def frontloader(*args):
    """Deliver the messages of ``MAILBOX`` over LMTP until ``MESSAGES``.

    The mailbox is replayed repeatedly; each message is sent from the
    address on its mbox "From " line to ``USERNAME``.  After every
    ``RECONNECT`` messages the connection is re-established.  ``_`` is
    written to stdout for a reconnect and ``.`` for a plain delivery.

    :param args: ``args[0]`` is the index of this worker's lock in
        ``tlocks``; the lock is held for the duration of the call.
    """
    tid = args[0]
    tlocks[tid].acquire()
    try:
        c = LMTPClient("localhost", 10024)
        c.lhlo("host")
        mb = mailbox.mbox(MAILBOX, factory=None, create=False)
        # Read the messages once; they are replayed on every pass.
        messages = mb.values()
        i = 1
        # With an empty mailbox the counter never advances, so the loop
        # would never end - bail out instead.
        while messages and i < MESSAGES:
            for msg in messages:
                addr = msg.get_from().split()[0]
                c.send(addr, USERNAME, msg.as_string())
                if not i % RECONNECT:
                    c.quit()
                    c = LMTPClient("localhost", 10024)
                    c.lhlo("host")
                    sys.stdout.write("_")
                else:
                    sys.stdout.write(".")
                sys.stdout.flush()
                i = i + 1
                if i >= MESSAGES:
                    break
        c.quit()
    finally:
        # Always release the lock, otherwise a failed delivery leaves
        # whoever waits on it blocked forever.
        tlocks[tid].release()
def send_mbox(mbox_filename, args):
    """Open the mbox at ``mbox_filename`` and pass it to ``send_mailbox``.

    The mailbox is closed afterwards, whether or not sending succeeded.

    :param mbox_filename: path of the mbox file.
    :param args: passed through to ``send_mailbox``.
    """
    # Open outside the ``try``: if opening fails there is nothing to close,
    # and referencing the unbound name in ``finally`` would mask the error.
    mbox = mailbox.mbox(mbox_filename)
    try:
        send_mailbox(mbox, args)
    finally:
        # No truthiness test here: an empty mailbox is falsy but still
        # holds an open file.
        mbox.close()
def initialize(self, mbox_file):
    """Open an emptied ``<mbox_file>.ooo`` mailbox and compile the patterns.

    ``self.subject_regexes`` receives the subject patterns below, compiled
    case-insensitively and with the Unicode flag.
    """
    self.mbox_file = '%s.ooo' % mbox_file
    self.mbox = mailbox.mbox(self.mbox_file)
    self.mbox.clear()

    regex_flags = re.I | re.U
    subject_patterns = (
        r'^Absen(t|ce)',
        r'^(AUTO: )?Out of (the )?office',
        r'^Auto( ?): ',
        r'^AutoRe( ?):',
        r'^Automatic reply: ',
        r'automatique d\'absence',
        r'Automated Reply',
        r'AutoReply',
        r'(est|is) absent',
        r'^En dehors du bureau',
        r'I am out of town',
        r'I am currently away',
        r'(am|is) out of (the )?office',
        r' n\'est pas joignable',
        r'Notification d\'absence',
        r'^Out of email reach',
        # ".{1,2}" tolerates a mis-encoded e acute
        r'R.{1,2}ponse automatique( :)?',
        r'^Respuesta de Estoy ausente:',
    )

    compiled = []
    for pattern in subject_patterns:
        compiled.append(re.compile(pattern, regex_flags))
    self.subject_regexes = compiled
def maildir2mailbox(maildirname, mboxfilename):
    """
    Append every message of a Maildir to an mbox file.

    Slightly adapted from maildir2mbox.py, Nathan R. Yergler, 6 June 2010
    http://yergler.net/blog/2010/06/06/batteries-included-or-maildir-to-mbox-again/
    Port to Python 3 by Philippe Fremy

    A progress line is printed for every tenth message.  A message that
    cannot be added is reported with its key and traceback and skipped.
    """
    source = mailbox.Maildir(maildirname, email.message_from_binary_file)
    target = mailbox.mbox(mboxfilename)

    # the target is deliberately not locked
    # target.lock()

    total = len(source)
    position = 0
    for key, msg in source.iteritems():
        position += 1
        if position % 10 == 0:
            print('Progress: msg %d of %d' % (position, total))
        try:
            target.add(msg)
        except Exception:
            print('Exception while processing msg with key: %s' % key)
            traceback.print_exc()

    # closing writes the appended messages out
    target.close()
    source.close()
def _get_otp(self):
    """Internal method to obtain the OTP.

    In interactive mode (``sms_token``/``interactive`` equals ``true``,
    case-insensitively) the OTP is asked for on the command line.
    Otherwise, after a ten second wait, the first six-digit number in the
    payload of the mail with the latest ``Delivery-date`` in the configured
    mbox is returned and that mail is removed from the mailbox.
    """
    interactive = get_from_tconfig(['sms_token', 'interactive'], required=True)
    mbox_filepath = get_from_tconfig(['sms_token', 'mbox_filepath'], default="/var/mail/jenkins")

    if interactive.lower() == 'true':
        return raw_input("OTP (check your e-mail): ")

    time.sleep(10)  # give the sms time to arrive
    mybox = mailbox.mbox(mbox_filepath)
    mybox.lock()
    try:
        print("Mailbox length: " + str(len(mybox)))

        def delivery_timestamp(item):
            # item is a (key, mail) pair; order by the mail's delivery date
            return time.mktime(parsedate(item[1]['Delivery-date']))

        newest_mail_key, newest_mail = max(mybox.iteritems(), key=delivery_timestamp)
        self.assertTrue(newest_mail is not None, "No sms in mbox")

        matches = re.search(r"\d{6}", newest_mail.get_payload())
        self.assertTrue(matches is not None, "No OTP in sms message %r" % newest_mail)
        otp = matches.group(0)
        mybox.remove(newest_mail_key)
    finally:
        mybox.close()
        mybox.unlock()
    return otp
def generate_mbox(messages, full_tags):
    """Write a tagged copy of ``messages`` to an mbox and return its name.

    :param messages: sequence of ``(message, tags)`` pairs; the id of the
        first message names the mbox.
    :param full_tags: tags merged with the per-message tags.
    :return: ``config.get_mbox_prefix()`` followed by ``mbox-<id>``.

    The mbox is built under a temporary name and renamed when complete.
    """
    target_dir = config.get_mbox_path()
    mid = escape_message_id(messages[0][0].get_message_id())
    staging_path = "%s/tmp-%s" % (target_dir, mid)
    final_path = "%s/mbox-%s" % (target_dir, mid)

    staging_box = mailbox.mbox(staging_path, create=True)
    for message, tags in messages:
        merged_tags = merge_tags(full_tags, tags)
        tagged_payload = add_tags(message, merged_tags)
        part = message.get_message_parts()[0]

        # Without the old header, set_payload() picks the transfer
        # encoding again for the new payload.
        if "content-transfer-encoding" in part:
            del part["content-transfer-encoding"]

        # UTF-8 can represent every character.  An ASCII patch mail may
        # gain a Reviewed-by tag from a reviewer with a non-ASCII name and
        # then no longer fit into ASCII.
        part.set_payload(tagged_payload.encode("utf-8"), "utf-8")
        staging_box.add(part)

    staging_box.flush()
    staging_box.close()
    os.rename(staging_path, final_path)

    return config.get_mbox_prefix() + ("mbox-%s" % mid)
def process(file, config, rcontext, columns=None):
    """Extract the messages of an mbox file as a list of dictionaries.

    :param file: object whose ``fullpath`` attribute is the file to read.
    :param config: object whose ``DEBUG`` attribute enables debug output.
    :param rcontext: unused here.
    :param columns: column names; ``columns[0]`` labels the debug output.
    :return: ``[messages]`` where each message is a dict with ``flags``,
        ``content`` and one entry per header, or ``None`` when the file
        does not start with ``From `` or reading it failed.
    """
    fullpath = file.fullpath
    try:
        with open(fullpath, "rb") as handle:
            # Read the beginning of the file to check if it looks like
            # an mbox file. If not, stop.  The file is opened in binary
            # mode, so the marker has to be bytes as well.
            if not handle.read(5).startswith(b"From "):
                return None

        messages = []
        mbox = mailbox.mbox(fullpath, create=False)
        try:
            for mboxmessage in mbox:
                message = {}
                message['flags'] = mboxmessage.get_flags()
                message['content'] = mboxmessage.get_payload()
                # Stores each line such as To:, Subject:, etc. (if present)
                for key, value in mboxmessage.items():
                    message[key] = value
                messages.append(message)
        finally:
            mbox.close()

        # Print some data that is stored in
        # the database if debug is true
        if config.DEBUG:
            print("\nMbox file data:")
            print("%-18s %s" % (columns[0], messages))
            print("")
        return [messages]
    except Exception:
        # Best effort: report the problem and let the caller carry on.
        # KeyboardInterrupt/SystemExit are deliberately not swallowed.
        traceback.print_exc(file=sys.stderr)
        return None
def arch_month_mbox(request, list_name, year, month_name):
    """Return one month of a list's archive as a gzipped mbox download.

    :param request: the HTTP request; used to obtain the store.
    :param list_name: name of the mailing list.
    :param year: year of the requested month (anything ``int()`` accepts).
    :param month_name: name of the month, resolved by ``month_name_to_num``.
    :raises Http404: when no list of that name is archived.
    :return: an ``HttpResponse`` of type ``application/mbox+gz``.
    """
    store = get_store(request)
    mlist = get_list_by_name(list_name, store, request)
    if mlist is None:
        raise Http404("No archived mailing-list by that name.")

    month = month_name_to_num(month_name)
    year = int(year)
    begin_date = datetime.datetime(year, month, 1)
    # The range ends on the first day of the following month; for December
    # that is January of the *next* year, not of the same one.
    if month != 12:
        end_date = datetime.datetime(year, month + 1, 1)
    else:
        end_date = datetime.datetime(year + 1, 1, 1)

    messages = store.get_messages(mlist.name, start=begin_date, end=end_date)
    messages.reverse()  # they are sorted recent first by default

    mboxfile, mboxfilepath = tempfile.mkstemp(prefix="hyperkitty-",
                                              suffix=".mbox.gz")
    os.close(mboxfile)
    try:
        mbox = mailbox.mbox(mboxfilepath)
        for message in messages:
            mbox.add(message.full)
        mbox.close()

        content = StringIO()
        with gzip.GzipFile(fileobj=content, mode="wb") as zipped_content:
            with open(mboxfilepath, "rb") as mboxfile:
                zipped_content.write(mboxfile.read())
    finally:
        # Never leave the temporary mbox behind, even on failure.
        os.remove(mboxfilepath)

    response = HttpResponse(content.getvalue())
    content.close()
    response['Content-Type'] = "application/mbox+gz"
    response['Content-Disposition'] = 'attachment; filename=%d-%s.txt.gz' \
        % (year, month_name)
    response['Content-Length'] = len(response.content)
    return response
def check_mail(label, nomail, ignore, colors): """ Check mail in mailbox "label" and return report and color """ name = label if not os.path.isabs(name): name = os.path.join(os.environ['HOME'], name) if not os.path.exists(name) or os.path.getsize(name) == 0: return nomail, colors[0] if os.path.isfile(name): import mailbox try: mbox = mailbox.mbox(name, create = False) messages = 0 for msg in mbox: if msg.get_flags() == '': messages += 1 if messages > 0: return '{0}:{1}'.format(os.path.basename(name), messages), colors[1] else: return nomail, colors[0] except IOError, exception: return '{0}: {1}'.format(name, exception.strerror), colors[2] except:
def initialize(self, mbox_file):
    """Reset the counters and open an emptied ``<mbox_file>.bounced`` mbox."""
    bounced_path = '%s.bounced' % mbox_file

    # fresh bookkeeping
    self.seen = 0
    self.bad_problems = 0
    self.emails = []

    # open the mailbox and drop whatever it already contains
    self.mbox_file = bounced_path
    self.mbox = mailbox.mbox(bounced_path)
    self.mbox.clear()
def handle(self, *args, **options):
    """Parse every mail of an mbox file or Maildir and print a summary.

    The archive path is the first positional argument or, failing that,
    ``options['infile']``; ``options['list_id']`` is passed to
    ``parse_mail``.  Exits with status 1 when the path does not exist and
    returns early when the archive cannot be iterated.
    """
    results = {
        models.Patch: 0,
        models.CoverLetter: 0,
        models.Comment: 0,
    }
    duplicates = 0
    dropped = 0
    errors = 0

    # TODO(stephenfin): Support passing via stdin
    path = (args[0] if args else None) or options['infile']
    if not os.path.exists(path):
        self.stdout.write('Invalid path: %s' % path)
        sys.exit(1)

    # assume if <infile> is a directory, then we're passing a maildir
    if os.path.isfile(path):
        mbox = mailbox.mbox(path, create=False)
    else:
        mbox = mailbox.Maildir(path, create=False)

    try:
        count = len(mbox)

        # Iterate through the mbox. This will pick up exceptions that are
        # only thrown when a broken email is found part way through.
        # Without this block, we'd get the exception thrown in
        # enumerate(mbox) below, which is harder to catch. This is due to a
        # bug in the Python 'email' library, as described here:
        #
        # https://lists.ozlabs.org/pipermail/patchwork/2017-July/004486.html
        #
        # The alternative is converting the mbox to a list of messages, but
        # that requires holding the entire thing in memory, which is
        # wasteful.
        try:
            for m in mbox:
                pass
        except AttributeError:
            logger.warning('Broken mbox/Maildir, aborting')
            return

        logger.info('Parsing %d mails', count)
        for i, msg in enumerate(mbox):
            try:
                obj = parse_mail(msg, options['list_id'])
                if obj:
                    results[type(obj)] += 1
                else:
                    dropped += 1
            except django.db.utils.IntegrityError:
                duplicates += 1
            except ValueError:
                # TODO(stephenfin): Perhaps we should store the broken patch
                # somewhere for future reference?
                errors += 1

            if (i % 10) == 0:
                self.stdout.write('%06d/%06d\r' % (i, count), ending='')
                self.stdout.flush()
    finally:
        # Close on every path, including the early return for a broken
        # archive and unexpected exceptions.
        mbox.close()

    self.stdout.write(
        'Processed %(total)d messages -->\n'
        ' %(covers)4d cover letters\n'
        ' %(patches)4d patches\n'
        ' %(comments)4d comments\n'
        ' %(duplicates)4d duplicates\n'
        ' %(dropped)4d dropped\n'
        ' %(errors)4d errors\n'
        'Total: %(new)s new entries' % {
            'total': count,
            'covers': results[models.CoverLetter],
            'patches': results[models.Patch],
            'comments': results[models.Comment],
            'duplicates': duplicates,
            'dropped': dropped,
            'errors': errors,
            'new': count - duplicates - dropped - errors,
        })
def generate_kmeans_clustering(mbox_filename, output_filename, author_uid_filename, json_filename, top_n=None):
    """
    Cluster authors by the vocabulary of their mails and save the clusters.

    The body of every message from one of the top authors is tokenized;
    tokens of 3 to 29 characters are lower-cased, lemmatized with NLTK's
    WordNetLemmatizer and stripped of pure numbers, fragments of the
    sender's address and stop words.  The tokens of each author form one
    document.  The documents are vectorized with tf-idf (1- to 4-grams) and
    grouped with KMeans into 8 clusters.  The result, a mapping from cluster
    label to a list of authors, is written to *output_filename* as JSON.

    :param mbox_filename: path of the MBOX file to read.
    :param output_filename: path of the JSON file to write.
    :param author_uid_filename: path of a JSON file mapping author
        addresses to ids.
    :param json_filename: passed to ``get_top_authors``.
    :param top_n: number of top authors to consider; ``None`` (or a number
        larger than the author map) means all authors in the map.
    :return: ``None``
    """
    english_stopwords = set(stopwords.words('english')) | custom_stopwords.common_words | custom_stopwords.custom_words
    email_re = re.compile(r'[\w\.-]+@[\w\.-]+')
    token_split_re = re.compile(r'\W+')
    wnl = WordNetLemmatizer()

    print("Reading messages from MBOX file...")
    mailbox_obj = mailbox.mbox(mbox_filename)
    with open(author_uid_filename, 'r') as map_file:
        author_uid_map = json.load(map_file)

    # ``min(len(...), None)`` raises on Python 3, so the documented default
    # of ``top_n=None`` has to be resolved explicitly.
    if top_n is None:
        top_n = len(author_uid_map)
    else:
        top_n = min(len(author_uid_map), top_n)
    top_authors, top_authors_index = get_top_authors(top_n, json_filename)
    keywords_list = [list() for x in range(top_n + 1)]

    try:
        total_messages = len(mailbox_obj)
        i = 0  # Number of emails processed
        for message in mailbox_obj:
            temp = email_re.search(str(message['From']))
            from_addr = temp.group(0) if temp is not None else message['From']
            # Messages without a usable sender cannot be attributed.
            if from_addr is None or from_addr not in top_authors:
                continue

            msg_body = get_message_body(message)
            msg_tokens = [x.lower() for x in token_split_re.sub(' ', msg_body).split() if 2 < len(x) < 30]
            # ``x not in from_addr`` drops fragments of the sender's address
            msg_tokens = [wnl.lemmatize(x) for x in msg_tokens if not x.isdigit() and x not in from_addr]
            msg_tokens = [x for x in msg_tokens if x not in english_stopwords]

            keywords_list[top_authors_index[from_addr]].extend(msg_tokens)

            i += 1
            if not i % 10000:
                print(i, "of", total_messages, "messages processed.")
    finally:
        mailbox_obj.close()

    for num in range(len(keywords_list)):
        keywords_list[num] = " ".join(keywords_list[num])

    print("Performing tf-idf analysis on the term-document matrix...")
    vectorizer = TfidfVectorizer(analyzer='word', stop_words=english_stopwords, max_features=200000,
                                 use_idf=True, ngram_range=(1, 4))
    tfidf_matrix = vectorizer.fit_transform(keywords_list).toarray()

    kmeans_classifier = KMeans(n_clusters=8, n_init=4)
    labels = kmeans_classifier.fit_predict(tfidf_matrix)

    # Invert the map once instead of scanning it for every label.  Like the
    # scan it replaces, a later entry wins when two authors share an id.
    # NOTE(review): rows are indexed via top_authors_index but resolved
    # through author_uid_map ids - confirm both use the same numbering.
    uid_to_author = {uid: author for author, uid in author_uid_map.items()}
    clustering = dict()
    for row, label in enumerate(labels):
        clustering.setdefault(str(label), []).append(uid_to_author.get(row))

    with open(output_filename, 'w') as out_file:
        json.dump(clustering, out_file)
def startup(self):
    """Open the mbox at ``self.filename`` and keep it as ``self.box``."""
    mbox_path = self.filename
    self.box = mailbox.mbox(mbox_path)
"""Extracts: To, From, Subject and Date from email.Message() or mailbox.Message() origin -- Message() object Returns tuple(From, To, Subject, Date) If message doesn't contain one/more of them, the empty strings will be returned. """ Date = "" if origin.has_key("date"): Date = origin["date"].strip() From = "" if origin.has_key("from"): From = origin["from"].strip() To = "" if origin.has_key("to"): To = origin["to"].strip() Subject = "" if origin.has_key("subject"): Subject = origin["subject"].strip() return From, To, Subject, Date f = open("emailMailbox", "rb") open("noEmailsToMailbox.txt", "w") mailbox = mailbox.mbox(f.name) ommitedEmailAdresses = ["*****@*****.**", "*****@*****.**"] for message in mailbox: msg = pullout(message, f.name) emailCaption = caption(message) if (not any(emailAddress in emailCaption[0] for emailAddress in ommitedEmailAdresses)) and ("*****@*****.**" in emailCaption[1]): file = open("noEmailsToMailbox.txt", "a") file.write(msg[0]) file.close f.close()
def open_mbox_file():
    """Open the mbox file named by ``args.mbox_path``.

    Returns:
        A ``mailbox.mbox`` opened on ``args.mbox_path``.

    Raises:
        SystemExit: with status 1 when the path is not an existing file.
    """
    my_file = Path(args.mbox_path)
    if not my_file.is_file():
        print("path '%s' is not a file" % args.mbox_path)
        # A missing input is a failure: exit non-zero so callers and shells
        # can detect it. SystemExit is raised directly because the ``exit``
        # helper is only injected by the ``site`` module.
        raise SystemExit(1)
    return mailbox.mbox(args.mbox_path)
# -*- coding: utf-8 -*- import mailbox from bs4 import BeautifulSoup from dateutil import parser mbox = mailbox.mbox('Indeed.mbox') for message in mbox: if message.is_multipart(): content = ''.join( part.get_payload(decode=True) for part in message.get_payload()) else: content = message.get_payload(decode=True) dt = parser.parse(message['Date']).strftime("%Y-%m-%d") soup = BeautifulSoup(content, "lxml") for block in soup.select(".job_company_location_wrapper .nolink"): print dt + ";" + block.text.rstrip().lstrip().encode('utf-8') for block in soup.select('span.sg-paragraph-large.db'): print dt + ";" + block.text.replace( '-', '').rstrip().lstrip().encode('utf-8') for block in soup.select(".job-company-location"): print dt + ";" + block.text.replace( '-', '').rstrip().lstrip().encode('utf-8')
def clasificar_correo(self, fichero):
    """Classify every message stored in the mbox file ``fichero``.

    Returns a list with the result of ``self.clasificar_mensaje`` for each
    message, in mailbox order.
    """
    resultados = []
    for mensaje in mailbox.mbox(fichero):
        resultados.append(self.clasificar_mensaje(mensaje))
    return resultados
def _import_mbox_file(full_filename, labelname, group_name, service):
    """Upload every message of one mbox file into the group archive.

    Messages whose index is below ``args.from_message`` are skipped.

    Returns:
        A tuple ``(successes, failures)`` counting the messages that were
        inserted and the messages whose upload raised.
    """
    successes = 0
    failures = 0
    mbox = mailbox.mbox(full_filename)
    try:
        logging.info("Using label name '%s'", labelname)
        total = len(mbox)
        for index, message in enumerate(mbox):
            if index < args.from_message:
                continue
            logging.info("Processing message %d/%d in label '%s'",
                         index, total, labelname)
            try:
                # Use media upload to allow messages more than 5mb.
                # See https://developers.google.com/api-client-library/python/guide/media_upload
                # and http://google-api-python-client.googlecode.com/hg/docs/epy/apiclient.http.MediaIoBaseUpload-class.html.
                if sys.version_info.major == 2:
                    message_data = io.BytesIO(message.as_string())
                else:
                    message_data = io.StringIO(message.as_string())
                media = MediaIoBaseUpload(message_data,
                                          mimetype='message/rfc822')
                service.archive().insert(
                    groupId=group_name, media_body=media).execute()
                successes += 1
            except Exception:
                # Best effort: one bad message must not abort the import.
                failures += 1
                logging.exception('Failed to import mbox message')
    finally:
        mbox.close()
    return successes, failures


def process_mbox_files(group_name, service):
    """Iterates over the mbox files found in the user's subdir and imports them.

    Args:
      group_name: The email address of the group to import into.
      service: A Gmail API service object.

    Returns:
      A tuple of:
        Number of labels imported without error,
        Number of labels imported with some errors,
        Number of labels that failed completely,
        Number of messages imported without error,
        Number of messages that failed.
    """
    number_of_labels_imported_without_error = 0
    number_of_labels_imported_with_some_errors = 0
    number_of_labels_failed = 0
    number_of_messages_imported_without_error = 0
    number_of_messages_failed = 0
    base_path = os.path.join(args.dir, group_name)
    for root, _dirs, files in os.walk(base_path):
        # Path of the current directory relative to base_path.
        relative_root = root[len(base_path) + 1:]
        for name in files:
            filename = relative_root
            if filename:
                filename += '/'
            filename += name
            labelname, ext = os.path.splitext(filename)
            full_filename = os.path.join(root, name)
            if labelname.endswith('.mbox/mbox'):
                # NOTE(review): the error says the export is not handled, yet
                # the file is still imported below (unchanged behavior).
                # Confirm whether it should be skipped instead.
                logging.error("It's seem to be Apple Mail export. It's not handled by the script")
            elif ext != '.mbox':
                logging.info("Skipping '%s' because it doesn't have a .mbox extension",
                             full_filename)
                continue
            if os.path.isdir(full_filename):
                # This "shouldn't happen" but it does, sometimes.
                # Assume this is an Apple Mail export, so there's an mbox file
                # inside the dir. Plain assignment: the previous ``+=``
                # appended the joined path to itself and produced a bogus
                # doubled path.
                full_filename = os.path.join(full_filename, 'mbox')
                logging.info("Using '%s' instead of the directory", full_filename)
            logging.info("Starting processing of '%s'", full_filename)
            number_of_successes_in_label, number_of_failures_in_label = (
                _import_mbox_file(full_filename, labelname, group_name, service))
            logging.info("Finished processing '%s'. %d messages imported "
                         "successfully, %d messages failed.",
                         full_filename,
                         number_of_successes_in_label,
                         number_of_failures_in_label)
            if number_of_failures_in_label == 0:
                number_of_labels_imported_without_error += 1
            elif number_of_successes_in_label > 0:
                number_of_labels_imported_with_some_errors += 1
            else:
                number_of_labels_failed += 1
            number_of_messages_imported_without_error += number_of_successes_in_label
            number_of_messages_failed += number_of_failures_in_label
    return (number_of_labels_imported_without_error,     # 0
            number_of_labels_imported_with_some_errors,  # 1
            number_of_labels_failed,                     # 2
            number_of_messages_imported_without_error,   # 3
            number_of_messages_failed)                   # 4
# get mbox file mbox_file = raw_input("name of mbox file in current directory (ex. my_file.mbox): ") # get name to filter name_filter = raw_input("name of sender that you want to filter (ex. Jarrod Parkes): ") # get email to filter email_filter = raw_input("email of sender that you want to filter (ex. [email protected]): ") # create CSV file writer = csv.writer(open(export_file_name, "wb")) # create header row writer.writerow(["subject", "from", "date", "body"]) # add rows based on mbox file for message in mailbox.mbox(mbox_file): contents = get_message(message) contents = html2text.html2text(contents) # does message contain name or email filter? if name_filter != "" and name_filter in message["from"]: writer.writerow([message["subject"], message["from"], message["date"], contents]) elif email_filter != "" and email_filter in message["from"]: writer.writerow([message["subject"], message["from"], message["date"], contents]) else: continue # print finish message print "generated csv file called " + export_file_name
#!/usr/bin/env python
"""
This example shows how to restore a backup created by backup_mailbox.py.

Every message of the backup mbox is added to the target IMAP mailbox. The
X-ProcImap-Imapflags and X-ProcImap-ImapInternalDate headers, when present,
are applied to the message and removed before it is added.
"""
from ProcImap.ImapMailbox import ImapMailbox
from ProcImap.ImapMessage import ImapMessage
from ProcImap.Utils.MailboxFactory import MailboxFactory
from mailbox import mbox
import sys

# usage: restore_mailbox.py backupmbox imapmailbox
mailboxes = MailboxFactory('/home/goerz/.procimap/mailboxes.cfg')
server = mailboxes.get_server('Gmail')
mailbox = ImapMailbox((server, sys.argv[2]))
backupsource = mbox(sys.argv[1], factory=ImapMessage)
try:
    for message in backupsource:
        # ``in`` is used instead of has_key(), which email messages only
        # provide on Python 2; ``in`` works on both major versions.
        if "X-ProcImap-Imapflags" in message:
            message.flags_from_string(message["X-ProcImap-Imapflags"])
            del message["X-ProcImap-Imapflags"]
        if "X-ProcImap-ImapInternalDate" in message:
            message.internaldate_from_string(
                message["X-ProcImap-ImapInternalDate"])
            del message["X-ProcImap-ImapInternalDate"]
        mailbox.add(message)
finally:
    # Release both mailboxes even when a message fails to restore.
    mailbox.close()
    backupsource.close()
sys.exit(0)
def _fetch(url):
    """Return the body of ``url``, closing the HTTP response afterwards."""
    response = urllib2.urlopen(url)
    try:
        return response.read()
    finally:
        response.close()


def collect_from_url(url, base_arch_dir="archives"):
    """Download a W3C mailing list archive into one mbox file per period.

    The list index page at ``url`` is scanned for time-period pages. For each
    period whose ``YYYY-MM.mbox`` file does not exist yet in the list's
    archive directory, every linked message is downloaded, parsed and
    appended to that mbox.

    Args:
        url: URL of the mailing list archive (normalized before use).
        base_arch_dir: Base directory passed to ``mailman.archive_directory``.
    """
    url = normalize_mailing_list_url(url)
    list_name = mailman.get_list_name(url)
    logging.info("Getting W3C list archive for %s", list_name)
    soup = BeautifulSoup(_fetch(url))
    time_period_indices = []
    for row in soup.select('tbody tr'):
        link = row.select('td:nth-of-type(1) a')[0].get('href')
        logging.info("Found time period archive page: %s", link)
        time_period_indices.append(link)
    # directory for downloaded files
    arc_dir = mailman.archive_directory(base_arch_dir, list_name)
    for link in time_period_indices:
        link_url = urlparse.urljoin(url, link)
        soup = BeautifulSoup(_fetch(link_url))
        end_date_string = soup.select(
            '#end')[0].parent.parent.select('em')[0].get_text()
        end_date = dateutil.parser.parse(end_date_string)
        year_month_mbox = end_date.strftime('%Y-%m') + '.mbox'
        mbox_path = os.path.join(arc_dir, year_month_mbox)
        # looks like we've already downloaded this timeperiod
        if os.path.isfile(mbox_path):
            logging.info('Looks like %s already exists, moving on.', mbox_path)
            continue
        logging.info('Downloading messages to archive to %s.', mbox_path)
        message_links = [
            urlparse.urljoin(link_url, anchor.get('href'))
            for anchor in soup.select('div.messages-list a')
            if anchor.get('href')
        ]
        # All messages are fetched before the mbox is created, so a failed
        # download does not leave a partial file that would later be taken
        # for an already-downloaded period.
        messages = []
        for message_link in message_links:
            html = _fetch(message_link)
            messages.append(W3cMailingListArchivesParser().parsestr(html))
            time.sleep(1)  # wait between loading messages, for politeness
        mbox = mailbox.mbox(mbox_path)
        mbox.lock()
        try:
            for message in messages:
                mbox.add(message)
            mbox.flush()
        finally:
            mbox.unlock()
            mbox.close()
        logging.info('Saved %s', year_month_mbox)
import mailbox import re import csv mbox_file = "data/BLOCKED.mbox" mbox = mailbox.mbox(mbox_file) def getbody(message): #getting plain text 'email body' body = None if message.is_multipart(): for part in message.walk(): if part.is_multipart(): for subpart in part.walk(): if subpart.get_content_type() == 'text/plain': body = subpart.get_payload(decode=True) elif part.get_content_type() == 'text/plain': body = part.get_payload(decode=True) elif message.get_content_type() == 'text/plain': body = message.get_payload(decode=True) return body def main(): total_messages = 1 claim_list_file = "data/block_list.csv" with open(claim_list_file, 'w', newline='', encoding='utf-8') as new_file: out_file_headers = [ 'channel_name', 'video_title', 'copyrighted_content', 'claimed_by', 'claim_note', 'claim_url', 'claim_date' ]
from detector import Detector import mailbox import re PATH_TO_INBOX_MBOX = "~/Documents/Fall15/research/Inbox.mbox" PATH_TO_ARCHIVE_MBOX = "~/Documents/Fall15/research/Archived.mbox" inbox = mailbox.mbox(PATH_TO_INBOX_MBOX) #archive = mailbox.mbox(PATH_TO_ARCHIVE_MBOX) #Has timezone abbrev #att2 = re.compile("\(?[A-Z][A-Z][A-Z]\)?") #Has timezone in offset timezone_re = re.compile("([+-][0-9][0-9][0-9][0-9])") sender_to_email_map = {} sender_to_date_data = {} num_emails = 0 class Timezone: def __init__(self, date_string): # Timezone self.timezone = Timezone.convert_to_timezone_string(date_string) def same_timezone(self, other): if (self.timezone != other.timezone): return False return True
import mailbox import uuid import email.utils import sqlite3 import config # config.py file from email_reply_parser import EmailReplyParser # https://github.com/zapier/email-reply-parser # Load config variables from config.py mbox = mailbox.mbox(config.DWAYNE_CONFIG['mailbox']) NQUESTIONS = config.DWAYNE_CONFIG['NQUESTIONS'] db_name = config.DWAYNE_CONFIG['db_name'] question_1 = config.DWAYNE_CONFIG['question_1'] question_2 = config.DWAYNE_CONFIG['question_2'] question_3 = config.DWAYNE_CONFIG['question_3'] len1 = len(question_1) len2 = len(question_2) len3 = len(question_3) uidstring = config.DWAYNE_CONFIG['uidstring'] lenid = len(uidstring) def get_shortid(subject_line): """ Returns the short id from the email subject """ shortid = None # TODO check parsing method
messages = [] archived = [] servername = hostname() msquarantine = get_config_option('QuarantineDir') cutoff = parse('01-01-05') qdirs = ["spam", "nonspam"] for (dirname, dirs, files) in os.walk(mboxdir): #print_ '*' * 100 print 'Processing: %(d)s' % dict(d=dirname) for mail in files: if mail.startswith('.'): continue count = 0 filename = os.path.join(dirname, mail) print "Processing mailbox: %s" % mail for message in mailbox.mbox(filename): try: msgdatetime = parse(message['date'], ignoretz=True) msgtimestamp = msgdatetime.strftime( "%Y-%m-%d %H:%M:%S") msgdate = msgdatetime.strftime("%Y-%m-%d") msgtime = msgdatetime.strftime("%H:%M:%S") dirdate = msgdate.replace('-', '') # quarantinedir = os.path.join(basepath, 'data', # 'quarantine', dirdate) quarantinedir = os.path.join(msquarantine, dirdate) if not os.path.exists(quarantinedir): os.mkdir(quarantinedir) os.mkdir(os.path.join(quarantinedir, 'spam')) os.mkdir(os.path.join(quarantinedir, 'nonspam')) messageid = parseaddr(message['message-id'])[1]
def parseMbox(file):
    """Parse every message of the mbox at ``file``.

    Each message goes through ``parseMessage``; when that returns a truthy
    value, the message's contacts and headers are parsed as well.
    """
    box = mailbox.mbox(file)
    for message in box:
        msg = parseMessage(message, file)
        if not msg:
            continue
        parseContacts(message, msg)
        parseHeaders(message, msg)
def mbox(self, location):  # FIXME: inconsistent with maildir()
    """Append every item of this object to the mbox file at ``location``.

    The file is locked while items are added. Whatever happens, the mailbox
    is closed afterwards, so a failing item cannot leave the lock behind.

    :param location: path of the mbox file to write to
    """
    mboxfile = mailbox.mbox(location)
    try:
        mboxfile.lock()
        for item in self.items():
            mboxfile.add(item.eml())
    finally:
        # close() flushes pending writes, releases the lock if it is held
        # and closes the underlying file.
        mboxfile.close()
def readmbox(self, location):
    """Create one item in this object for each message of the mbox file.

    :param location: path of the mbox file to read
    """
    on_python3 = sys.hexversion >= 0x03000000
    for message in mailbox.mbox(location):
        if on_python3:
            raw = message.as_bytes(unixfrom=True)
        else:  # pragma: no cover
            raw = message.as_string(unixfrom=True)
        _item.Item(self, eml=raw, create=True)
if __name__ == '__main__': import networkx as nx try: import matplotlib.pyplot as plt except: pass if len(sys.argv)==1: filePath = "unix_email.mbox" else: filePath = sys.argv[1] mbox = mailbox.mbox(filePath, msgfactory) # parse unix mailbox G=nx.MultiDiGraph() # create empty graph # parse each messages and build graph for msg in mbox: # msg is python email.Message.Message object (source_name,source_addr) = parseaddr(msg['From']) # sender # get all recipients # see http://www.python.org/doc/current/lib/module-email.Utils.html tos = msg.get_all('to', []) ccs = msg.get_all('cc', []) resent_tos = msg.get_all('resent-to', []) resent_ccs = msg.get_all('resent-cc', []) all_recipients = getaddresses(tos + ccs + resent_tos + resent_ccs) # now add the edges for this mail message for (target_name,target_addr) in all_recipients:
def process_mbox(filename, stats):
    """Pass every message of the mbox file ``filename`` to ``process_mail``.

    ``filename`` and ``stats`` are forwarded unchanged with each message.
    The mailbox is closed once iteration ends, including on error.
    """
    msgs = mailbox.mbox(filename)
    try:
        for mail in msgs:
            process_mail(mail, filename, stats)
    finally:
        msgs.close()
yield msg def _read_email_text(self, msg): content_type = 'NA' if isinstance(msg, str) else msg.get_content_type() encoding = 'NA' if isinstance(msg, str) else msg.get( 'Content-Transfer-Encoding', 'NA') if 'text/plain' in content_type and 'base64' not in encoding: msg_text = msg.get_payload() elif 'text/html' in content_type and 'base64' not in encoding: msg_text = get_html_text(msg.get_payload()) elif content_type == 'NA': msg_text = get_html_text(msg) else: msg_text = None return (content_type, encoding, msg_text) ######################### End of library, example of use below mbox_obj = mailbox.mbox('data/input/personal_email/Personnel.mbox') num_entries = len(mbox_obj) message_list = [] for idx, email_obj in enumerate(mbox_obj): email_data = GmailMboxMessage(email_obj) message_list.append(email_data.parse_email().email_date) print('Parsing email {0} of {1}'.format(idx, num_entries)) message_list[:3]
sys.exit(0) filename = sys.argv[1] directory = os.path.curdir if not os.path.exists(filename): print "File doesn't exist:", filename sys.exit(1) if len(sys.argv) == 3: directory = sys.argv[2] if not os.path.exists(directory) or not os.path.isdir(directory): print "Directory doesn't exist:", directory sys.exit(1) mb = mailbox.mbox(filename) nmes = len(mb) os.chdir(directory) for i in range(len(mb)): if (VERBOSE >= 2): print "Analyzing message number", i mes = mb.get_message(i) em = email.message_from_string(mes.as_string()) subject = em.get('Subject') if subject and subject.find('=?') != -1: ll = email.header.decode_header(subject) subject = ""
def _get_mbox(name):
    """Open an existing mbox file located in ``TEST_SERIES_DIR``.

    :param name: Name of mbox file
    """
    mbox_path = os.path.join(TEST_SERIES_DIR, name)
    # create=False: the mailbox file is not created when it is missing.
    return mailbox.mbox(mbox_path, create=False)
def main():
    """Scan a job-notification mbox and report broken and unfinished jobs.

    Command-line options:
      -m FILE  mbox file to read
      -b       also write ``pbookFromMail.sh``, a bash script that feeds one
               ``retry(<taskId>)`` line per unfinished job to ``pbook``
      -p       also copy 'Parameters :' lines of broken jobs to the report

    Details of broken jobs are written to ``<input name without .mbox>.txt``
    and the three job counters are printed at the end.
    """
    ### get the arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('-b', dest='needBash', action='store_true')
    parser.add_argument('-p', dest='needTxt', action='store_true')
    parser.add_argument('-m', action="store", dest="mboxFileName")
    args = parser.parse_args()
    mbox = mailbox.mbox(args.mboxFileName)
    writeBash = args.needBash
    writeParameter = args.needTxt
    ### if write to bash, open the bash script
    if writeBash:
        myBash = open("pbookFromMail.sh", "w")
        myBash.write("#!/bin/bash\n")
        myBash.write("pbook << EOF\n")
    #### preparing the output file name depending on the input file name
    fileInName = args.mboxFileName
    withoutSuffix = fileInName.split('.mbox')[0]
    brokenJobs = open(withoutSuffix + '.txt', 'w')
    brokenJob = 0
    unfinishedJob = 0
    totalJobCount = 0
    ### looping through each mail
    for message in mbox:
        ### count the number of mails
        totalJobCount = totalJobCount + 1
        #print "from :", message['from']
        subjectLine = message['subject']
        print "subject:", subjectLine
        ### getting different parts of the subject line
        # NOTE(review): assumes the 4th word of the subject looks like
        # "<label>:<taskId>" and the 5th like "<x>(<done>/<total>..." --
        # confirm against real notification mails.
        eachWord = subjectLine.split()
        taskId = eachWord[3].split(':')[1]
        status = eachWord[4].split('(')[1]
        doneJob = eachWord[4].split('(')[1].split('/')[0]
        totalJob = eachWord[4].split('(')[1].split('/')[1]
        # the body is the concatenation of all direct parts of a multipart
        # message, or the decoded payload of a simple one
        if message.is_multipart():
            content = ''.join(
                part.get_payload(decode=True) for part in message.get_payload())
        else:
            content = message.get_payload(decode=True)
        ### if there is a broken job, no point retrying it, rather it needs to be submitted fresh.
        if ('Final Status : broken' in content):
            brokenJobs.write("\n")
            # the number written here is the count of broken jobs seen so far
            brokenJobs.write("*****mail number " + str(brokenJob) + " ************************\n")
            contentList = content.splitlines()
            print content
            for contentLine in contentList:
                # keep only the text following 'In :'
                if 'In :' in contentLine:
                    print contentLine
                    brokenJobs.write(contentLine.split('In :')[1] + '\n')
                if (writeParameter):
                    if 'Parameters :' in contentLine:
                        brokenJobs.write('----\n')
                        print contentLine
                        brokenJobs.write(contentLine + '\n')
                time.sleep(0)
            brokenJob += 1
        ### the job is not broken
        else:
            #### the doneJob is less than the totalJob, these jobs can be retried.
            if (int(doneJob) != int(totalJob)):
                print "Task ID: ", taskId
                print "status: ", status
                print "number done: ", doneJob
                print "number total: ", totalJob
                print taskId, ": ", status, ": ", doneJob, ": ", totalJob
                if writeBash:
                    myBash.write("retry(" + taskId + ")\n")
                unfinishedJob += 1
    ### close the bash script
    if writeBash:
        myBash.write("EOF")
        myBash.close()
    brokenJobs.close()
    print "Number of broken jobs: ", brokenJob
    print "Number of unfinished jobs: ", unfinishedJob
    print "Total number of jobs: ", totalJobCount
return datetime.fromtimestamp( email.utils.mktime_tz(email.utils.parsedate_tz(raw_timestamp))) parser = argparse.ArgumentParser( description= 'Get only the most recent files from an mbox, printing a new mbox file') parser.add_argument('mboxs', metavar="f", type=str, nargs='+', help="an mbox file to search") ns = parser.parse_args() tmpf = tempfile.NamedTemporaryFile() output = mailbox.mbox(tmpf.name, create=True) for mbox in ns.mboxs: messages = mailbox.mbox(mbox) for msg in messages: # If there is a date header, use it, otherwise, user the first line, which # always contains a timestamp.... raw_timestamp = msg['Date'] if not raw_timestamp: raw_timestamp = re.sub("^From \w+? ", "", str(msg).partition("\n")[0]) try: if parse_email_timestamp( raw_timestamp) > datetime.now() - timedelta(hours=36): output.add(msg) # if parsing a timestamp fails, log it, but just ignore that message. # ignoring a few messages is fine, since we're just using this for training
for k in ['To', 'Cc', 'Bcc', 'Received']: if not json_msg.get(k): continue json_msg[k] = json_msg[k].replace('\n', '').replace('\t', '').replace( '\r', '').replace(' ', '').split(',') try: for part in msg.walk(): if part.get_content_type() == 'text/plain': content = part.get_payload() json_msg['text'] = content except: sys.stderr.write('Skipping message - error encountered\n') finally: return json_msg def gen_json_msgs(mb): li = [] for msg in mbox: if msg is None: break li.append(jsonifyMessage(msg)) return li mbox = mailbox.mbox(MBOX) with open(OUT_FILE, 'w') as f: json.dump(gen_json_msgs(mbox), f, indent=4)
#!/usr/bin/env python
import mailbox
import random
import string
import os
from email.mime.text import MIMEText
from email.utils import formatdate

# Output locations and the mailbox the messages are written to.
mbox_out = 'testmbox'
mbox_tmp = '/tmp/testmbox'
mbox = mailbox.mbox(mbox_tmp)

# Envelope data shared by every generated message.
mailfrom = '*****@*****.**'
mailto = '*****@*****.**'
subject = 'Testmsg of %s kB'

# You might want to adjust your size_distribution dictionary according to your needs.
# The following will create an mbox with 5 mails of 10kB, 80kB, 150kB and 250kB each
# in a randomized order. Imaptest will go through the mbox sequentially so the
# randomness has to be in the mbox file.
#size_distribution = {}
#size_distribution[10] = 5
#size_distribution[80] = 5
#size_distribution[150] = 5
#size_distribution[250] = 5
size_distribution = {
    5: 5,
    10: 10,
    20: 25,
    40: 15,
    60: 13,
}
def __init__(self, filename):
    """Open the mbox file ``filename`` and store the mailbox on ``self.mbox``."""
    opened = mailbox.mbox(filename)
    self.mbox = opened
This creates a key "Candidate," and before I used google takeout to download the data I applied labels to each candidate's emails. This can then be extracted from gmail's X-label header and used to populate the Candidate key. I signed up as "Victro Pala" because I figured it was easy to pick out and unlikely to actually be included anywhere in an email. This outputs to a .json. """ import re import mailbox from mboxEmailParseAndScrub import emailMboxParser mBox = mailbox.mbox('obj/ConsolidatedPoliticalEmails.mbox') newParser = emailMboxParser(mBox) newParser.AddRegexTouple(re.compile(r'\=\w\w'), '') newParser.AddRegexTouple(re.compile(r'\[.*\]')) newParser.declareDefaultRegex() NEWLINE = '=\n' NEWLINE2 = '\n' RECIPIENT_FIRNAME = 'Victro' RECIPIENT_FIRNAME1 = 'victro' RECIPIENT_LASTNAME = 'Pala' newParser.AddReplacementTouple(RECIPIENT_FIRNAME, 'recipientFirstName') newParser.AddReplacementTouple(RECIPIENT_FIRNAME1, 'recipientFirstName') newParser.AddReplacementTouple(RECIPIENT_LASTNAME, 'recipientLastName') newParser.AddReplacementTouple(NEWLINE, '') newParser.AddReplacementTouple(NEWLINE2, ' ')
"\\n", " ").replace("\\t", " ").strip() for name in names: if name in text: text = text.replace(name, " ") return text mboxchunk_filename = "chunk_" msgnum = 0 max = 50000 out = codecs.open("mail.csv", mode="w", encoding="utf-8", errors="ignore") skipped = 0 for dirpath, dirnames, filenames in os.walk("graymail"): for file in filenames: if file.startswith(mboxchunk_filename): mbox = mailbox.mbox(os.path.join(dirpath, file)) for message in mbox: body = getbody(message) skip = False if body is None: print("no body!") body = "" else: try: body = body.decode("utf-8") except: skip = True pass if not skip: emailout = codecs.open(os.path.join(